## Loading data... 

In [None]:
import pandas as pd
import json
import csv 

path = "user-text.json"
print("Loading data from", path)

with open(path) as jf:
    d = json.load(jf)

entries = d['entries']
df = open('data_file.csv', 'w')
cw = csv.writer(df)

labels = ['title', 'text', 'tags', 'timestamp', 'id', 'parent_id', 'sentiment', 'toxicity', 'flags']
labeled = False 

for i in entries:
    for label in labels:
        if label not in i: 
            i[label] = None 

    if not labeled:
        h = i.keys()
        cw.writerow(h) # write labels of columns
        labeled = True 
    #cleaning each entry to remove spaces or weird formating 
    cleaned_entry = {}
    for k, v in i.items():
        cleaned_value = str(v).replace("\n", " ").rstrip()
        cleaned_entry[k] = cleaned_value

    cw.writerow(cleaned_entry.values()) # write to the file 

df.close()
print("CSV read successfully")

print("Now creating a dataframe")
df = pd.read_csv("data_file.csv")
print(df.head(1))


Loading data from user-text.json
CSV read successfully
Now creating a dataframe
                      title      text tags                   timestamp  \
0  Re: second post of today  my reply   []  2025-03-12T15:34:35.011895   

                                     id parent_id sentiment toxicity flags  
0  d04a146d-ae09-4ab0-aebe-67a565f6ac5c       NaN       NaN      NaN   NaN  


## Cleaning and Creating the corpus

In [None]:
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import string 

 #combining title and text into another column 
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

def clean(text):
    if pd.issnull(text):
        return ''
    text = text.lower()
    # text = re.sub(r'[^a-zA-Z\s]', '', text)
    regex_pattern = f"[{re.escape(string.punctuation)}]"
    text = re.sub(regex_pattern, "", text)
    text =  re.sub(r'\s+', ' ', text).strip()
    tokens = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
    return ' '.join(tokens) 

df['cleaned_content'] = df['content'].apply(clean)
df['cleaned_content'].head()

0                              second post today reply
1                     response phoene replying dsfadsf
2                                             replying
3             testing replies trying test reply button
4    testing replies replying trying test reply button
Name: cleaned_content, dtype: object

## Applying Non-negative Matrix for Topic Modelling

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from time import time


#Creating Vectors from tokens 
#initialzing vectorizer, can change different parameters such as 
#how many features or how many words features contain

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

t0 = time()
tokens = df['cleaned_content']
vectorizer = TfidfVectorizer()
vectorizer = TfidfVectorizer(max_features = n_features, ngram_range=(1,2))

X = vectorizer.fit_transform(tokens)

features = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X.toarray(), columns = features)

print("Showing TFIDF matrix... ")
tfidf_df.head()
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

nmf = NMF(n_components = n_topics, random_state = 1).fit(tfidf_df)
print("done in %0.3fs." % (time() - t0))

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    print(" ".join([features[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


Showing TFIDF matrix... 
Fitting the NMF model with n_samples=2000 and n_features=1000...
done in 0.052s.
Topic #0:
testing keywords keywords keywords site site ass site testing ass keywords fuck fuck fuck ass moderation hello fuck site hello fuck testing moderation fuck fuck hello jhgkhj ass jhgkhj modertaaiondf bad modertaaiondf

Topic #1:
fuck fuck fuck fuck ass moderation ass dont dont like moderation test test fuck dont like ass test keywords fuck testing testing moderation hello fuck fuck site moderation hello hello fucking ass

Topic #2:
testing test area textarea input test area textarea text textarea input body text text body area replying replying body test testing area body replying text body hello asking medical asking

Topic #3:
blah words fuck bad words bad words ass bad blah ass blah blah adslkfjdsf adslkfjdsf blah fuck ass modertaaiondf modertaaiondf bad testing modertaaiondf fuck ass testing fuck fucking fucking fucking ass

Topic #4:
replies testing replies reply test

## Using results for tag mapping
