In [13]:
import os
import pandas as pd
import nltk
from nltk import word_tokenize
import gensim
import gensim.corpora as corpora
import gensim.utils as gu
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import ldamallet

In [14]:
def load_data(path):
    reddit_df = pd.read_csv(path, lineterminator='\n', encoding='utf8')
    return reddit_df

In [100]:
def preprocess_gensim(text):
    """Tokenizes and processes the text using Gensim."""
    if isinstance(text, str):
        return ' '.join(gu.simple_preprocess(text))
    else:
        return ''  # Return an empty string for non-string inputs

In [113]:
def preprocessing(df):
    df['clean_text'] = df['text'].str.lower()
    print("cleaned_lower")
    df['clean_text'] = df['clean_text'].str.replace(r'[^a-zA-Z\s]', ' ',regex=True) 
    df['clean_text'] = df['clean_text'].str.replace(r'\s{2,}', ' ',regex=True)   
    print("cleaned_regex") 
    df['clean_text'] = df['clean_text'].apply(preprocess_gensim)
    print("cleaned_preprocessed")
    df['clean_text'] = df['clean_text'].apply(word_tokenize)
    print("cleaned_tokenized")
    df['clean_text'] = df['clean_text'].apply(lambda x:[word for word in x if word not in stopwords.words("english") and word.isalpha()])
    print("cleaned_stopwords")
    df['clean_text'] = df['clean_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word) for word in x])
    print("cleaned_Lemmatized")
    df['clean_text'] = df['clean_text'].apply(lambda x: [word for word in x if nltk.pos_tag([word])[0][1] == 'NN'])
    print("cleaned_tagged")
    df = df[df['clean_text'].map(lambda x: len(x)) > 1].reset_index(drop=True)
    return df

In [114]:
def create_dictionary(reddit_df):
    texts = reddit_df['clean_text']
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    return texts, id2word, corpus

In [115]:
def load_mallet(system,folder_path):
    os.environ['MALLET_HOME']=folder_path
    if system == 'windows': mallet_path = folder_path+"\\bin\\mallet.bat"
    elif system == 'mac': mallet_path = folder_path+"/bin/mallet"
    return mallet_path

In [117]:
# Define your seed topics
seed_topics = {
    0: "Political",
    1: "Covid-19",
    2: "Race & Religion",
    3: "Transport",
    4: "Relationships",
    5: "Crime",
    6: "Housing",
    7: "Education",
    8: "Work"
}

# Define the seed words for each topic
seed_words = {
    "Political": ["ge", "general election", "affair", "mp", "politician", "politics"],
    "Covid-19": ["covid-19", "infection", "vaccine", "lockdown", "circuit breaker", "mask", "cough"],
    "Race & Religion": ["chinese", "malay", "indian", "angmoh", "culture", "christian", "buddhist", "muslim", "racist", "CECA"],
    "Transport": ["breakdown", "train", "mrt", "lrt", "bus", "simplygo"],
    "Relationships": ["relationships", "husband", "wife", "bf", "gf", "breakup", "cheat", "affair", "lover", "divorce", "love"],
    "Crime": ["crime", "case", "police", "murder", "kill", "death", "scam"],
    "Housing": ["hdb", "price", "bto", "resale"],
    "Education": ["student", "psle", "study", "alevel", "olevel", "exam", "school"],
    "Work": ["ot", "salary", "unemployed", "boss", "job", "laoban", "colleague"]
}


In [167]:
import pandas as pd

# Define the function for topic modeling
def topic_modelling(model, corpus, texts, data, seed_topics):
    output_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        print(f"Document {i}, Topics: {row}")

        # If the text is empty, classify it as 'Others'
        if not texts[i]:
            output_df = pd.concat([
                output_df,
                pd.DataFrame([[10, 'Others', 1.000, '']], columns=['Topic Number', 'Topic', 'Perc_Contribution', 'Topic_Keywords'])
            ], ignore_index=True)
        else:
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # Dominant topic (highest contribution)
                    wp = model.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])

                    # Map the topic number to a predefined topic using seed_topics
                    topic_name = seed_topics.get(int(topic_num), 'Unknown')

                    output_df = pd.concat([
                        output_df,
                        pd.DataFrame([[int(topic_num), topic_name, round(prop_topic, 4), topic_keywords]], 
                                      columns=['Topic Number', 'Topic', 'Perc_Contribution', 'Topic_Keywords'])
                    ], ignore_index=True)
                else:
                    break

    # Concatenate the original data with the topic modeling results
    output_df = pd.concat([data, output_df], axis=1)

    # Remove any unnecessary columns such as 'clean_text' if needed
    output_df = output_df.drop(['clean_text', 'Perc_Contribution', 'Topic_Keywords'], axis=1, errors='ignore')

    return output_df


In [119]:
reddit_df = load_data('./data/RedditToxicityScores.csv') #Change File Path


In [120]:
reddit_df_processed = preprocessing(reddit_df)


cleaned_lower
cleaned_regex
cleaned_preprocessed
cleaned_tokenized
cleaned_stopwords
cleaned_Lemmatized
cleaned_tagged


In [121]:
print(reddit_df_processed['clean_text'])

0          [think, singaporean, dont, damn, taiwan, belong]
1                      [fair, point, secrecy, aspect, mind]
2         [gt, binary, think, im, blind, majority, privi...
3                                          [boo, boo, lmao]
4         [simple, trick, insta, wedding, need, surface,...
                                ...                        
347598    [gt, gon, force, purchase, try, public, call, ...
347599                                         [need, grow]
347600                            [bickering, adult, level]
347601    [kid, tbh, age, theyre, life, point, need, wan...
347602                   [lianhua, government, doubt, work]
Name: clean_text, Length: 347603, dtype: object


In [122]:
print()




In [123]:
texts, id2word, corpus = create_dictionary(reddit_df_processed)


In [124]:
print(texts)

0          [think, singaporean, dont, damn, taiwan, belong]
1                      [fair, point, secrecy, aspect, mind]
2         [gt, binary, think, im, blind, majority, privi...
3                                          [boo, boo, lmao]
4         [simple, trick, insta, wedding, need, surface,...
                                ...                        
347598    [gt, gon, force, purchase, try, public, call, ...
347599                                         [need, grow]
347600                            [bickering, adult, level]
347601    [kid, tbh, age, theyre, life, point, need, wan...
347602                   [lianhua, government, doubt, work]
Name: clean_text, Length: 347603, dtype: object


In [125]:
print(id2word)

Dictionary<89191 unique tokens: ['belong', 'damn', 'dont', 'singaporean', 'taiwan']...>


In [131]:
print(corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(5, 2), (9, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 3), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1)], [(46, 2), (47, 1)], [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)]]


In [155]:
mallet_path = r'C:\Users\rhyde\mallet-2.0.8\mallet-2.0.8\bin\mallet.bat'

In [156]:
import os

os.environ['MALLET_HOME'] = r'C:\Users\rhyde\mallet-2.0.8\mallet-2.0.8'
os.environ['PATH'] = os.environ['PATH'] + os.pathsep + r'C:\Users\rhyde\mallet-2.0.8\mallet-2.0.8\bin'

In [153]:
def create_mallet(mallet_path, num_topics, id2word, corpus):
    # Use Gensim's wrapper for MALLET
    return ldamallet.LdaMallet(
        mallet_path=mallet_path, 
        corpus=corpus, 
        num_topics=num_topics, 
        id2word=id2word
    )

In [157]:
mallet = create_mallet(mallet_path=mallet_path, num_topics=10, id2word=id2word, corpus=corpus)

In [None]:
output_df = topic_modelling(model=mallet,corpus=corpus,texts=texts,data=reddit_df, seed_topics = seed_topics)

In [169]:
#output_df.to_csv('topic_model_results.csv', index=False)

In [172]:
output_df.head

<bound method NDFrame.head of                                                      text  \
0       i think most singaporeans dont give a damn who...   
1       fair point the secrecy aspect of it slipped my...   
2                                                   range   
3       gt this is binary thinking because you think t...   
4                                     boo boo poor u lmao   
...                                                   ...   
396293  gt what are they gonna force people to purchas...   
396294    you did what you could he just needs to grow up   
396295            indeed\n\nsame bickering on adult level   
396296  thats all kids tbh depending on the age theyre...   
396297  lianhua is propagated by the chinese governmen...   

                  timestamp           username  \
0       2020-04-11 15:49:23           invigo79   
1       2020-04-03 09:59:08  potatetoe_tractor   
2       2020-02-15 15:07:03     CrossfittJesus   
3       2020-06-04 07:07:39          