# DSA4264 Project

In [None]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data packages this notebook asks for: the tokenizer
# models, the stopword lists, and the WordNet corpora.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [2]:
# Read the comment dataset; the file is decoded as UTF-8 and records are split
# on '\n' only.
clean_data = pd.read_csv(
    './data/combined_cleaned_500k.csv',
    encoding='utf8',
    lineterminator='\n',
)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Make sure the stopword lists and tokenizer models are available locally.
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer

# Drop every row that has a missing value in any column, then take an explicit
# copy: later cells add new columns to `df`, and assigning into the un-copied
# result of dropna() can raise pandas' SettingWithCopyWarning.
df = clean_data.dropna().copy()

In [None]:
# Load NLTK's English stopword list and turn it into a set, which is what
# remove_stopwords() checks each token against.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Define a function to remove stopwords from a single text
def remove_stopwords(text):
    """Return ``text`` with its stopwords removed.

    The text is split with ``word_tokenize``. Tokens whose lowercase form is
    in the module-level ``stop_words`` set are dropped; the remaining tokens
    keep their original casing and are joined with single spaces.
    """
    kept_tokens = (
        token
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Strip stopwords from every entry of the 'text' column and store the result
# next to the original in a new 'cleaned_comments' column.
stopword_free_text = df['text'].apply(remove_stopwords)
df['cleaned_comments'] = stopword_free_text

# LDA Method 1: sklearn

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the stopword-free comments. Terms found in more
# than 80% of the documents or in fewer than 2 documents are discarded, and
# sklearn's built-in English stopword list is applied as well.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
data_vectorized = vectorizer.fit_transform(df['cleaned_comments'])

# Fit a 10-topic LDA model on the document-term counts; the fixed seed keeps
# the run repeatable.
lda = LatentDirichletAllocation(random_state=42, n_components=10)
lda.fit(data_vectorized)

In [8]:
# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight vector
            per topic; each vector must support ``argsort()``.
        feature_names: Indexable collection mapping a weight position to its
            term.
        no_top_words: Number of terms to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it to rank heaviest-first.
        ranked_positions = weights.argsort()[::-1]
        top_terms = [feature_names[pos] for pos in ranked_positions[:no_top_words]]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_terms))

# Vocabulary learned by the vectorizer (the words behind the topic weights).
feature_names = vectorizer.get_feature_names_out()

# How many terms to list for each topic.
no_top_words = 10

# Print the top terms of every topic of the sklearn LDA model.
display_topics(lda, feature_names, no_top_words)

Topic 0:
gt singapore said cases police case covid19 public new source
Topic 1:
people dont think like good want pap make things say
Topic 2:
like car buy people road got bus use think really
Topic 3:
thanks read know people post like lol im dont comment
Topic 4:
years hdb yes price prices market money year pay new
Topic 5:
food like eat time good work looks nice chicken home
Topic 6:
people like covid dont mask wear need comments vaccine masks
Topic 7:
day good thank lol time like school need use im
Topic 8:
like got dont im na man time know ah really
Topic 9:
singapore people work like job dont sg country pay money


# LDA Method 2: Gensim

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample DataFrame with cleaned comments
# df = pd.DataFrame({'cleaned_comments': ['sample text data for LDA', ...]})

# Make sure the NLTK tokenizer models and stopword lists are available locally.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Lowercase and tokenize each cleaned comment, keeping only alphanumeric
# tokens that are not English stopwords.
stop_words = set(stopwords.words('english'))


def _tokenize_comment(comment):
    """Return the lowercase, alphanumeric, non-stopword tokens of ``comment``."""
    lowered_tokens = word_tokenize(comment.lower())
    return [tok for tok in lowered_tokens if tok.isalnum() and tok not in stop_words]


df['tokenized_comments'] = df['cleaned_comments'].apply(_tokenize_comment)

# Map every distinct token to an integer id, then encode each comment as a
# bag-of-words using those ids.
tokenized_docs = df['tokenized_comments']
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokenized_docs]

# Fit a 5-topic gensim LDA model with 15 passes over the corpus; the fixed
# seed keeps the run repeatable.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=15,
    random_state=42,
)

# Print each topic's weighted keyword string first, so the visualization can
# be the final expression of the cell.
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# Build the interactive pyLDAvis view of the trained model.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# pyLDAvis.display() only returns a renderable object. Placed mid-cell its
# result is thrown away and nothing is shown, so it must be the cell's last
# expression for the notebook to render it.
pyLDAvis.display(lda_vis)


In [11]:
# Render the prepared pyLDAvis visualization as this cell's output.
pyLDAvis.display(lda_vis)

In [26]:
# Optional: uncomment to export the visualization as a standalone HTML file.
#pyLDAvis.save_html(lda_vis, 'lda_visualization.html')

In [12]:
# Save the trained LDA model to "lda_model.gensim" (a relative path, so it is
# written to the current working directory) so it can be loaded again later
# without retraining.
lda_model.save("lda_model.gensim")

In [13]:
# Save the token-to-id dictionary to "lda_dictionary.gensim" (relative path)
# alongside the model; it is the id2word mapping the model was trained with.
dictionary.save("lda_dictionary.gensim")

In [14]:
import pandas as pd

# One (topic id, formatted keyword string) pair per topic, 10 keywords each.
topics = lda_model.print_topics(num_words=10)  # Adjust num_words as needed

# Label every topic as "Topic <id>" and keep its keyword string beside it.
topics_list = [
    (f'Topic {idx}', keywords)
    for idx, keywords in topics
]

# Tabulate the pairs and write them to CSV without the row index.
topics_df = pd.DataFrame(topics_list, columns=['Topic', 'Words'])
topics_df.to_csv('lda_topics.csv', index=False)

In [25]:
# Display the topic/keyword table as this cell's rich output.
topics_df

Unnamed: 0,Topic,Words
0,Topic 0,"0.018*""u"" + 0.012*""like"" + 0.011*""please"" + 0.009*""one"" + 0.009*""lol"" + 0.009*""post"" + 0.008*""got"" + 0.008*""thanks"" + 0.008*""ah"" + 0.007*""see"""
1,Topic 1,"0.015*""people"" + 0.014*""dont"" + 0.014*""like"" + 0.010*""think"" + 0.008*""even"" + 0.008*""would"" + 0.007*""know"" + 0.007*""im"" + 0.006*""one"" + 0.006*""get"""
2,Topic 2,"0.011*""time"" + 0.010*""get"" + 0.008*""years"" + 0.008*""go"" + 0.007*""work"" + 0.007*""money"" + 0.007*""2"" + 0.007*""one"" + 0.006*""got"" + 0.006*""pay"""
3,Topic 3,"0.027*""gt"" + 0.011*""said"" + 0.010*""public"" + 0.009*""singapore"" + 0.008*""pap"" + 0.006*""police"" + 0.005*""also"" + 0.004*""vote"" + 0.004*""party"" + 0.003*""law"""
4,Topic 4,"0.020*""singapore"" + 0.007*""people"" + 0.007*""china"" + 0.005*""us"" + 0.005*""countries"" + 0.005*""covid"" + 0.005*""also"" + 0.005*""yes"" + 0.005*""high"" + 0.005*""sg"""


In [15]:
# Load the model
#lda_model_loaded = LdaModel.load("lda_model.gensim")

# Load the dictionary
#dictionary_loaded = corpora.Dictionary.load("lda_dictionary.gensim")