In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd

In [24]:
# Load the Reddit comments CSV; the path is relative to the notebook's working directory.
# NOTE(review): the folder name "Final Dataset " ends with a space before the slash — confirm it matches the directory on disk.
reddit = pd.read_csv("../../../Data/Final Dataset /Reddit/Reddit_Final.csv")

# Sanity check after loading: first rows, then the shape, then the column labels, one per line.
print(reddit.head(), reddit.shape, reddit.columns, sep="\n")

            Author  Timestamp  \
0    KuntaWuKnicks  5 mo. ago   
1  Ikewiththebeard  5 mo. ago   
2    JamesJakes000  5 mo. ago   
3         XscytheD  5 mo. ago   
4             muws  5 mo. ago   

                                             Comment  \
0  Can I make it louderOr customize the sound, no...   
1             Customisable sound would be something!   
2                                Metal gear alert...   
3                                                  !   
4                                      I heard this.   

                      Title  Upvotes  
0  No more waking the wife…    24300  
1  No more waking the wife…    24300  
2  No more waking the wife…    24300  
3  No more waking the wife…    24300  
4  No more waking the wife…    24300  
(171358, 5)
Index(['Author', 'Timestamp', 'Comment', 'Title', 'Upvotes'], dtype='object')


In [25]:
# Rows with a missing 'Comment' value are dropped before vectorizing.
reddit_data_clean = reddit.dropna(subset=['Comment'])
comment_text = reddit_data_clean['Comment']

# Bag-of-words counts with English stop words removed; min_df / max_df prune
# terms by document frequency (integer = document count, float = proportion of documents).
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
data_vectorized = vectorizer.fit_transform(comment_text)

# Shape of the resulting document-term matrix.
data_vectorized.shape

(171015, 30070)

In [26]:
# Model settings: number of LDA components, and how many top words are listed
# per topic when the topics are summarised later in the notebook.
num_topics, no_top_words = 10, 15

# An integer random_state is passed so repeated runs give the same fitted topics.
lda_model = LatentDirichletAllocation(random_state=0, n_components=num_topics)
lda_model.fit(data_vectorized)

In [27]:
def display_topics(model, feature_names, no_top_words):
    """Summarise each row of ``model.components_`` by its highest-weighted features.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one array of per-feature weights
        per topic (each array needs an ``argsort`` method).
    feature_names : sequence of str
        Feature labels, indexed by the same positions as the weight arrays.
    no_top_words : int
        How many of the highest-weighted features to keep per topic.

    Returns
    -------
    dict
        Maps ``"Topic <index>"`` (0-based, in ``components_`` order) to a
        single space-separated string of feature names, highest weight first.
    """
    def _top_terms(weights):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        return " ".join(feature_names[idx] for idx in ranked)

    return {
        f"Topic {position}": _top_terms(weights)
        for position, weights in enumerate(model.components_)
    }

In [28]:
# Vocabulary learned by the vectorizer, used to translate feature indices into words.
vocabulary = vectorizer.get_feature_names_out()
topics = display_topics(lda_model, vocabulary, no_top_words)
topics

{'Topic 0': 'game like just really story good think games world fun people gameplay great make don',
 'Topic 1': 'like games game love souls ring elden play thanks just remake https evil com dark',
 'Topic 2': 'game ps5 years ps4 like just ve release gen version 10 new year launch gta',
 'Topic 3': 'just order issue sony work time got post people reddit did issues update know fallen',
 'Topic 4': 'vr like just psvr2 better headset tv screen looks yes look use good don really',
 'Topic 5': 'people just like lol don shit fuck know fucking want think dude guy lmao let',
 'Topic 6': 'games game sony ps just price don buy like people plus money play xbox extra',
 'Topic 7': 'controller game fog just right left work don button controllers works way did worked use',
 'Topic 8': 'ps5 got play disc just thank drive home use digital download internet don house friend',
 'Topic 9': 'play game games played time just playing man ve god like got hours war ps4'}

In [29]:
# Wide layout: a single row (index 0) with one column per topic label,
# each cell holding that topic's top-word string.
topics_df = pd.DataFrame(data=topics, index=[0])

topics_df

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
0,game like just really story good think games w...,like games game love souls ring elden play tha...,game ps5 years ps4 like just ve release gen ve...,just order issue sony work time got post peopl...,vr like just psvr2 better headset tv screen lo...,people just like lol don shit fuck know fuckin...,games game sony ps just price don buy like peo...,controller game fog just right left work don b...,ps5 got play disc just thank drive home use di...,play game games played time just playing man v...


In [30]:
# Long layout: one row per topic, with a label column and the top-word string.
# NOTE(review): labels here are 1-based ("Topic 1" .. "Topic N") while the keys
# of `topics` are 0-based — confirm the relabelling is intended.
topic_df_clean = pd.DataFrame({
    "Ranking": [f"Topic {rank}" for rank in range(1, len(topics) + 1)],
    "Topic": list(topics.values()),
})

topic_df_clean

Unnamed: 0,Ranking,Topic
0,Topic 1,game like just really story good think games w...
1,Topic 2,like games game love souls ring elden play tha...
2,Topic 3,game ps5 years ps4 like just ve release gen ve...
3,Topic 4,just order issue sony work time got post peopl...
4,Topic 5,vr like just psvr2 better headset tv screen lo...
5,Topic 6,people just like lol don shit fuck know fuckin...
6,Topic 7,games game sony ps just price don buy like peo...
7,Topic 8,controller game fog just right left work don b...
8,Topic 9,ps5 got play disc just thank drive home use di...
9,Topic 10,play game games played time just playing man v...
