In [2]:
import pandas as pd #Provides text processing capabilities
import numpy as np #Provides Python with better math processing capabilities
from sklearn.feature_extraction.text import CountVectorizer

#The next line of code reads your Reddit data into this program's memory
#Place your reddit data into the same directory of this script and change the below filename
reviews_datasets = pd.read_csv(r'C:\Users\saisa\Downloads\CIS 518 Big Data\Group Project\processed_text.csv')

reviews_datasets = reviews_datasets.head(20000) #The 20,000 number listed as a parameter here is a limitor of how many records you want to analyze. Adjust this number according to the size of your dataset and whether you run into memory limitations
reviews_datasets.dropna() #Drops any records that have a missing value

reviews_datasets.head() #Print first 5 rows to console inspect data 

#This specifies which column to extract for text analysis. It is referenced again a few lines from this comment (doc_term_matrix = count_vect...)
reviews_datasets['MsgBody'][10]

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english') #Hyperparameters; max_df = maximum document frequency; min_df = minimum document frequency, stop words = 'english')
doc_term_matrix = count_vect.fit_transform(reviews_datasets['MsgBody'].values.astype('U')) #Create document-term matrix
doc_term_matrix

from sklearn.decomposition import LatentDirichletAllocation #Import LDA

#n_components is how many topics you want to generate. 
#This is one of the "hyperparameters" for LDA
#Many machine learning models have similar hyperparameters
#You can adjust hyperparameters to tune model performance
LDA = LatentDirichletAllocation(n_components=10, random_state=42) #n_components = number of topics to generate; random_state = a seed to produce reproducible results
#More documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
LDA.fit(doc_term_matrix)

first_topic = LDA.components_[0]

top_topic_words = first_topic.argsort()[-10:]
       
#Prints out the most "important" words for forming topic distribution     
print("Most \"Important\" words for forming topic distribution")  
for i in top_topic_words:
    print(count_vect.get_feature_names_out()[i])
    

for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')
    
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

Most "Important" words for forming topic distribution
time
sen
play
like
game
na
win
make
best
team
Top 10 words for topic #0:
['time', 'sen', 'play', 'like', 'game', 'na', 'win', 'make', 'best', 'team']


Top 10 words for topic #1:
['better', 'think', 'map', 'win', 'team', 'lose', 'game', 'say', 'sen', 'play']


Top 10 words for topic #2:
['coach', 'man', 'good', 'dont', 'really', 'yay', 'post', 'people', 'lol', 'like']


Top 10 words for topic #3:
['prx', 'way', 'thats', 'team', 'fuck', 'valorant', 'like', 'game', 'play', 'watch']


Top 10 words for topic #4:
['say', 'season', 'like', 'think', 'ult', 'better', 'smoke', 'good', 'play', 'team']


Top 10 words for topic #5:
['nrg', 'champ', 'sen', 'win', 'time', 'think', 'players', 'year', 'like', 'team']


Top 10 words for topic #6:
['hes', 'sen', 'dont', 'list', 'event', 'johnqt', 'look', 'good', 'like', 'think']


Top 10 words for topic #7:
['really', 'want', 'master', 'make', 'like', 'roster', 'think', 'say', 'https', 'monyet']


To

(3243, 10)