In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
comments_df = pd.read_csv('../data/comments.csv')
replies_df = pd.read_csv('../data/replies.csv')

In [4]:
all_comments_df = pd.concat([comments_df, replies_df], axis=0)

In [5]:
all_comments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1524 entries, 0 to 957
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   comment         1524 non-null   object 
 1   comment_id      1524 non-null   object 
 2   parent_comment  958 non-null    object 
 3   user            1524 non-null   object 
 4   user_avatar     1524 non-null   object 
 5   user_page       1524 non-null   object 
 6   comment_time    1524 non-null   object 
 7   likes           1524 non-null   int64  
 8   replies         566 non-null    float64
dtypes: float64(1), int64(1), object(7)
memory usage: 119.1+ KB


In [6]:
# replace the NaN's in the reply column with 'n/a' for comments that are comment threads
all_comments_df['parent_comment'] = all_comments_df['parent_comment'].fillna(value='n/a')

# replace the NaN's in the replies column with a value of 0 since these are replies to comments
all_comments_df['replies'] = all_comments_df['replies'].fillna(value=0)

# convert the comment_time column into a datetime format
all_comments_df['comment_time'] = pd.to_datetime(all_comments_df['comment_time'])

# convert certain columns to save space and perform quicker
all_comments_df = all_comments_df.astype({'likes': 'int32', 'replies': 'int32'})

In [7]:
# take a DataFrame of comments and return a cleaned version of them

def clean_comments(comments):

    words  = []

    # apply a lambda (temproray) function to split each comment into a list of words where the split is done on spaces and punctuation marks, then use a for loop to append all the words into a list of comments, one comment at a time
    #for comment in singh_comments['comment'].apply(lambda x: re.split(r'\W+', x)):
    #    words += comment

    # apply a lambda (temproray) function to split each comment into a list of words, then use a for loop to append all the words into a list of comments, one comment at a time
    for comment in comments['comment'].apply(lambda x: x.split()):
        words += comment

    # create a mapping table that won't replace any characters but will remove punctuation characters
    translate_table = str.maketrans('', '', string.punctuation)

    # apply the mapping table to the list of words to strip all the punctuation
    cleaned_words = [word.translate(translate_table) for word in words]

    # make everything lower case
    cleaned_words = [word.lower() for word in cleaned_words]

    cleaned_words_df = pd.DataFrame({'words': cleaned_words})
    
    return cleaned_words_df

In [8]:
all_comments_clean = clean_comments(all_comments_df)
print(all_comments_clean.value_counts().to_string())

words                               
the                                     1771
and                                     1034
a                                        922
to                                       869
of                                       785
is                                       744
i                                        681
you                                      632
it                                       600
that                                     585
in                                       506
games                                    492
are                                      447
game                                     372
for                                      328
its                                      318
they                                     316
like                                     307
with                                     301
but                                      298
not                                      287
this              

In [9]:
all_comments_df['comment'].value_counts()

comment
Based                                                                                                                                                                                                                                                                  3
over ten thousand view's     Mabey synthleeeer will stop saying no one will watch these an play some actual good game's now                                                                                                                                            3
"Just play indie games, bro" is flawed, "play older games" is the correct choice.                                                                                                                                                                                      2
@@kingmorleyyou chose the foid    In dishonoured 2                                                                                                                                                   

In [10]:
# topic modeling
no_features = 10
no_topics = 10

In [11]:
# apply TFIDF to the comments
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(all_comments_df['comment'])
tfidf_vectorizer.get_feature_names_out()

array(['don', 'game', 'games', 'good', 'indie', 'just', 'like', 'make',
       'people', 'play'], dtype=object)

In [12]:
# apply NMF to the TFIDF
nmf = NMF()
nmf.fit(tfidf)

In [13]:
# use a count vectorizer to get feature names
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
count = count_vectorizer.fit_transform(all_comments_df['comment'])
count_vectorizer.get_feature_names_out()

array(['don', 'game', 'games', 'good', 'indie', 'just', 'like', 'make',
       'people', 'play'], dtype=object)

In [14]:
# Use LDA for extracting topics
lda = LatentDirichletAllocation()
lda.fit(count)

In [15]:
# function to display top n words from each topic

def display_topics(model, vectorizer, n_words):
    if n_words > model.components_.shape[1]:
        n_words = model.components_.shape[1]
    for i, topic in enumerate(model.components_):
        print(f'topic {i}:\n')
        print(f'top {n_words} words:')
        for i in topic.argsort()[-n_words:]:
            print('word:', vectorizer.get_feature_names_out()[i], end=", ")
            print('index:', i, end="; ")
        print('\n')

In [16]:
display_topics(nmf, tfidf_vectorizer, 10)

topic 0:

top 10 words:
word: don, index: 0; word: game, index: 1; word: good, index: 3; word: indie, index: 4; word: just, index: 5; word: like, index: 6; word: make, index: 7; word: people, index: 8; word: play, index: 9; word: games, index: 2; 

topic 1:

top 10 words:
word: don, index: 0; word: games, index: 2; word: good, index: 3; word: just, index: 5; word: like, index: 6; word: make, index: 7; word: people, index: 8; word: play, index: 9; word: indie, index: 4; word: game, index: 1; 

topic 2:

top 10 words:
word: don, index: 0; word: game, index: 1; word: games, index: 2; word: good, index: 3; word: indie, index: 4; word: make, index: 7; word: people, index: 8; word: play, index: 9; word: just, index: 5; word: like, index: 6; 

topic 3:

top 10 words:
word: don, index: 0; word: game, index: 1; word: games, index: 2; word: good, index: 3; word: indie, index: 4; word: like, index: 6; word: make, index: 7; word: people, index: 8; word: play, index: 9; word: just, index: 5; 

topi

In [17]:
userId = '@spider-manunknown9193'
user_comments = all_comments_df[all_comments_df['user'] == userId]

In [18]:
user_comments['comment'].value_counts()

comment
Telltales The Walking Dead is a good game. Not my fault you don’t know what good storytelling is.                                                                                                                                                                                                                      1
@@pikminologueraisin2139Bruh, are u kidding me right now? How do you not see the type of person synthetic man is? Are you blind like his fanbase?                                                                                                                                                                      1
@@TheLonesomePagan. and you think Mr.Racist Synthetic man is any better? He’s a 30 year old who can’t accept characters that aren’t white males. 😂                                                                                                                                                                     1
@@arkgaharandan5881 Not true.                        

In [19]:
print(user_comments.to_string())

                                                                                                                                                                                                                                                                                                                 comment                                         comment_id              parent_comment                    user                                                                                                                  user_avatar                                      user_page              comment_time  likes  replies
285                                                                                                                                                                                                                    Telltales The Walking Dead is a good game. Not my fault you don’t know what good storytelling is.  UgzZsp3OsMf90JY6aB94AaABAg.A4OxXCfGQL3A4PE0kdQVHo  UgzZs

In [20]:
user_comments_clean = clean_comments(user_comments)
print(user_comments_clean.value_counts().to_string())

words                       
you                             28
a                               20
synthetic                       19
the                             15
to                              13
is                              13
and                             13
man                             12
i                               10
that                            10
not                             10
see                              8
can’t                            8
what                             8
racist                           8
game                             8
because                          6
😂                                6
he’s                             6
people                           5
need                             5
how                              5
it                               5
you’re                           5
who                              4
he                               4
his                              4
are                       