In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read the posts csv file

In [2]:
df_posts = pd.read_csv("../data/raw/computerscience_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,402,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s2qf5f,This book demonstrates an infinite loop in a p...,945,0.97,computerscience,https://i.redd.it/jx92aw75udb81.jpg,26,,1642048000.0
3,s36y35,Novel view tennis from single camera input,13,1.0,computerscience,https://v.redd.it/v6xlgqq17ib81,0,,1642101000.0
4,s3241k,Confirmation on if I am understanding hardware...,3,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Recently I have begun learning about how compu...,1642088000.0


# Preprocess

In [3]:
PreProcess.preprocess(df_posts, 'title')
PreProcess.preprocess(df_posts, 'body')
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem
0,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,402,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[tech, support, sub, computer, recommendation,...","[tech, support, sub, comput, recommend, sub]","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, rtechsupport, computer, recomm...","[tech, support, rtechsupport, comput, recommen..."
2,s2qf5f,This book demonstrates an infinite loop in a p...,945,0.97,computerscience,https://i.redd.it/jx92aw75udb81.jpg,26,,1642048000.0,"[This, book, demonstrates, an, infinite, loop,...","[book, demonstrates, infinite, loop, pretty, c...","[book, demonstr, infinit, loop, pretti, cool, ...",[],[],[]
3,s36y35,Novel view tennis from single camera input,13,1.0,computerscience,https://v.redd.it/v6xlgqq17ib81,0,,1642101000.0,"[Novel, view, tennis, from, single, camera, in...","[novel, view, tennis, single, camera, input]","[novel, view, tenni, singl, camera, input]",[],[],[]
4,s3241k,Confirmation on if I am understanding hardware...,3,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Recently I have begun learning about how compu...,1642088000.0,"[Confirmation, on, if, I, am, understanding, h...","[confirmation, understanding, hardwaresoftware...","[confirm, understand, hardwaresoftwar, interac...","[Recently, I, have, begun, learning, about, ho...","[recently, begun, learning, computers, actuall...","[recent, begun, learn, comput, actual, know, e..."


# Topic Modeling (LDA)

LDA on titles of all posts

In [4]:
posts = list(df_posts['title_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [5]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, 
                                            id2word = dictionary, 
                                            passes=100)

topic_results = lda_model.print_topics(num_topics=6, num_words=5)
topic_results

[(0,
  '0.026*"data" + 0.026*"comput" + 0.021*"structur" + 0.016*"algorithm" + 0.016*"take"'),
 (1,
  '0.037*"machin" + 0.032*"ture" + 0.017*"languag" + 0.017*"way" + 0.017*"best"'),
 (2,
  '0.013*"cpu" + 0.013*"singl" + 0.013*"book" + 0.013*"understand" + 0.013*"node"'),
 (3,
  '0.020*"algorithm" + 0.020*"memori" + 0.020*"comput" + 0.016*"scienc" + 0.016*"recommend"'),
 (4,
  '0.043*"comput" + 0.015*"program" + 0.015*"take" + 0.015*"call" + 0.015*"random"')]

LDA on body of all posts

In [6]:
posts = list(df_posts['body_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [7]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, 
                                            id2word = dictionary, 
                                            passes=100)

topic_results = lda_model.print_topics(num_topics=6, num_words=5)
topic_results

[(0, '0.014*"bit" + 0.014*"time" + 0.014*"tri" + 0.014*"paper" + 0.011*"hi"'),
 (1,
  '0.016*"byte" + 0.011*"comput" + 0.011*"ask" + 0.011*"book" + 0.011*"support"'),
 (2,
  '0.017*"know" + 0.013*"part" + 0.013*"http" + 0.010*"work" + 0.010*"cpu"'),
 (3,
  '0.020*"http" + 0.020*"width2048" + 0.020*"autowebp" + 0.020*"formatpng" + 0.014*"code"'),
 (4,
  '0.028*"http" + 0.023*"formatpng" + 0.023*"autowebp" + 0.017*"x200b" + 0.017*"n"')]

In [8]:
"""
Visualize the LDA topic modeling
"""

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

'\nVisualize the LDA topic modeling\n'

# Read and preprocess the comments CSV

In [9]:
df_comments = pd.read_csv("../data/raw/computerscience_comments.csv")
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [10]:
PreProcess.preprocess(df_comments, 'comment')
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,"This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


# Merge the df comments and posts together

In [11]:
df_merge = df_posts.merge(df_comments, how='left', left_on='post_id', right_on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...","This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,New to programming or computer science? Want a...,353,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


In [12]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))

There are 9 posts with no comments


# Generate a LDA model for each post

In [13]:
topics = []

# Drop the posts that do not have any comments
df_merge_dropped = df_merge.dropna()

# for post_id in ['n2n0ax', 'qb4bof']:
for post_id in df_merge_dropped['post_id'].unique():
    df_temp = df_merge_dropped[df_merge_dropped['post_id'] == post_id]
    posts = list(df_temp['comment_stem'])
    dictionary = corpora.Dictionary(posts)
    corpus = [dictionary.doc2bow(article) for article in 
              posts] # All except the last one
    
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, 
                                            id2word = dictionary, 
                                            passes=100)
    topic_results = lda_model.print_topics(num_topics=6, num_words=5)
    topics.append({'post_id':post_id, 'topics':topic_results})
topics

[{'post_id': 'n2n0ax',
  'topics': [(0,
    '0.033*"love" + 0.033*"side" + 0.033*"studi" + 0.018*"learn" + 0.018*"realli"'),
   (1,
    '0.038*"cs" + 0.038*"cours" + 0.038*"univers" + 0.038*"go" + 0.038*"august"'),
   (2,
    '0.066*"work" + 0.045*"late" + 0.025*"hi" + 0.025*"code" + 0.025*"need"'),
   (3,
    '0.091*"scienc" + 0.091*"comput" + 0.047*"degre" + 0.032*"america" + 0.032*"colleg"'),
   (4,
    '0.070*"comput" + 0.054*"start" + 0.037*"plan" + 0.037*"scienc" + 0.037*"work"')]},
 {'post_id': 'qb4bof',
  'topics': [(0,
    '0.051*"got" + 0.051*"said" + 0.051*"discord" + 0.051*"activ" + 0.051*"still"'),
   (1,
    '0.015*"correct" + 0.015*"remov" + 0.015*"mean" + 0.015*"said" + 0.015*"fix"'),
   (2,
    '0.065*"pretti" + 0.036*"peopl" + 0.036*"place" + 0.036*"rpcmasterrac" + 0.036*"laid"'),
   (3, '0.038*"post" + 0.038*"im" + 0.038*"go" + 0.038*"cs" + 0.038*"mani"'),
   (4,
    '0.086*"need" + 0.045*"help" + 0.045*"see" + 0.025*"said" + 0.025*"someon"')]},
 {'post_id': 's2qf5f'