In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read the posts csv file

In [2]:
subreddit = "computerscience"

In [3]:
df_posts = pd.read_csv("../data/raw/" + subreddit + "_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


# Preprocess

In [4]:
PreProcess.preprocess(df_posts, 'title')
PreProcess.preprocess(df_posts, 'body')
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[tech, support, sub, computer, recommendation,...","[tech, support, sub, comput, recommend, sub]","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, rtechsupport, computer, recomm...","[tech, support, rtechsupport, comput, recommen..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,"[Ways, to, learn, higher, level, math, through...","[ways, learn, higher, level, math, cs]","[way, learn, higher, level, math, cs]","[I, 'm, a, university, student, ,, and, while,...","[university, student, taken, math, classes, la...","[univers, student, taken, math, class, la, mul..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,"[Why, is, there, nothing, between, 8bit, and, ...","[nothing, 8bit, 16bit]","[noth, 8bit, 16bit]","[For, example, ,, if, i, want, to, change, the...","[example, want, change, image, specifications,...","[exampl, want, chang, imag, specif, photoshop,..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,"[Why, is, Internet/transmission, speed, in, Bi...","[internettransmission, speed, bit, storage, ca...","[internettransmiss, speed, bit, storag, capaci...","[I, am, always, wondering, about, this, .]","[always, wondering]","[alway, wonder]"


# Topic Modeling (LDA)

LDA Config Settings

In [5]:
# Number of topics to generate
num_topics = 5
# Number of passes the LDA should run
num_passes = 100

# Number of words to print out
num_words = 5

LDA on titles of all posts

In [6]:
posts = list(df_posts['title_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [7]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.017*"question" + 0.011*"know" + 0.010*"program" + 0.009*"comput" + 0.008*"power"'),
 (1,
  '0.016*"time" + 0.015*"question" + 0.014*"code" + 0.014*"algorithm" + 0.012*"complex"'),
 (2,
  '0.046*"comput" + 0.024*"scienc" + 0.012*"learn" + 0.011*"program" + 0.010*"work"'),
 (3,
  '0.033*"comput" + 0.015*"algorithm" + 0.013*"program" + 0.009*"data" + 0.009*"book"'),
 (4,
  '0.012*"book" + 0.012*"read" + 0.011*"use" + 0.011*"softwar" + 0.010*"design"')]

LDA on body of all posts

In [8]:
posts = list(df_posts['body_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [9]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.013*"know" + 0.011*"program" + 0.011*"recent" + 0.010*"understand" + 0.009*"comput"'),
 (1,
  '0.031*"http" + 0.021*"formatpng" + 0.021*"autowebp" + 0.018*"width2048" + 0.010*"comput"'),
 (2, '0.012*"nt" + 0.010*"2" + 0.009*"use" + 0.009*"like" + 0.007*"get"'),
 (3,
  '0.020*"comput" + 0.019*"learn" + 0.017*"scienc" + 0.012*"hello" + 0.009*"hi"'),
 (4,
  '0.013*"comput" + 0.013*"know" + 0.012*"work" + 0.009*"explain" + 0.008*"question"')]

In [10]:
"""
Visualize the LDA topic modeling
"""

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

'\nVisualize the LDA topic modeling\n'

# Read and preprocess the comments CSV

In [11]:
df_comments = pd.read_csv("../data/raw/" + subreddit + "_comments.csv", lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [12]:
PreProcess.preprocess(df_comments, 'comment')
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,"This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


# Merge the df comments and posts together

In [13]:
df_merge = df_posts.merge(df_comments, how='left', left_on='post_id', right_on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...","This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


In [14]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))
print("There are {} posts with no content after filtering".format(len(df_merge[df_merge['comment_stem'].str.len() == 0])))

There are 45 posts with no comments
There are 21 posts with no content after filtering


# Generate LDA models for each post using comments

In [15]:
topics = []

# Drop the posts that do not have any comments
df_merge_dropped = df_merge[df_merge['comment_stem'].str.len() > 0]

# for post_id in ['n2n0ax', 'qb4bof']:
for post_id in df_merge_dropped['post_id'].unique():
    df_temp = df_merge_dropped[df_merge_dropped['post_id'] == post_id]
    posts = list(df_temp['comment_stem'])
    dictionary = corpora.Dictionary(posts)
    corpus = [dictionary.doc2bow(article) for article in 
              posts] # All except the last one
    
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id':post_id, 'topics':topic_results})
topics

[{'post_id': 'n2n0ax',
  'topics': [(0,
    '0.046*"comput" + 0.037*"scienc" + 0.024*"colleg" + 0.024*"degre" + 0.017*"year"'),
   (1,
    '0.018*"graduat" + 0.018*"hello" + 0.015*"work" + 0.014*"make" + 0.012*"know"'),
   (2,
    '0.058*"comput" + 0.053*"scienc" + 0.028*"hi" + 0.020*"want" + 0.019*"start"'),
   (3,
    '0.024*"school" + 0.022*"year" + 0.018*"cs" + 0.016*"high" + 0.016*"look"'),
   (4,
    '0.025*"learn" + 0.024*"comput" + 0.020*"scienc" + 0.015*"univers" + 0.015*"studi"')]},
 {'post_id': 'qb4bof',
  'topics': [(0,
    '0.085*"need" + 0.059*"said" + 0.032*"someon" + 0.032*"delet" + 0.032*"work"'),
   (1,
    '0.042*"would" + 0.042*"also" + 0.042*"homework" + 0.042*"either" + 0.042*"clarifi"'),
   (2,
    '0.064*"see" + 0.064*"need" + 0.035*"post" + 0.035*"got" + 0.035*"first"'),
   (3,
    '0.042*"help" + 0.042*"laid" + 0.042*"ask" + 0.042*"back" + 0.042*"get"'),
   (4,
    '0.056*"peopl" + 0.056*"pretti" + 0.056*"see" + 0.056*"sure" + 0.056*"comput"')]},
 {'post_id': 

# Generate a single LDA model for all comments

In [16]:
posts = list(df_merge_dropped['comment_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [17]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.039*"http" + 0.011*"would" + 0.010*"like" + 0.010*"problem" + 0.007*"lot"'),
 (1,
  '0.030*"comput" + 0.014*"use" + 0.012*"data" + 0.011*"engin" + 0.011*"scienc"'),
 (2, '0.012*"mean" + 0.010*"use" + 0.010*"data" + 0.010*"1" + 0.009*"true"'),
 (3,
  '0.028*"comput" + 0.016*"scienc" + 0.012*"nt" + 0.012*"like" + 0.011*"think"'),
 (4,
  '0.012*"memori" + 0.010*"use" + 0.009*"system" + 0.009*"like" + 0.008*"algorithm"')]