In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
# Fetch the NLTK data packages this notebook asks for, in a fixed order.
# nltk.download reports each package in the cell output (see the log below).
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger',
                      'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re
from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Read the posts CSV file

In [2]:
# Name of the subreddit to analyse; it is the prefix of the CSV file names
# ("<subreddit>_posts.csv", "<subreddit>_comments.csv") read from ../data/raw/.
subreddit = "computerscience"

In [3]:
# Load the scraped posts of the configured subreddit and preview the first rows.
posts_csv_path = f"../data/raw/{subreddit}_posts.csv"
df_posts = pd.read_csv(posts_csv_path)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


# Preprocess

In [4]:
# Run the project's text pipeline over both free-text columns of the posts.
# The return value is not used: the *_word_token / *_filtered / *_stem columns
# shown in the preview below appear on df_posts itself after the calls.
for text_column in ('title', 'body'):
    PreProcess.preprocess(df_posts, text_column)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[tech, support, sub, computer, recommendation,...","[tech, support, sub, comput, recommend, sub]","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, rtechsupport, computer, recomm...","[tech, support, rtechsupport, comput, recommen..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,"[Ways, to, learn, higher, level, math, through...","[ways, learn, higher, level, math, cs]","[way, learn, higher, level, math, cs]","[I, 'm, a, university, student, ,, and, while,...","[university, student, taken, math, classes, la...","[univers, student, taken, math, class, la, mul..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,"[Why, is, there, nothing, between, 8bit, and, ...","[nothing, 8bit, 16bit]","[noth, 8bit, 16bit]","[For, example, ,, if, i, want, to, change, the...","[example, want, change, image, specifications,...","[exampl, want, chang, imag, specif, photoshop,..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,"[Why, is, Internet/transmission, speed, in, Bi...","[internettransmission, speed, bit, storage, ca...","[internettransmiss, speed, bit, storag, capaci...","[I, am, always, wondering, about, this, .]","[always, wondering]","[alway, wonder]"


# Topic Modeling (LDA)

LDA Config Settings

In [5]:
# Number of latent topics every LDA model in this notebook is asked to find
num_topics = 5
# Number of training passes over the corpus for every LDA fit
num_passes = 100

# Number of top-weighted words shown per topic when the topics are printed
num_words = 5

LDA on titles of all posts

In [6]:
# One document per post: the list of stemmed title tokens.
posts = df_posts['title_stem'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [7]:
# Fit an LDA topic model on the corpus/dictionary built in the previous cell.
# random_state pins gensim's random initialisation; without it every run of
# this cell yields different topics and the notebook cannot be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.018*"algorithm" + 0.017*"use" + 0.015*"complex" + 0.014*"comput" + 0.013*"time"'),
 (1,
  '0.053*"comput" + 0.034*"scienc" + 0.023*"book" + 0.012*"program" + 0.011*"differ"'),
 (2,
  '0.024*"comput" + 0.016*"learn" + 0.014*"resourc" + 0.011*"calcul" + 0.011*"good"'),
 (3,
  '0.021*"data" + 0.019*"comput" + 0.013*"code" + 0.013*"structur" + 0.009*"type"'),
 (4,
  '0.015*"question" + 0.014*"program" + 0.013*"learn" + 0.012*"algorithm" + 0.011*"comput"')]

LDA on body of all posts

In [8]:
# One document per post: the list of stemmed body tokens.
posts = df_posts['body_stem'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [9]:
# Fit an LDA topic model on the post-body corpus built in the previous cell.
# random_state pins gensim's random initialisation; without it every run of
# this cell yields different topics and the notebook cannot be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.021*"comput" + 0.013*"scienc" + 0.012*"know" + 0.012*"like" + 0.010*"differ"'),
 (1,
  '0.011*"use" + 0.010*"would" + 0.008*"program" + 0.008*"data" + 0.008*"want"'),
 (2,
  '0.022*"comput" + 0.017*"http" + 0.010*"work" + 0.009*"scienc" + 0.009*"2"'),
 (3,
  '0.011*"case" + 0.010*"learn" + 0.010*"http" + 0.009*"algorithm" + 0.007*"memori"'),
 (4,
  '0.019*"http" + 0.013*"autowebp" + 0.013*"formatpng" + 0.011*"hello" + 0.011*"width2048"')]

In [10]:
"""
Visualize the LDA topic modeling
"""

# NOTE(review): the word-cloud plotting code below is entirely commented out,
# so this cell currently only echoes the string literal above it.
# If it is re-enabled, check the subplot grid first: plt.subplots(3, 2) creates
# 6 axes while num_topics is 5, so topics[i] would presumably run out of range
# on the last axis. TODO confirm and size the grid from len(topics).

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

'\nVisualize the LDA topic modeling\n'

# Read and preprocess the comments CSV

In [11]:
# Load the scraped comments of the configured subreddit. Only '\n' is treated
# as the record terminator when the file is parsed.
comments_csv_path = f"../data/raw/{subreddit}_comments.csv"
df_comments = pd.read_csv(comments_csv_path, lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [12]:
# Run the project's text pipeline over the 'comment' column.
# Judging by the preview below, the call adds the comment_word_token,
# comment_filtered and comment_stem columns to df_comments in place.
# NOTE(review): confirm against src/features/preprocess.py.
PreProcess.preprocess(df_comments, 'comment')
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,"This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


# Merge the comments and posts DataFrames

In [13]:
# Left-join the comments onto the posts by post id: a post row is repeated once
# per comment, and a post without comments keeps one row whose comment columns
# are NaN.
df_merge = df_posts.merge(df_comments, on='post_id', how='left')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...","This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...",I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


In [14]:
# Sanity counts on the merged frame (one row per post/comment pair).
# A NaN 'comment' marks a post that found no match in the left merge.
n_posts_without_comments = df_merge['comment'].isna().sum()
# Rows whose stemmed token list is empty are individual comments that lost all
# of their tokens during filtering, so they are reported as comments, not posts.
n_empty_comments = (df_merge['comment_stem'].str.len() == 0).sum()

print("There are {} posts with no comments".format(n_posts_without_comments))
print("There are {} comments with no content after filtering".format(n_empty_comments))

There are 45 posts with no comments
There are 21 posts with no content after filtering


# Generate LDA models for each post using comments

In [15]:
topics = []

# Keep only rows whose comment produced at least one stemmed token. Rows of
# posts without comments (NaN after the left merge) fail the comparison too and
# are dropped with them.
df_merge_dropped = df_merge[df_merge['comment_stem'].str.len() > 0]

# Fit one LDA model per post, using that post's comments as the documents.
# groupby(sort=False) yields each post's rows in order of first appearance (the
# order unique() gave) without re-scanning the whole frame for every post id.
for post_id, df_temp in df_merge_dropped.groupby('post_id', sort=False):
    posts = list(df_temp['comment_stem'])
    dictionary = corpora.Dictionary(posts)
    # Bag-of-words vector for every comment of this post (all of them are used).
    corpus = [dictionary.doc2bow(article) for article in posts]

    # random_state pins gensim's random initialisation so that re-running the
    # notebook reproduces the same topics.
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                                id2word=dictionary,
                                                passes=num_passes,
                                                random_state=42)
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id': post_id, 'topics': topic_results})
topics

[{'post_id': 'n2n0ax',
  'topics': [(0,
    '0.027*"year" + 0.027*"colleg" + 0.024*"comput" + 0.018*"go" + 0.018*"engin"'),
   (1,
    '0.023*"comput" + 0.017*"scienc" + 0.015*"get" + 0.015*"program" + 0.013*"year"'),
   (2,
    '0.069*"comput" + 0.061*"scienc" + 0.032*"start" + 0.020*"major" + 0.015*"learn"'),
   (3,
    '0.045*"hi" + 0.018*"work" + 0.016*"know" + 0.013*"everyon" + 0.011*"look"'),
   (4,
    '0.031*"get" + 0.028*"comput" + 0.026*"scienc" + 0.021*"program" + 0.021*"year"')]},
 {'post_id': 'qb4bof',
  'topics': [(0,
    '0.045*"help" + 0.045*"pretti" + 0.045*"laid" + 0.045*"place" + 0.045*"question"'),
   (1,
    '0.106*"need" + 0.056*"said" + 0.056*"see" + 0.056*"someon" + 0.030*"post"'),
   (2,
    '0.061*"got" + 0.061*"maaaaad" + 0.061*"mod" + 0.061*"time" + 0.061*"mean"'),
   (3,
    '0.030*"also" + 0.030*"nice" + 0.030*"homework" + 0.030*"either" + 0.030*"clarifi"'),
   (4,
    '0.049*"im" + 0.049*"laptop" + 0.049*"go" + 0.049*"cs" + 0.049*"mani"')]},
 {'post_id': 

# Generate a single LDA model for all comments

In [16]:
# One document per comment: the list of stemmed comment tokens, across all posts.
posts = df_merge_dropped['comment_stem'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [17]:
# Fit a single LDA topic model on the all-comments corpus built in the previous
# cell. random_state pins gensim's random initialisation; without it every run
# of this cell yields different topics and the notebook cannot be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.062*"comput" + 0.029*"scienc" + 0.023*"http" + 0.013*"engin" + 0.010*"like"'),
 (1, '0.015*"nt" + 0.012*"use" + 0.011*"ai" + 0.011*"learn" + 0.010*"know"'),
 (2, '0.019*"data" + 0.014*"n" + 0.013*"1" + 0.011*"bit" + 0.011*"use"'),
 (3,
  '0.021*"program" + 0.013*"math" + 0.011*"algorithm" + 0.011*"use" + 0.008*"http"'),
 (4,
  '0.014*"code" + 0.011*"nt" + 0.010*"like" + 0.010*"realli" + 0.010*"answer"')]

# Redo the above with lemmatization instead of stemming

In [18]:
# Re-run the project's text pipeline over both free-text columns, this time with
# lemm=True. The preview below shows *_tag and *_lemm columns on df_posts after
# the calls; the return value is not used.
for text_column in ('title', 'body'):
    PreProcess.preprocess(df_posts, text_column, lemm=True)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,title_tag,title_lemm,body_tag,body_lemm
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]","[new, programming, computer, science]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,...","[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[tech, support, sub, computer, recommendation,...","[tech, support, sub, comput, recommend, sub]","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, rtechsupport, computer, recomm...","[tech, support, rtechsupport, comput, recommen...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[tech, support, sub, computer, recommendation,...","[[(tech, NN)], [(support, NN)], [(rtechsupport...","[tech, support, rtechsupport, computer, recomm..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,"[Ways, to, learn, higher, level, math, through...","[ways, learn, higher, level, math, cs]","[way, learn, higher, level, math, cs]","[I, 'm, a, university, student, ,, and, while,...","[university, student, taken, math, classes, la...","[univers, student, taken, math, class, la, mul...","[[(ways, NNS)], [(learn, NN)], [(higher, JJR)]...","[way, learn, high, level, math, c]","[[(university, NN)], [(student, NN)], [(taken,...","[university, student, take, math, class, la, m..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,"[Why, is, there, nothing, between, 8bit, and, ...","[nothing, 8bit, 16bit]","[noth, 8bit, 16bit]","[For, example, ,, if, i, want, to, change, the...","[example, want, change, image, specifications,...","[exampl, want, chang, imag, specif, photoshop,...","[[(nothing, NN)], [(8bit, CD)], [(16bit, CD)]]","[nothing, 8bit, 16bit]","[[(example, NN)], [(want, NN)], [(change, NN)]...","[example, want, change, image, specification, ..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,"[Why, is, Internet/transmission, speed, in, Bi...","[internettransmission, speed, bit, storage, ca...","[internettransmiss, speed, bit, storag, capaci...","[I, am, always, wondering, about, this, .]","[always, wondering]","[alway, wonder]","[[(internettransmission, NN)], [(speed, NN)], ...","[internettransmission, speed, bit, storage, ca...","[[(always, RB)], [(wondering, VBG)]]","[always, wonder]"


##### LDA on title

In [19]:
# One document per post: the list of lemmatized title tokens.
posts = df_posts['title_lemm'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [20]:
# Fit an LDA topic model on the lemmatized-title corpus built in the previous
# cell. random_state pins gensim's random initialisation; without it every run
# of this cell yields different topics and the notebook cannot be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.020*"computer" + 0.012*"program" + 0.010*"way" + 0.010*"would" + 0.010*"write"'),
 (1,
  '0.037*"computer" + 0.020*"science" + 0.013*"algorithm" + 0.010*"work" + 0.010*"machine"'),
 (2,
  '0.021*"program" + 0.017*"algorithm" + 0.016*"good" + 0.016*"book" + 0.015*"learn"'),
 (3,
  '0.013*"hash" + 0.008*"cpu" + 0.008*"compute" + 0.008*"store" + 0.007*"get"'),
 (4,
  '0.019*"computer" + 0.016*"data" + 0.015*"memory" + 0.010*"question" + 0.010*"structure"')]

##### LDA on body

In [21]:
# One document per post: the list of lemmatized body tokens.
posts = df_posts['body_lemm'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [22]:
# Fit an LDA topic model on the lemmatized-body corpus built in the previous
# cell. random_state pins gensim's random initialisation; without it every run
# of this cell yields different topics and the notebook cannot be reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.017*"know" + 0.009*"say" + 0.009*"time" + 0.008*"understand" + 0.007*"nt"'),
 (1,
  '0.034*"http" + 0.021*"formatpng" + 0.021*"autowebp" + 0.018*"width2048" + 0.012*"n"'),
 (2,
  '0.011*"algorithm" + 0.010*"learn" + 0.010*"know" + 0.010*"2" + 0.010*"book"'),
 (3,
  '0.023*"computer" + 0.016*"science" + 0.012*"hi" + 0.009*"want" + 0.008*"algorithm"'),
 (4,
  '0.011*"computer" + 0.010*"question" + 0.009*"recently" + 0.008*"hello" + 0.008*"work"')]

##### LDA on comments

In [23]:
# Read the raw comments again so the lemmatization run starts from a fresh
# frame. Only '\n' is treated as the record terminator when the file is parsed.
comments_csv_path = f"../data/raw/{subreddit}_comments.csv"
df_comments = pd.read_csv(comments_csv_path, lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [24]:
# Run the project's text pipeline over the 'comment' column with lemm=True.
# Judging by the preview below, the call adds comment_tag and comment_lemm next
# to the token/filtered/stem columns on df_comments in place.
# NOTE(review): confirm against src/features/preprocess.py.
PreProcess.preprocess(df_comments, 'comment', lemm=True)
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem,comment_tag,comment_lemm
0,n2n0ax,How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","[late, late, start, career, program]"
1,n2n0ax,I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje...","[[(freshman, NN)], [(university, NN)], [(nt, N...","[freshman, university, nt, able, work, side, p..."
2,n2n0ax,I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ...","[[(still, RB)], [(highschool, NN)], [(really, ...","[still, highschool, really, interested, comput..."
3,n2n0ax,"This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca...","[[(probably, RB)], [(common, JJ)], [(question,...","[probably, common, question, well, cod, bootca..."
4,n2n0ax,I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]","[[(planning, NN)], [(starting, VBG)], [(cs, NN...","[planning, start, c, major, fall]"


In [25]:
# Left-join the lemmatized comments onto the posts by post id: a post row is
# repeated once per comment, and a post without comments keeps one row whose
# comment columns are NaN.
df_merge = df_posts.merge(df_comments, on='post_id', how='left')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,...,title_tag,title_lemm,body_tag,body_lemm,comment,comment_word_token,comment_filtered,comment_stem,comment_tag,comment_lemm
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]",...,"[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm...",How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","[late, late, start, career, program]"
1,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]",...,"[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm...",I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje...","[[(freshman, NN)], [(university, NN)], [(nt, N...","[freshman, university, nt, able, work, side, p..."
2,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]",...,"[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm...",I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ...","[[(still, RB)], [(highschool, NN)], [(really, ...","[still, highschool, really, interested, comput..."
3,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]",...,"[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm...","This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca...","[[(probably, RB)], [(common, JJ)], [(question,...","[probably, common, question, well, cod, bootca..."
4,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New, to, programming, or, computer, science, ?]",...,"[[(new, JJ)], [(programming, VBG)], [(computer...","[new, program, computer, science]","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[previous, thread, finally, archive, 500, comm...",I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]","[[(planning, NN)], [(starting, VBG)], [(cs, NN...","[planning, start, c, major, fall]"


In [26]:
# Sanity counts on the merged frame (one row per post/comment pair).
# A NaN 'comment' marks a post that found no match in the left merge.
n_posts_without_comments = df_merge['comment'].isna().sum()
# This section models the lemmatized tokens, so the emptiness check looks at
# comment_lemm (the column the next cell filters on) rather than comment_stem.
# Each matching row is one comment that lost all of its tokens during filtering.
n_empty_comments = (df_merge['comment_lemm'].str.len() == 0).sum()

print("There are {} posts with no comments".format(n_posts_without_comments))
print("There are {} comments with no content after filtering".format(n_empty_comments))

There are 45 posts with no comments
There are 21 posts with no content after filtering


##### LDA for comments in each post

In [27]:
topics = []

# Keep only rows whose comment produced at least one lemmatized token. Rows of
# posts without comments (NaN after the left merge) fail the comparison too and
# are dropped with them.
df_merge_dropped = df_merge[df_merge['comment_lemm'].str.len() > 0]

# Fit one LDA model per post, using that post's comments as the documents.
# groupby(sort=False) yields each post's rows in order of first appearance (the
# order unique() gave) without re-scanning the whole frame for every post id.
for post_id, df_temp in df_merge_dropped.groupby('post_id', sort=False):
    posts = list(df_temp['comment_lemm'])
    dictionary = corpora.Dictionary(posts)
    # Bag-of-words vector for every comment of this post (all of them are used).
    corpus = [dictionary.doc2bow(article) for article in posts]

    # random_state pins gensim's random initialisation so that re-running the
    # notebook reproduces the same topics.
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                                id2word=dictionary,
                                                passes=num_passes,
                                                random_state=42)
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id': post_id, 'topics': topic_results})
topics

[{'post_id': 'n2n0ax',
  'topics': [(0,
    '0.017*"graduate" + 0.016*"engineering" + 0.015*"computer" + 0.014*"course" + 0.014*"recommend"'),
   (1,
    '0.049*"hi" + 0.022*"career" + 0.022*"c" + 0.018*"everyone" + 0.018*"start"'),
   (2,
    '0.017*"program" + 0.017*"degree" + 0.014*"learn" + 0.012*"c" + 0.012*"would"'),
   (3,
    '0.021*"c" + 0.021*"get" + 0.020*"look" + 0.015*"major" + 0.013*"help"'),
   (4,
    '0.072*"computer" + 0.068*"science" + 0.021*"college" + 0.021*"year" + 0.020*"learn"')]},
 {'post_id': 'qb4bof',
  'topics': [(0,
    '0.073*"discord" + 0.073*"active" + 0.073*"still" + 0.012*"mean" + 0.012*"correct"'),
   (1,
    '0.057*"see" + 0.057*"post" + 0.057*"need" + 0.031*"laptop" + 0.031*"first"'),
   (2,
    '0.071*"need" + 0.048*"say" + 0.048*"help" + 0.048*"someone" + 0.048*"get"'),
   (3,
    '0.062*"feel" + 0.062*"relative" + 0.062*"despite" + 0.062*"think" + 0.062*"time"'),
   (4,
    '0.072*"pretty" + 0.040*"get" + 0.039*"people" + 0.039*"laid" + 0.039*"pl

##### LDA for all comments

In [28]:
# One document per comment: the list of lemmatized comment tokens, across all posts.
posts = df_merge_dropped['comment_lemm'].tolist()
# Vocabulary that maps every distinct token to an integer id.
dictionary = corpora.Dictionary(posts)
# Bag-of-words vector for every document; the whole list is used, nothing is held out.
corpus = list(map(dictionary.doc2bow, posts))

In [29]:
# Fit a single LDA topic model on the lemmatized all-comments corpus built in
# the previous cell. random_state pins gensim's random initialisation; without
# it every run of this cell yields different topics and the notebook cannot be
# reproduced.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# Show the num_words highest-weighted words of each topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0, '0.014*"like" + 0.013*"know" + 0.013*"nt" + 0.009*"use" + 0.009*"yes"'),
 (1,
  '0.022*"computer" + 0.013*"http" + 0.012*"science" + 0.010*"system" + 0.009*"think"'),
 (2,
  '0.035*"http" + 0.016*"n" + 0.015*"program" + 0.014*"language" + 0.013*"problem"'),
 (3, '0.014*"would" + 0.012*"use" + 0.012*"1" + 0.012*"get" + 0.011*"learn"'),
 (4,
  '0.034*"computer" + 0.022*"science" + 0.017*"math" + 0.010*"program" + 0.009*"want"')]