In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re
from gensim import corpora, models
import gensim

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/and

# Read the posts csv file

In [2]:
subreddit = "computerscience"

In [3]:
df_posts = pd.read_csv("../data/raw/" + subreddit + "_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


# Preprocess

In [4]:
preprocessor = PreProcess()

preprocessor.preprocess(df_posts, 'title', lemm=True)
preprocessor.preprocess(df_posts, 'body', lemm=True)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_tag,body_word_token,body_tag
0,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ..."
1,qb4bof,this is not a tech support sub or a computer r...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,tech support: /r/techsupportcomputer recommend...,1634619000.0,"[tech, support, sub, computer, recommendation,...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[tech, support, rtechsupportcomputer, recommen...","[[(tech, NN)], [(support, NN)], [(rtechsupport..."
2,s8afrp,ways to learn higher level math through cs?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"i am a university student, and while i have ta...",1642654000.0,"[way, learn, high, level, math, c]","[[(ways, NNS)], [(learn, NN)], [(higher, JJR)]...","[university, student, take, math, class, la, m...","[[(university, NN)], [(student, NN)], [(taken,..."
3,s7yrsa,why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"for example, if i want to change the image spe...",1642622000.0,"[nothing, 8bit, 16bit]","[[(nothing, NN)], [(8bit, CD)], [(16bit, CD)]]","[example, want, change, image, specification, ...","[[(example, NN)], [(want, NN)], [(change, NN)]..."
4,s7li8v,why is internet/transmission speed in bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,i am always wondering about this. why internet...,1642581000.0,"[internettransmission, speed, bit, storage, ca...","[[(internettransmission, NN)], [(speed, NN)], ...","[always, wonder, internettransmission, speed, ...","[[(always, RB)], [(wondering, VBG)], [(interne..."


# Compute Vader polarity scores for title and body of posts

In [5]:
analyzer = SentimentIntensityAnalyzer()


df_posts['title_polarity'] = df_posts['title'].apply(analyzer.polarity_scores)
df_posts['body_polarity'] = df_posts['body'].apply(analyzer.polarity_scores)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_tag,body_word_token,body_tag,title_polarity,body_polarity
0,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com..."
1,qb4bof,this is not a tech support sub or a computer r...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,tech support: /r/techsupportcomputer recommend...,1634619000.0,"[tech, support, sub, computer, recommendation,...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[tech, support, rtechsupportcomputer, recommen...","[[(tech, NN)], [(support, NN)], [(rtechsupport...","{'neg': 0.079, 'neu': 0.84, 'pos': 0.081, 'com...","{'neg': 0.026, 'neu': 0.898, 'pos': 0.075, 'co..."
2,s8afrp,ways to learn higher level math through cs?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"i am a university student, and while i have ta...",1642654000.0,"[way, learn, high, level, math, c]","[[(ways, NNS)], [(learn, NN)], [(higher, JJR)]...","[university, student, take, math, class, la, m...","[[(university, NN)], [(student, NN)], [(taken,...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.033, 'neu': 0.92, 'pos': 0.047, 'com..."
3,s7yrsa,why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"for example, if i want to change the image spe...",1642622000.0,"[nothing, 8bit, 16bit]","[[(nothing, NN)], [(8bit, CD)], [(16bit, CD)]]","[example, want, change, image, specification, ...","[[(example, NN)], [(want, NN)], [(change, NN)]...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.978, 'pos': 0.022, 'comp..."
4,s7li8v,why is internet/transmission speed in bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,i am always wondering about this. why internet...,1642581000.0,"[internettransmission, speed, bit, storage, ca...","[[(internettransmission, NN)], [(speed, NN)], ...","[always, wonder, internettransmission, speed, ...","[[(always, RB)], [(wondering, VBG)], [(interne...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.04, 'neu': 0.905, 'pos': 0.055, 'com..."


# Topic Modeling (LDA)

LDA Config Settings

In [6]:
# Number of topics to generate
num_topics = 5
# Number of passes the LDA should run
num_passes = 100

# Number of words to print out
num_words = 5

LDA on titles of all posts

##### LDA on title

In [7]:
posts = list(df_posts['title_word_token'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [8]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.026*"computer" + 0.016*"learn" + 0.014*"algorithm" + 0.014*"book" + 0.012*"question"'),
 (1,
  '0.019*"program" + 0.010*"use" + 0.009*"cpu" + 0.006*"find" + 0.006*"question"'),
 (2,
  '0.029*"computer" + 0.016*"data" + 0.012*"algorithm" + 0.012*"software" + 0.011*"structure"'),
 (3,
  '0.020*"computer" + 0.018*"machine" + 0.015*"work" + 0.011*"get" + 0.010*"anyone"'),
 (4,
  '0.018*"computer" + 0.010*"get" + 0.009*"science" + 0.008*"program" + 0.007*"study"')]

##### LDA on body

In [9]:
posts = list(df_posts['body_word_token'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [10]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.012*"would" + 0.009*"use" + 0.007*"make" + 0.007*"computer" + 0.006*"could"'),
 (1,
  '0.014*"memory" + 0.008*"address" + 0.007*"like" + 0.007*"data" + 0.006*"know"'),
 (2, '0.015*"n" + 0.009*"1" + 0.009*"0" + 0.008*"case" + 0.008*"2"'),
 (3,
  '0.012*"learn" + 0.011*"like" + 0.011*"know" + 0.010*"would" + 0.009*"computer"'),
 (4,
  '0.021*"abazbz2abz" + 0.017*"computer" + 0.009*"time" + 0.008*"use" + 0.007*"2cd"')]

##### LDA on comments

In [11]:
df_comments = pd.read_csv("../data/raw/" + subreddit + "_comments.csv", lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [12]:
preprocessor.preprocess(df_comments, 'comment', lemm=True)
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_tag
0,n2n0ax,how late is too late to start a career in prog...,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(..."
1,n2n0ax,i am a freshman at a university and have not b...,"[freshman, university, able, work, side, proje...","[[(freshman, NN)], [(university, NN)], [(able,..."
2,n2n0ax,i am still in highschool but really interested...,"[still, highschool, really, interested, comput...","[[(still, RB)], [(highschool, NN)], [(really, ..."
3,n2n0ax,"this is probably a common question, but how we...","[probably, common, question, well, cod, bootca...","[[(probably, RB)], [(common, JJ)], [(question,..."
4,n2n0ax,i am planning on starting a cs major this fall...,"[planning, start, c, major, fall, academically...","[[(planning, NN)], [(starting, VBG)], [(cs, NN..."


##### Compute polarity of comments

In [13]:
df_comments['comment_polarity'] = df_comments['comment'].apply(analyzer.polarity_scores)
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_tag,comment_polarity
0,n2n0ax,how late is too late to start a career in prog...,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,n2n0ax,i am a freshman at a university and have not b...,"[freshman, university, able, work, side, proje...","[[(freshman, NN)], [(university, NN)], [(able,...","{'neg': 0.085, 'neu': 0.84, 'pos': 0.075, 'com..."
2,n2n0ax,i am still in highschool but really interested...,"[still, highschool, really, interested, comput...","[[(still, RB)], [(highschool, NN)], [(really, ...","{'neg': 0.0, 'neu': 0.828, 'pos': 0.172, 'comp..."
3,n2n0ax,"this is probably a common question, but how we...","[probably, common, question, well, cod, bootca...","[[(probably, RB)], [(common, JJ)], [(question,...","{'neg': 0.02, 'neu': 0.849, 'pos': 0.131, 'com..."
4,n2n0ax,i am planning on starting a cs major this fall...,"[planning, start, c, major, fall, academically...","[[(planning, NN)], [(starting, VBG)], [(cs, NN...","{'neg': 0.0, 'neu': 0.968, 'pos': 0.032, 'comp..."


##### Merge posts and comments together

In [14]:
df_merge = df_posts.merge(df_comments, how='left', left_on='post_id', right_on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_tag,body_word_token,body_tag,title_polarity,body_polarity,comment,comment_word_token,comment_tag,comment_polarity
0,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",how late is too late to start a career in prog...,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",i am a freshman at a university and have not b...,"[freshman, university, able, work, side, proje...","[[(freshman, NN)], [(university, NN)], [(able,...","{'neg': 0.085, 'neu': 0.84, 'pos': 0.075, 'com..."
2,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",i am still in highschool but really interested...,"[still, highschool, really, interested, comput...","[[(still, RB)], [(highschool, NN)], [(really, ...","{'neg': 0.0, 'neu': 0.828, 'pos': 0.172, 'comp..."
3,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...","this is probably a common question, but how we...","[probably, common, question, well, cod, bootca...","[[(probably, RB)], [(common, JJ)], [(question,...","{'neg': 0.02, 'neu': 0.849, 'pos': 0.131, 'com..."
4,n2n0ax,new to programming or computer science? want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",i am planning on starting a cs major this fall...,"[planning, start, c, major, fall, academically...","[[(planning, NN)], [(starting, VBG)], [(cs, NN...","{'neg': 0.0, 'neu': 0.968, 'pos': 0.032, 'comp..."


In [15]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))
print("There are {} posts with no content after filtering".format(len(df_merge[df_merge['comment_word_token'].str.len() == 0])))

There are 45 posts with no comments
There are 20 posts with no content after filtering


##### LDA for all comments

In [16]:
# Drop the posts that do not have any comments
df_merge_dropped = df_merge[df_merge['comment_word_token'].str.len() > 0]

posts = list(df_merge_dropped['comment_word_token'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [17]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.020*"computer" + 0.012*"learn" + 0.012*"program" + 0.011*"like" + 0.011*"science"'),
 (1,
  '0.010*"language" + 0.008*"hash" + 0.007*"code" + 0.007*"password" + 0.006*"use"'),
 (2, '0.027*"n" + 0.013*"1" + 0.012*"2" + 0.010*"algorithm" + 0.008*"loop"'),
 (3,
  '0.013*"cpu" + 0.010*"use" + 0.010*"memory" + 0.010*"time" + 0.009*"instruction"'),
 (4,
  '0.012*"data" + 0.012*"use" + 0.010*"algorithm" + 0.009*"number" + 0.009*"would"')]