In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk

# Download the NLTK data packages one after another, in the same order as
# before. Per the log below, packages that are already present are simply
# reported as up-to-date.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger',
                      'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re
from gensim import corpora, models
import gensim

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /home/ajz55/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ajz55/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ajz55/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ajz55/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ajz55/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/ajz55/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ajz55/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /h

# Read the posts CSV file

In [2]:
# Name of the subreddit to analyse; it is spliced into the CSV file names
# that the notebook reads.
subreddit = 'computerscience'

In [3]:
# Location of the scraped "hot posts" file for the selected subreddit
# (relative path, so it resolves against the notebook's working directory).
posts_csv_path = "../data/raw/" + subreddit + "_hot_posts.csv"

df_posts = pd.read_csv(posts_csv_path)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,415,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,56,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,skltie,Confusion Between Different Types of Optimizat...,10,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,2,I do not have a background in optimization and...,1644002000.0
3,sk7puv,Behaviour of TCP and UDP traffic on the same n...,22,0.91,computerscience,https://www.reddit.com/r/computerscience/comme...,4,"So, as we all know , one of the main advantage...",1643961000.0
4,skp5fm,Equivalent propositions to P=NP that are not c...,1,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1,"Hello all,\n\nSo obviously, existence of a pol...",1644011000.0


# Preprocess

In [4]:
preprocessor = PreProcess()

# Run the project's preprocessing on both text columns of the posts frame.
# Judging by the preview below, each call changes df_posts in place: the text
# column is normalised (e.g. lower-cased) and `<column>_word_token` and
# `<column>_tag` columns are added.
for text_column in ('title', 'body'):
    preprocessor.preprocess(df_posts, text_column, lemm=True)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_tag,body_word_token,body_tag
0,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ..."
1,qb4bof,this is not a tech support sub or a computer r...,415,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,56,tech support: /r/techsupportcomputer recommend...,1634619000.0,"[tech, support, sub, computer, recommendation,...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[tech, support, rtechsupportcomputer, recommen...","[[(tech, NN)], [(support, NN)], [(rtechsupport..."
2,skltie,confusion between different types of optimizat...,10,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,2,i do not have a background in optimization and...,1644002000.0,"[confusion, different, type, optimization, pro...","[[(confusion, NN)], [(different, JJ)], [(types...","[background, optimization, try, teach, topic, ...","[[(background, NN)], [(optimization, NN)], [(t..."
3,sk7puv,behaviour of tcp and udp traffic on the same n...,22,0.91,computerscience,https://www.reddit.com/r/computerscience/comme...,4,"so, as we all know , one of the main advantage...",1643961000.0,"[behaviour, tcp, udp, traffic, network]","[[(behaviour, NN)], [(tcp, NN)], [(udp, NN)], ...","[know, one, main, advantage, tcp, udp, flow, c...","[[(know, VB)], [(one, CD)], [(main, JJ)], [(ad..."
4,skp5fm,equivalent propositions to p=np that are not c...,1,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1,"hello all,so obviously, existence of a polynom...",1644011000.0,"[equivalent, proposition, pnp, complexity, the...","[[(equivalent, NN)], [(propositions, NNS)], [(...","[hello, obviously, existence, polynomial, time...","[[(hello, NN)], [(obviously, RB)], [(existence..."


# Compute VADER polarity scores for the title and body of posts

In [5]:
analyzer = SentimentIntensityAnalyzer()

# Score each text column and store the result next to it. Every cell of the
# new column holds the dict returned by polarity_scores (the preview shows
# the keys 'neg', 'neu', 'pos', ...).
# NOTE(review): the text scored here has already been preprocessed
# (lower-cased); VADER is documented to use capitalisation as an intensity
# cue, so scoring the raw text may give different values - confirm intent.
for text_column, polarity_column in (('title', 'title_polarity'),
                                     ('body', 'body_polarity')):
    df_posts[polarity_column] = df_posts[text_column].apply(analyzer.polarity_scores)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_tag,body_word_token,body_tag,title_polarity,body_polarity
0,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...","[[(new, JJ)], [(programming, VBG)], [(computer...","[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com..."
1,qb4bof,this is not a tech support sub or a computer r...,415,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,56,tech support: /r/techsupportcomputer recommend...,1634619000.0,"[tech, support, sub, computer, recommendation,...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[tech, support, rtechsupportcomputer, recommen...","[[(tech, NN)], [(support, NN)], [(rtechsupport...","{'neg': 0.079, 'neu': 0.84, 'pos': 0.081, 'com...","{'neg': 0.026, 'neu': 0.898, 'pos': 0.075, 'co..."
2,skltie,confusion between different types of optimizat...,10,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,2,i do not have a background in optimization and...,1644002000.0,"[confusion, different, type, optimization, pro...","[[(confusion, NN)], [(different, JJ)], [(types...","[background, optimization, try, teach, topic, ...","[[(background, NN)], [(optimization, NN)], [(t...","{'neg': 0.426, 'neu': 0.348, 'pos': 0.226, 'co...","{'neg': 0.174, 'neu': 0.663, 'pos': 0.164, 'co..."
3,sk7puv,behaviour of tcp and udp traffic on the same n...,22,0.91,computerscience,https://www.reddit.com/r/computerscience/comme...,4,"so, as we all know , one of the main advantage...",1643961000.0,"[behaviour, tcp, udp, traffic, network]","[[(behaviour, NN)], [(tcp, NN)], [(udp, NN)], ...","[know, one, main, advantage, tcp, udp, flow, c...","[[(know, VB)], [(one, CD)], [(main, JJ)], [(ad...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.051, 'neu': 0.912, 'pos': 0.037, 'co..."
4,skp5fm,equivalent propositions to p=np that are not c...,1,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1,"hello all,so obviously, existence of a polynom...",1644011000.0,"[equivalent, proposition, pnp, complexity, the...","[[(equivalent, NN)], [(propositions, NNS)], [(...","[hello, obviously, existence, polynomial, time...","[[(hello, NN)], [(obviously, RB)], [(existence...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.055, 'neu': 0.936, 'pos': 0.009, 'co..."


# Topic Modeling (LDA)

LDA Config Settings

In [6]:
# Settings shared by every LDA fit in this notebook.
num_topics = 5    # how many topics each model generates
num_passes = 100  # how many passes the LDA training makes
num_words = 5     # how many words are printed per topic

LDA on titles of all posts

##### LDA on title

In [7]:
# One token list per post title, taken from the preprocessing output.
posts = df_posts['title_word_token'].tolist()

# Vocabulary built from all title tokens.
dictionary = corpora.Dictionary(posts)

# Bag-of-words representation of every title (all posts are included,
# none is held out).
corpus = [dictionary.doc2bow(tokens) for tokens in posts]

In [8]:
# Fit LDA on the title corpus built in the previous cell.
# random_state pins the model's random number generator: LDA starts from a
# random initialisation, so without a fixed seed the topics change on every
# re-run and the notebook is not reproducible.
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                            id2word=dictionary,
                                            passes=num_passes,
                                            random_state=42)

# (topic_id, "weight*word + ...") pairs with the top `num_words` words per topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.011*"algorithm" + 0.011*"system" + 0.011*"computer" + 0.010*"work" + 0.009*"theory"'),
 (1,
  '0.023*"book" + 0.018*"computer" + 0.016*"question" + 0.013*"algorithm" + 0.013*"program"'),
 (2,
  '0.036*"computer" + 0.021*"science" + 0.016*"program" + 0.011*"use" + 0.010*"would"'),
 (3,
  '0.016*"data" + 0.015*"learn" + 0.014*"algorithm" + 0.012*"computer" + 0.010*"software"'),
 (4,
  '0.013*"computer" + 0.011*"data" + 0.009*"learn" + 0.009*"find" + 0.009*"science"')]

##### LDA on body

In [9]:
# One token list per post body, taken from the preprocessing output.
posts = df_posts['body_word_token'].tolist()

# Vocabulary built from all body tokens.
dictionary = corpora.Dictionary(posts)

# Bag-of-words representation of every body (all posts are included,
# none is held out).
corpus = [dictionary.doc2bow(tokens) for tokens in posts]

In [10]:
# Fit LDA on the body corpus built in the previous cell.
# random_state pins the model's random number generator: LDA starts from a
# random initialisation, so without a fixed seed the topics change on every
# re-run and the notebook is not reproducible.
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                            id2word=dictionary,
                                            passes=num_passes,
                                            random_state=42)

# (topic_id, "weight*word + ...") pairs with the top `num_words` words per topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0, '0.007*"make" + 0.006*"x" + 0.006*"like" + 0.006*"use" + 0.006*"server"'),
 (1,
  '0.010*"know" + 0.009*"bill" + 0.008*"ernie" + 0.008*"tony" + 0.008*"oscar"'),
 (2,
  '0.016*"2520" + 0.010*"computer" + 0.009*"question" + 0.008*"problem" + 0.007*"250a"'),
 (3,
  '0.013*"like" + 0.012*"computer" + 0.011*"would" + 0.009*"know" + 0.009*"learn"'),
 (4,
  '0.016*"abazbz2abz" + 0.015*"n" + 0.010*"time" + 0.009*"algorithm" + 0.008*"1"')]

##### LDA on comments

In [11]:
# Location of the scraped "hot comments" file for the selected subreddit.
comments_csv_path = "../data/raw/" + subreddit + "_hot_comments.csv"

# lineterminator is kept as '\n' - presumably so that other line-break
# characters inside comment text do not split a record; confirm against the
# scraper that wrote the file.
df_comments = pd.read_csv(comments_csv_path, lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment,comment_id,parent_id,created,is_submitter
0,n2n0ax,How late is too late to start a career in prog...,gwkmb17,t3_n2n0ax,1619895000.0,False
1,n2n0ax,You’ll be 40 in a few years anyways. Why not t...,gwlbcha,t1_gwkmb17,1619906000.0,False
2,n2n0ax,that is like the smartest thing Ive read in a ...,h06cu2y,t1_gwlbcha,1622541000.0,False
3,n2n0ax,Hang in there!!!,h9d1o4m,t1_h06cu2y,1629252000.0,False
4,n2n0ax,I'm so glad that I'm still seeing people repea...,h4oqtdr,t1_gwlbcha,1625911000.0,False


In [12]:
# Apply the same project preprocessing to the comment text. Judging by the
# preview below, the call changes df_comments in place: the 'comment' column
# is normalised (lower-cased, contractions such as "You'll" expanded) and
# 'comment_word_token' / 'comment_tag' columns are added.
preprocessor.preprocess(df_comments, 'comment', lemm=True)
df_comments.head()

Unnamed: 0,post_id,comment,comment_id,parent_id,created,is_submitter,comment_word_token,comment_tag
0,n2n0ax,how late is too late to start a career in prog...,gwkmb17,t3_n2n0ax,1619895000.0,False,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(..."
1,n2n0ax,you will be 40 in a few years anyways. why not...,gwlbcha,t1_gwkmb17,1619906000.0,False,"[40, year, anyways, try, get, degree, way]","[[(40, CD)], [(years, NNS)], [(anyways, NNS)],..."
2,n2n0ax,that is like the smartest thing i have read in...,h06cu2y,t1_gwlbcha,1622541000.0,False,"[like, smartest, thing, read, long, time]","[[(like, IN)], [(smartest, NN)], [(thing, NN)]..."
3,n2n0ax,hang in there!!!,h9d1o4m,t1_h06cu2y,1629252000.0,False,[hang],"[[(hang, NN)]]"
4,n2n0ax,i am so glad that i am still seeing people rep...,h4oqtdr,t1_gwlbcha,1625911000.0,False,"[glad, still, see, people, repeat, year, first...","[[(glad, NN)], [(still, RB)], [(seeing, VBG)],..."


##### Compute polarity of comments

In [13]:
# Polarity dict for every comment, computed with the analyzer created for
# the posts; stored as a new column on the comments frame.
comment_scores = df_comments['comment'].apply(analyzer.polarity_scores)
df_comments['comment_polarity'] = comment_scores
df_comments.head()

Unnamed: 0,post_id,comment,comment_id,parent_id,created,is_submitter,comment_word_token,comment_tag,comment_polarity
0,n2n0ax,how late is too late to start a career in prog...,gwkmb17,t3_n2n0ax,1619895000.0,False,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,n2n0ax,you will be 40 in a few years anyways. why not...,gwlbcha,t1_gwkmb17,1619906000.0,False,"[40, year, anyways, try, get, degree, way]","[[(40, CD)], [(years, NNS)], [(anyways, NNS)],...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,n2n0ax,that is like the smartest thing i have read in...,h06cu2y,t1_gwlbcha,1622541000.0,False,"[like, smartest, thing, read, long, time]","[[(like, IN)], [(smartest, NN)], [(thing, NN)]...","{'neg': 0.0, 'neu': 0.629, 'pos': 0.371, 'comp..."
3,n2n0ax,hang in there!!!,h9d1o4m,t1_h06cu2y,1629252000.0,False,[hang],"[[(hang, NN)]]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,n2n0ax,i am so glad that i am still seeing people rep...,h4oqtdr,t1_gwlbcha,1625911000.0,False,"[glad, still, see, people, repeat, year, first...","[[(glad, NN)], [(still, RB)], [(seeing, VBG)],...","{'neg': 0.0, 'neu': 0.897, 'pos': 0.103, 'comp..."


##### Merge posts and comments together

In [14]:
# Left join on the shared 'post_id' key: every post is kept, and a post
# without comments still appears once with NaN in the comment columns.
# Column names present in both frames (e.g. 'created') get _x/_y suffixes.
df_merge = pd.merge(df_posts, df_comments, how='left', on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created_x,title_word_token,...,title_polarity,body_polarity,comment,comment_id,parent_id,created_y,is_submitter,comment_word_token,comment_tag,comment_polarity
0,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...",...,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",how late is too late to start a career in prog...,gwkmb17,t3_n2n0ax,1619895000.0,False,"[late, late, start, career, program, 40, do, d...","[[(late, RB)], [(late, RB)], [(start, NN)], [(...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...",...,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",you will be 40 in a few years anyways. why not...,gwlbcha,t1_gwkmb17,1619906000.0,False,"[40, year, anyways, try, get, degree, way]","[[(40, CD)], [(years, NNS)], [(anyways, NNS)],...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...",...,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",that is like the smartest thing i have read in...,h06cu2y,t1_gwlbcha,1622541000.0,False,"[like, smartest, thing, read, long, time]","[[(like, IN)], [(smartest, NN)], [(thing, NN)]...","{'neg': 0.0, 'neu': 0.629, 'pos': 0.371, 'comp..."
3,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...",...,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",hang in there!!!,h9d1o4m,t1_h06cu2y,1629252000.0,False,[hang],"[[(hang, NN)]]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,n2n0ax,new to programming or computer science? want a...,408,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1080,the previous thread was finally archived with ...,1619890000.0,"[new, program, computer, science, want, advice...",...,"{'neg': 0.0, 'neu': 0.885, 'pos': 0.115, 'comp...","{'neg': 0.02, 'neu': 0.855, 'pos': 0.125, 'com...",i am so glad that i am still seeing people rep...,h4oqtdr,t1_gwlbcha,1625911000.0,False,"[glad, still, see, people, repeat, year, first...","[[(glad, NN)], [(still, RB)], [(seeing, VBG)],...","{'neg': 0.0, 'neu': 0.897, 'pos': 0.103, 'comp..."


In [15]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))
print("There are {} posts with no content after filtering".format(len(df_merge[df_merge['comment_word_token'].str.len() == 0])))

There are 42 posts with no comments
There are 45 posts with no content after filtering


##### LDA for all comments

In [16]:
# Drop the posts that do not have any comments
df_merge_dropped = df_merge[df_merge['comment_word_token'].str.len() > 0]

posts = list(df_merge_dropped['comment_word_token'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [17]:
# Fit LDA on the comment corpus built in the previous cell.
# random_state pins the model's random number generator: LDA starts from a
# random initialisation, so without a fixed seed the topics change on every
# re-run and the notebook is not reproducible.
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                            id2word=dictionary,
                                            passes=num_passes,
                                            random_state=42)

# (topic_id, "weight*word + ...") pairs with the top `num_words` words per topic.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.010*"software" + 0.009*"like" + 0.008*"thanks" + 0.008*"work" + 0.007*"job"'),
 (1, '0.016*"n" + 0.013*"1" + 0.012*"number" + 0.011*"data" + 0.009*"bit"'),
 (2,
  '0.032*"computer" + 0.021*"science" + 0.016*"c" + 0.012*"math" + 0.011*"would"'),
 (3,
  '0.011*"use" + 0.010*"would" + 0.010*"learn" + 0.009*"like" + 0.008*"algorithm"'),
 (4,
  '0.007*"use" + 0.007*"people" + 0.007*"key" + 0.006*"nft" + 0.005*"value"')]