In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Read the posts csv file

In [2]:
df_posts = pd.read_csv("../data/raw/computerscience_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


# Proprocess the text

#### Tokenize sentences

In [3]:
# Tokenize the title
df_posts['title'] = df_posts['title'].fillna('')
df_posts['title_token'] = df_posts['title'].apply(sent_tokenize)
df_posts['body'] = df_posts['body'].fillna('')
df_posts['body_token'] = df_posts['body'].apply(sent_tokenize)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,[Ways to learn higher level math through CS?],"[I'm a university student, and while I've take..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,[Why is there nothing between 8bit and 16bit?],"[For example, if i want to change the image sp..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,[Why is Internet/transmission speed in Bit and...,"[I am always wondering about this., Why Intern..."


#### Tokenize words

In [4]:
def word_tokenize_helper(s):
    if len(s)==0:
        return word_tokenize('')
    else:
        return word_tokenize(s[0])

df_posts['title_word_tokens'] = df_posts['title_token'].apply(lambda s: word_tokenize_helper(s))
df_posts['body_word_tokens'] = df_posts['body_token'].apply(lambda s: word_tokenize_helper(s))
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_word_tokens,body_word_tokens
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,"[New, to, programming, or, computer, science, ?]","[The, previous, thread, was, finally, archived..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[Tech, Support, :, /r/techsupport, Computer, R..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,[Ways to learn higher level math through CS?],"[I'm a university student, and while I've take...","[Ways, to, learn, higher, level, math, through...","[I, 'm, a, university, student, ,, and, while,..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,[Why is there nothing between 8bit and 16bit?],"[For example, if i want to change the image sp...","[Why, is, there, nothing, between, 8bit, and, ...","[For, example, ,, if, i, want, to, change, the..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,[Why is Internet/transmission speed in Bit and...,"[I am always wondering about this., Why Intern...","[Why, is, Internet/transmission, speed, in, Bi...","[I, am, always, wondering, about, this, .]"


### Clean up words

#### Filter punctuation and stopwords

In [5]:
stop_words = set(stopwords.words('english'))

def get_clean_sentences(sentences, remove_digits=False):
    '''Cleaning sentences by removing special characters and optionally digits'''
    clean_sentences = []
    for sent in sentences:
        pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]' 
        clean_text = re.sub(pattern, '', sent)
        clean_text = clean_text.lower()  # Converting to lower case
        clean_sentences.append(clean_text)
    # print('\nClean sentences:', clean_sentences)
    return clean_sentences

def filter_stopwords(words):
    '''Removing stopwords from given words'''
    filtered_words = [w for w in words if w not in stop_words]
    # print('\nFiltered words:', filtered_words)
    return filtered_words

In [6]:
df_posts['title_filtered'] = df_posts['title_word_tokens'].apply(get_clean_sentences)
df_posts['title_filtered'] = df_posts['title_filtered'].apply(filter_stopwords)
df_posts['title_filtered'] = df_posts['title_filtered'].apply(lambda s: list(filter(None, s)))

df_posts['body_filtered'] = df_posts['body_word_tokens'].apply(get_clean_sentences)
df_posts['body_filtered'] = df_posts['body_filtered'].apply(filter_stopwords)
df_posts['body_filtered'] = df_posts['body_filtered'].apply(lambda s: list(filter(None, s)))

In [7]:
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_word_tokens,body_word_tokens,title_filtered,body_filtered
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,"[New, to, programming, or, computer, science, ?]","[The, previous, thread, was, finally, archived...","[new, programming, computer, science]","[previous, thread, finally, archived, 500, com..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, sub, computer, recommendation,...","[tech, support, rtechsupport, computer, recomm..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,[Ways to learn higher level math through CS?],"[I'm a university student, and while I've take...","[Ways, to, learn, higher, level, math, through...","[I, 'm, a, university, student, ,, and, while,...","[ways, learn, higher, level, math, cs]","[university, student, taken, math, classes, la..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,[Why is there nothing between 8bit and 16bit?],"[For example, if i want to change the image sp...","[Why, is, there, nothing, between, 8bit, and, ...","[For, example, ,, if, i, want, to, change, the...","[nothing, 8bit, 16bit]","[example, want, change, image, specifications,..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,[Why is Internet/transmission speed in Bit and...,"[I am always wondering about this., Why Intern...","[Why, is, Internet/transmission, speed, in, Bi...","[I, am, always, wondering, about, this, .]","[internettransmission, speed, bit, storage, ca...","[always, wondering]"


### Leaving section for stemming and lemming

#### Stemming

In [8]:
# def get_stems(words):
#     '''Reduce the words to their base word (stem) by cutting off the ends'''
#     ps = PorterStemmer()
#     stems = []
#     for word in words:
#         stems.append(ps.stem(word))
#     # print(stems)
#     return stems

In [9]:
# df_posts['title_stem'] = df_posts['title_filtered'].apply(get_stems)
# df_posts['body_stem'] = df_posts['body_filtered'].apply(get_stems)
# df_posts.head()

#### Lemmatization

##### POS tagging

In [10]:
def get_pos_tags(words):
    '''
    Get the part of speech (POS) tags for the words
    '''
    tags=[]
    for word in words:
        tags.append(nltk.pos_tag([word]))
    return tags

In [11]:
df_posts['title_tag'] = df_posts['title_filtered'].apply(get_pos_tags)
df_posts['body_tag'] = df_posts['body_filtered'].apply(get_pos_tags)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_word_tokens,body_word_tokens,title_filtered,body_filtered,title_tag,body_tag
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,"[New, to, programming, or, computer, science, ?]","[The, previous, thread, was, finally, archived...","[new, programming, computer, science]","[previous, thread, finally, archived, 500, com...","[[(new, JJ)], [(programming, VBG)], [(computer...","[[(previous, JJ)], [(thread, NN)], [(finally, ..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, sub, computer, recommendation,...","[tech, support, rtechsupport, computer, recomm...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[[(tech, NN)], [(support, NN)], [(rtechsupport..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,[Ways to learn higher level math through CS?],"[I'm a university student, and while I've take...","[Ways, to, learn, higher, level, math, through...","[I, 'm, a, university, student, ,, and, while,...","[ways, learn, higher, level, math, cs]","[university, student, taken, math, classes, la...","[[(ways, NNS)], [(learn, NN)], [(higher, JJR)]...","[[(university, NN)], [(student, NN)], [(taken,..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,[Why is there nothing between 8bit and 16bit?],"[For example, if i want to change the image sp...","[Why, is, there, nothing, between, 8bit, and, ...","[For, example, ,, if, i, want, to, change, the...","[nothing, 8bit, 16bit]","[example, want, change, image, specifications,...","[[(nothing, NN)], [(8bit, CD)], [(16bit, CD)]]","[[(example, NN)], [(want, NN)], [(change, NN)]..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,[Why is Internet/transmission speed in Bit and...,"[I am always wondering about this., Why Intern...","[Why, is, Internet/transmission, speed, in, Bi...","[I, am, always, wondering, about, this, .]","[internettransmission, speed, bit, storage, ca...","[always, wondering]","[[(internettransmission, NN)], [(speed, NN)], ...","[[(always, RB)], [(wondering, VBG)]]"


##### Lemming

In [12]:
def get_lemma(word_tags):
    '''Reduce the words to their base word (lemma) by using a lexicon'''
    wordnet_lemmatizer = WordNetLemmatizer()
    lemma = []
    for element in word_tags:
        word = element[0][0]
        pos = element[0][1]
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ, # Mapping NLTK POS tags to WordNet POS tags
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

        wordnet_pos = tag_dict.get(tag, wordnet.NOUN)
        lemma.append(wordnet_lemmatizer.lemmatize(word, wordnet_pos))
    return(lemma)

In [13]:
df_posts['title_lem'] = df_posts['title_tag'].apply(get_lemma)
df_posts['body_lem'] = df_posts['body_tag'].apply(get_lemma)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_word_tokens,body_word_tokens,title_filtered,body_filtered,title_tag,body_tag,title_lem,body_lem
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,"[New, to, programming, or, computer, science, ?]","[The, previous, thread, was, finally, archived...","[new, programming, computer, science]","[previous, thread, finally, archived, 500, com...","[[(new, JJ)], [(programming, VBG)], [(computer...","[[(previous, JJ)], [(thread, NN)], [(finally, ...","[new, program, computer, science]","[previous, thread, finally, archive, 500, comm..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,"[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, sub, computer, recommendation,...","[tech, support, rtechsupport, computer, recomm...","[[(tech, NN)], [(support, NN)], [(sub, NN)], [...","[[(tech, NN)], [(support, NN)], [(rtechsupport...","[tech, support, sub, computer, recommendation,...","[tech, support, rtechsupport, computer, recomm..."
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0,[Ways to learn higher level math through CS?],"[I'm a university student, and while I've take...","[Ways, to, learn, higher, level, math, through...","[I, 'm, a, university, student, ,, and, while,...","[ways, learn, higher, level, math, cs]","[university, student, taken, math, classes, la...","[[(ways, NNS)], [(learn, NN)], [(higher, JJR)]...","[[(university, NN)], [(student, NN)], [(taken,...","[way, learn, high, level, math, c]","[university, student, take, math, class, la, m..."
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,[Why is there nothing between 8bit and 16bit?],"[For example, if i want to change the image sp...","[Why, is, there, nothing, between, 8bit, and, ...","[For, example, ,, if, i, want, to, change, the...","[nothing, 8bit, 16bit]","[example, want, change, image, specifications,...","[[(nothing, NN)], [(8bit, CD)], [(16bit, CD)]]","[[(example, NN)], [(want, NN)], [(change, NN)]...","[nothing, 8bit, 16bit]","[example, want, change, image, specification, ..."
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,[Why is Internet/transmission speed in Bit and...,"[I am always wondering about this., Why Intern...","[Why, is, Internet/transmission, speed, in, Bi...","[I, am, always, wondering, about, this, .]","[internettransmission, speed, bit, storage, ca...","[always, wondering]","[[(internettransmission, NN)], [(speed, NN)], ...","[[(always, RB)], [(wondering, VBG)]]","[internettransmission, speed, bit, storage, ca...","[always, wonder]"


# Read the comments file

In [None]:
df_comments = pd.read_csv("../data/raw/computerscience_comments.csv")
df_comments.head()

# Merge the df comments and posts together

In [None]:
df_merge = df_posts.merge(df_comments, how='outer', left_on='post_id', right_on='post_id')
df_merge.head()

In [None]:
df_merge.comment.isna().sum()