In [1]:
import pandas as pd
import spacy
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
from src.features.preprocess import PreProcess
from bertopic import BERTopic

In [2]:
world_posts = pd.read_csv('../data/raw/computerscience_posts.csv')
world_comments = pd.read_csv('../data/raw/computerscience_comments.csv')

In [3]:
world_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


In [4]:
combine_df = world_posts.merge(world_comments, on='post_id', how='left')

In [5]:
combine_df

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,comment
0,n2n0ax,New to programming or computer science? Want a...,375,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1.619890e+09,How late is too late to start a career in prog...
1,n2n0ax,New to programming or computer science? Want a...,375,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1.619890e+09,I am a freshman at a university and haven't be...
2,n2n0ax,New to programming or computer science? Want a...,375,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1.619890e+09,I'm still in highschool but really interested ...
3,n2n0ax,New to programming or computer science? Want a...,375,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1.619890e+09,"This is probably a common question, but how we..."
4,n2n0ax,New to programming or computer science? Want a...,375,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1.619890e+09,I am planning on starting a CS major this fall...
...,...,...,...,...,...,...,...,...,...,...
3219,myc3u1,Good resources for basic understanding the bas...,99,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,21,Hey guys. \n\n\nI've programmed for a few ye...,1.619369e+09,Look for Ben Eater on YouTube if you're lookin...
3220,myc3u1,Good resources for basic understanding the bas...,99,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,21,Hey guys. \n\n\nI've programmed for a few ye...,1.619369e+09,"I would recommend ""Structured Computer Organiz..."
3221,myc3u1,Good resources for basic understanding the bas...,99,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,21,Hey guys. \n\n\nI've programmed for a few ye...,1.619369e+09,"Just to give you a very brief overview, there ..."
3222,myc3u1,Good resources for basic understanding the bas...,99,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,21,Hey guys. \n\n\nI've programmed for a few ye...,1.619369e+09,Thank you for this amazing list of resources :...


In [6]:
combine_df['post_id'].nunique()

492

In [7]:
corpus = list(world_posts['title']) + list(world_posts['body']) + list(world_comments['comment'])
corpus_id = list(world_posts['post_id']) + list(world_posts['post_id']) + list(world_comments['post_id'])

In [8]:
len(corpus_id)

4163

In [9]:
corpus_df = pd.DataFrame({'text': corpus, 'post_id': corpus_id})

In [10]:
corpus_df['text'] = corpus_df['text'].astype(str)

In [11]:
corpus_df

Unnamed: 0,text,post_id
0,New to programming or computer science? Want a...,n2n0ax
1,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,qb4bof
2,Ways to learn higher level math through CS?,s8afrp
3,Why is there nothing between 8bit and 16bit?,s7yrsa
4,Why is Internet/transmission speed in Bit and ...,s7li8v
...,...,...
4158,Look for Ben Eater on YouTube if you're lookin...,myc3u1
4159,"I would recommend ""Structured Computer Organiz...",myc3u1
4160,"Just to give you a very brief overview, there ...",myc3u1
4161,Thank you for this amazing list of resources :...,myc3u1


In [12]:
preprocessor = PreProcess()

preprocessed_df = preprocessor.preprocess(world_posts, 'body', lemm=True)
preprocessed_df = preprocessor.token_to_str(preprocessed_df, 'body')
preprocessed_df.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,body_word_token,body_tag,body_untokenized
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0,"[previous, thread, finally, archive, 500, comm...","[[(previous, JJ)], [(thread, NN)], [(finally, ...",previous thread finally archive 500 comment reply
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupportComputer Recommend...,1634619000.0,"[tech, support, rtechsupportcomputer, recommen...","[[(tech, NN)], [(support, NN)], [(rtechsupport...",tech support rtechsupportcomputer recommendati...
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I am a university student, and while I have ta...",1642654000.0,"[university, student, take, math, class, la, m...","[[(university, NN)], [(student, NN)], [(taken,...",university student take math class la multivar...
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0,"[example, want, change, image, specification, ...","[[(example, NN)], [(want, NN)], [(change, NN)]...",example want change image specification photos...
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0,"[always, wonder]","[[(always, RB)], [(wondering, VBG)]]",always wonder


In [13]:
def train_bert(df, column):
    topic_model = BERTopic(language="english", embedding_model="paraphrase-multilingual-mpnet-base-v2", min_topic_size=3, calculate_probabilities=True, verbose=True)
    topics, probs = topic_model.fit_transform(df[column])
    freq = topic_model.get_topic_info()
    docs = topic_model.get_representative_docs()
    topic_names = []
    for i in topics:
        topic_names.append(list(freq[freq['Topic'] == i]["Name"])[0])
    df['topic_label'] = topics
    df['topic_name'] = topic_names
    sia = SentimentIntensityAnalyzer()
    neg_scores = []
    neu_scores = []
    pos_scores = []
    compounds = []
    for i in range(len(df.index)):
        neg_scores.append(sia.polarity_scores(df.at[i, column])['neg'])
        neu_scores.append(sia.polarity_scores(df.at[i, column])['neu'])
        pos_scores.append(sia.polarity_scores(df.at[i, column])['pos'])
        compounds.append(sia.polarity_scores(df.at[i, column])['compound'])
    df['neg_sentiment'] = neg_scores
    df['neu_sentiment'] = neu_scores
    df['pos_sentiment'] = pos_scores
    df['compound_sentiment'] = compounds
    return topics, probs, topic_model, freq, df

# Notes
How should we think about this problem? Should topics be extracted one-by-one from each post? (This actually doesn't work using BERT as it wants to find multiple topics when learning) Or, the entire subreddit? (Much more manageable using BERT, but with some caveats). Do we want to include comments in topic modeling? This is also feasible, but comments in different threads can be labeled in the same topic, which feels strange to me. In any case, what's presented here is a view of the entire subreddit.

In [16]:
topic, probs, topic_model, freq, topic_df = train_bert(corpus_df, 'text')

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

2022-02-01 12:53:53,528 - BERTopic - Transformed documents to Embeddings


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2022-02-01 12:54:20,081 - BERTopic - Reduced dimensionality with UMAP
2022-02-01 12:54:33,180 - BERTopic - Clustered UMAP embeddings with HDBSCAN


LookupError: 
**********************************************************************
  Resource [93mvader_lexicon[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('vader_lexicon')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93msentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt[0m

  Searched in:
    - '/home/andy/nltk_data'
    - '/home/andy/anaconda3/envs/reddit/nltk_data'
    - '/home/andy/anaconda3/envs/reddit/share/nltk_data'
    - '/home/andy/anaconda3/envs/reddit/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
topic_df

In [None]:
pd.set_option('display.max_rows', None)
freq

In [None]:
topic_nr = freq.iloc[3]["Topic"]
topic_model.get_topic(topic_nr)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_df['pos_sentiment'].hist()

In [None]:
topic_df[topic_df['post_id'] == 'n2n0ax']

In [None]:
final_df = topic_df[['topic_label', 'topic_name', 'neg_sentiment', 'neu_sentiment', 'pos_sentiment', 'compound_sentiment']]

In [None]:
pd.set_option('display.max_rows', None)
final_df.groupby(by=['topic_label', 'topic_name']).mean().reset_index()