In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
# Fetch the NLTK resources used below. The recorded output shows both packages
# reporting "already up-to-date" when they are present locally.
nltk.download('punkt')      # presumably the tokenizer models behind sent_tokenize/word_tokenize — confirm
nltk.download('stopwords')  # presumably the word lists behind nltk.corpus.stopwords — confirm
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saimuktevi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saimuktevi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing

In [2]:
# Load the raw subreddit post dump and preview the first rows.
# NOTE(review): the recorded preview contains an "Unnamed: 0" column, which
# suggests the CSV was written with its index; consider index_col=0 — confirm.
posts_csv = "../data/raw/computerscience_posts.csv"
df_posts = pd.read_csv(posts_csv)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,380,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1055,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,410,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,sbw98k,How do general compression algorithms approach...,17,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,7,"For example, if one had a source file containi...",1643058000.0
3,sbxbya,Max number of parallel http requests,0,0.44,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"Hi all, is there a way to know/calculate how m...",1643060000.0
4,saqw7i,Human Brain Cells From Petri Dishes Learn to P...,212,0.99,computerscience,https://science-news.co/human-brain-cells-from...,26,,1642934000.0


In [3]:
# Sentence-tokenize both free-text columns. Missing values are replaced with ''
# first so that sent_tokenize always receives a string.
sentence_token_columns = {'title': 'title_token', 'body': 'body_token'}
for text_column, token_column in sentence_token_columns.items():
    df_posts[text_column] = df_posts[text_column].fillna('')
    df_posts[token_column] = df_posts[text_column].apply(sent_tokenize)

# Placeholder column created up front; the recorded output of the later
# preprocessing cell shows it holding the filtered title tokens.
df_posts['title_filtered'] = " "
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_filtered
0,n2n0ax,New to programming or computer science? Want a...,380,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1055,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,410,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,
2,sbw98k,How do general compression algorithms approach...,17,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,7,"For example, if one had a source file containi...",1643058000.0,[How do general compression algorithms approac...,"[For example, if one had a source file contain...",
3,sbxbya,Max number of parallel http requests,0,0.44,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"Hi all, is there a way to know/calculate how m...",1643060000.0,[Max number of parallel http requests],"[Hi all, is there a way to know/calculate how ...",
4,saqw7i,Human Brain Cells From Petri Dishes Learn to P...,212,0.99,computerscience,https://science-news.co/human-brain-cells-from...,26,,1642934000.0,[Human Brain Cells From Petri Dishes Learn to ...,[],


In [4]:
# Load the raw comment dump (one row per comment, keyed by post_id) and preview it.
comments_csv = "../data/raw/computerscience_comments.csv"
df_comments = pd.read_csv(comments_csv)
df_comments.head()

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [5]:
# Run the project's preprocessing over both text columns of the posts frame.
# The call's return value is not used; judging by the recorded output it adds
# <column>_word_token / <column>_filtered / <column>_stem to df_posts in place.
for text_column in ('title', 'body'):
    PreProcess.preprocess(df_posts, text_column)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_token,body_token,title_filtered,title_word_token,title_stem,body_word_token,body_filtered,body_stem
0,n2n0ax,New to programming or computer science? Want a...,380,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1055,The previous thread was finally archived with ...,1619890000.0,"[New to programming or computer science?, Want...",[The previous thread was finally archived with...,"[new, programming, computer, science]","[New, to, programming, or, computer, science, ?]","[new, program, comput, scienc]","[The, previous, thread, was, finally, archived...","[previous, thread, finally, archived, 500, com...","[previou, thread, final, archiv, 500, comment,..."
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,410,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0,[THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER ...,[Tech Support: /r/techsupport\n\nComputer Reco...,"[tech, support, sub, computer, recommendation,...","[THIS, IS, NOT, A, TECH, SUPPORT, SUB, OR, A, ...","[tech, support, sub, comput, recommend, sub]","[Tech, Support, :, /r/techsupport, Computer, R...","[tech, support, rtechsupport, computer, recomm...","[tech, support, rtechsupport, comput, recommen..."
2,sbw98k,How do general compression algorithms approach...,17,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,7,"For example, if one had a source file containi...",1643058000.0,[How do general compression algorithms approac...,"[For example, if one had a source file contain...","[general, compression, algorithms, approach, d...","[How, do, general, compression, algorithms, ap...","[gener, compress, algorithm, approach, data, r...","[For, example, ,, if, one, had, a, source, fil...","[example, one, source, file, containing, 100, ...","[exampl, one, sourc, file, contain, 100, trans..."
3,sbxbya,Max number of parallel http requests,0,0.44,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"Hi all, is there a way to know/calculate how m...",1643060000.0,[Max number of parallel http requests],"[Hi all, is there a way to know/calculate how ...","[max, number, parallel, http, requests]","[Max, number, of, parallel, http, requests]","[max, number, parallel, http, request]","[Hi, all, ,, is, there, a, way, to, know/calcu...","[hi, way, knowcalculate, many, parallel, http,...","[hi, way, knowcalcul, mani, parallel, http, re..."
4,saqw7i,Human Brain Cells From Petri Dishes Learn to P...,212,0.99,computerscience,https://science-news.co/human-brain-cells-from...,26,,1642934000.0,[Human Brain Cells From Petri Dishes Learn to ...,[],"[human, brain, cells, petri, dishes, learn, pl...","[Human, Brain, Cells, From, Petri, Dishes, Lea...","[human, brain, cell, petri, dish, learn, play,...",[],[],[]


In [6]:
# Corpus-level statistics over the filtered post-body tokens.
# The "tweet" wording in the names and message is kept as-is; the rows here are
# subreddit post bodies, not tweets.
body_token_lists = df_posts['body_filtered']
all_words = list(itertools.chain.from_iterable(body_token_lists))
tweet_lengths = list(map(len, body_token_lists))
vocab = sorted(set(all_words))

totals_template = '{} words total, with a vocabulary size of {}'
longest_template = 'Max tweet length is {}'
print(totals_template.format(len(all_words), len(vocab)))
print(longest_template.format(max(tweet_lengths)))

4419 words total, with a vocabulary size of 1946
Max tweet length is 72


In [7]:
# Flatten the per-post token lists into one list and show the 30 most frequent tokens.
flat_words = []
for token_list in df_posts['body_filtered']:
    flat_words.extend(token_list)

word_freq = FreqDist(flat_words)
word_freq.most_common(30)

[('computer', 46),
 ('like', 37),
 ('know', 35),
 ('hello', 32),
 ('nt', 30),
 ('science', 29),
 ('hi', 28),
 ('n', 28),
 ('time', 25),
 ('learning', 23),
 ('would', 22),
 ('question', 22),
 ('algorithm', 22),
 ('software', 20),
 ('2', 20),
 ('data', 18),
 ('recently', 18),
 ('book', 17),
 ('want', 17),
 ('programming', 17),
 ('understand', 17),
 ('code', 17),
 ('bit', 16),
 ('could', 16),
 ('people', 15),
 ('memory', 15),
 ('get', 15),
 ('years', 15),
 ('algorithms', 15),
 ('one', 14)]

In [8]:
# The comments frame has a single text column ('comment') and no title,
# so only that column goes through the preprocessing step.
comment_column = 'comment'
PreProcess.preprocess(df_comments, comment_column)
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,n2n0ax,How late is too late to start a career in prog...,"[How, late, is, too, late, to, start, a, caree...","[late, late, start, career, programming]","[late, late, start, career, program]"
1,n2n0ax,I am a freshman at a university and haven't be...,"[I, am, a, freshman, at, a, university, and, h...","[freshman, university, nt, able, work, side, p...","[freshman, univers, nt, abl, work, side, proje..."
2,n2n0ax,I'm still in highschool but really interested ...,"[I, 'm, still, in, highschool, but, really, in...","[still, highschool, really, interested, comput...","[still, highschool, realli, interest, comput, ..."
3,n2n0ax,"This is probably a common question, but how we...","[This, is, probably, a, common, question, ,, b...","[probably, common, question, well, coding, boo...","[probabl, common, question, well, code, bootca..."
4,n2n0ax,I am planning on starting a CS major this fall...,"[I, am, planning, on, starting, a, CS, major, ...","[planning, starting, cs, major, fall]","[plan, start, cs, major, fall]"


In [9]:
# Corpus-level statistics over the filtered comment tokens, gathered in one pass.
# The "tweet" wording is kept as-is; the rows here are subreddit comments.
all_words = []
tweet_lengths = []
for comment_tokens in df_comments['comment_filtered']:
    all_words.extend(comment_tokens)
    tweet_lengths.append(len(comment_tokens))
vocab = sorted(set(all_words))

totals_template = '{} words total, with a vocabulary size of {}'
longest_template = 'Max tweet length is {}'
print(totals_template.format(len(all_words), len(vocab)))
print(longest_template.format(max(tweet_lengths)))

24142 words total, with a vocabulary size of 5828
Max tweet length is 124


In [10]:
# Flatten the per-comment token lists and show the 30 most frequent tokens.
comment_token_lists = df_comments['comment_filtered']
flat_words = list(itertools.chain.from_iterable(comment_token_lists))
word_freq = FreqDist(flat_words)
word_freq.most_common(30)

[('computer', 317),
 ('like', 205),
 ('nt', 183),
 ('science', 181),
 ('would', 147),
 ('programming', 134),
 ('data', 127),
 ('code', 125),
 ('think', 125),
 ('good', 112),
 ('one', 108),
 ('want', 97),
 ('use', 97),
 ('know', 96),
 ('get', 94),
 ('really', 91),
 ('time', 91),
 ('software', 90),
 ('cs', 84),
 ('need', 80),
 ('1', 80),
 ('math', 78),
 ('first', 76),
 ('n', 76),
 ('book', 74),
 ('language', 74),
 ('way', 73),
 ('learning', 72),
 ('c', 71),
 ('learn', 69)]

# Topic Extraction with BERTopic

In [11]:
# One-time environment setup, left commented out so re-running the notebook does
# not reinstall the package. Uncomment only if bertopic is missing.
# !pip install bertopic

Collecting bertopic
  Using cached bertopic-0.9.4-py2.py3-none-any.whl (57 kB)
Collecting hdbscan>=0.8.27
  Using cached hdbscan-0.8.27.tar.gz (6.4 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting umap-learn>=0.5.0
  Using cached umap-learn-0.5.2.tar.gz (86 kB)
Collecting plotly>=4.7.0
  Using cached plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
Collecting sentence-transformers>=0.4.1
  Using cached sentence-transformers-2.1.0.tar.gz (78 kB)
Collecting cython>=0.27
  Using cached Cython-0.29.27-py2.py3-none-any.whl (983 kB)
Collecting tenacity>=6.2.0
  Using cached tenacity-8.0.1-py3-none-any.whl (24 kB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.16.1-py3-none-any.whl (3.5 MB)
Collecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp39-cp39-macosx_10_11_x86_64.whl (3.7 MB)
[K     |████████████████████████████████| 3.7 MB 4.1 MB/s

In [12]:
# NOTE(review): the recorded run of this import failed with
# "ImportError: Numba needs NumPy 1.21 or less" — the environment's NumPy
# presumably has to be pinned to a Numba-compatible version before this works; confirm.
from bertopic import BERTopic

ImportError: Numba needs NumPy 1.21 or less

In [None]:
# Fit a BERTopic model on the post titles.
#
# The documents are read from `df_posts`: no cell in this notebook defines a
# `preprocessed_df`, and `PreProcess.preprocess` is called without using its
# return value, so its results are columns on the frame passed to it.
# The 'title' column holds the raw title strings, with missing values already
# replaced by '' in the tokenization cell.
docs = df_posts['title'].tolist()

topic_model = BERTopic(
    language="english",
    embedding_model="paraphrase-multilingual-mpnet-base-v2",
    min_topic_size=3,             # allow small clusters to form their own topic
    calculate_probabilities=True,
    verbose=True,
)

# topics: one topic id per document; probs: the topic probabilities requested
# via calculate_probabilities=True.
topics, probs = topic_model.fit_transform(docs)