In [20]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [3]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [25]:
astro_comm_df = pd.read_csv('./datasets/eda_astro_comment.csv')
astro_post_df = pd.read_csv('./datasets/eda_astro_post.csv')
quantum_comm_df = pd.read_csv('./datasets/eda_quantum_comment.csv')
quantum_post_df = pd.read_csv('./datasets/eda_quantum_post.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

In [26]:
astro_comm_df.drop(columns=['Unnamed: 0','created_utc','author','subreddit'],inplace=True)
astro_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40
3,oh okay thanks for clearing that.,Astrophysics,Comment,6
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24
...,...,...,...,...
9476,You can make similar conclusions about redshif...,Astrophysics,Comment,53
9477,Trust me -- you can usually busk it. Depends o...,Astrophysics,Comment,43
9478,Light has a frequency that can be measured. Wh...,Astrophysics,Comment,208
9479,The interplay between story details and the ph...,Astrophysics,Comment,48


In [27]:
quantum_comm_df.drop(columns=['Unnamed: 0','created_utc','author','subreddit'],inplace=True)
quantum_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Related question: there are cloud-based quantu...,Quantum,Comment,89
1,"3-SAT is NP-complete, and we don't expect to b...",Quantum,Comment,98
2,It is currently conjectured that quantum compu...,Quantum,Comment,34
3,"serious question, since i know nothing about t...",Quantum,Comment,57
4,"This was really an excellent podcast, thanks!",Quantum,Comment,7
...,...,...,...,...
9751,[https://www.lanl.gov/projects/national-securi...,Quantum,Comment,1
9752,"Yeah, just thought that post was particularly ...",Quantum,Comment,33
9753,‘Internships’ are rarely advertised. Find prof...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [28]:
astro_comm_df['body'] = astro_comm_df['body'].replace('[^\w ]','',regex=True).astype(str)
astro_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,gt 760 mphJust to clarify thats the speed of s...,Astrophysics,Comment,19
2,Threw this git repo together httpsgithubcomSha...,Astrophysics,Comment,40
3,oh okay thanks for clearing that,Astrophysics,Comment,6
4,again the movement needs to propagate through ...,Astrophysics,Comment,24
...,...,...,...,...
9476,You can make similar conclusions about redshif...,Astrophysics,Comment,53
9477,Trust me you can usually busk it Depends on h...,Astrophysics,Comment,43
9478,Light has a frequency that can be measured Wha...,Astrophysics,Comment,208
9479,The interplay between story details and the ph...,Astrophysics,Comment,48


In [29]:
quantum_comm_df['body'] = quantum_comm_df['body'].replace('[^\w ]','',regex=True).astype(str)
quantum_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Related question there are cloudbased quantum ...,Quantum,Comment,89
1,3SAT is NPcomplete and we dont expect to be ab...,Quantum,Comment,98
2,It is currently conjectured that quantum compu...,Quantum,Comment,34
3,serious question since i know nothing about th...,Quantum,Comment,57
4,This was really an excellent podcast thanks,Quantum,Comment,7
...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,Quantum,Comment,1
9752,Yeah just thought that post was particularly f...,Quantum,Comment,33
9753,Internships are rarely advertised Find profess...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [30]:
all_comments_df = pd.concat([astro_comm_df,quantum_comm_df])
all_comments_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,gt 760 mphJust to clarify thats the speed of s...,Astrophysics,Comment,19
2,Threw this git repo together httpsgithubcomSha...,Astrophysics,Comment,40
3,oh okay thanks for clearing that,Astrophysics,Comment,6
4,again the movement needs to propagate through ...,Astrophysics,Comment,24
...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,Quantum,Comment,1
9752,Yeah just thought that post was particularly f...,Quantum,Comment,33
9753,Internships are rarely advertised Find profess...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [31]:
all_comments_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_comments_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,1,Comment,67
1,gt 760 mphJust to clarify thats the speed of s...,1,Comment,19
2,Threw this git repo together httpsgithubcomSha...,1,Comment,40
3,oh okay thanks for clearing that,1,Comment,6
4,again the movement needs to propagate through ...,1,Comment,24
...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,0,Comment,1
9752,Yeah just thought that post was particularly f...,0,Comment,33
9753,Internships are rarely advertised Find profess...,0,Comment,69
9754,I bet the quantum internet people at TUDelf an...,0,Comment,23


#### Set up everything

In [32]:
# X and y
X = all_comments_df['body']
y = all_comments_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [33]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer()

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [35]:
len(cvec.get_feature_names())

34153

In [38]:
# trying a logreg model

# 1 - instantiate model
logreg = LogisticRegression(max_iter=1000)

# 2 - fit model
logreg.fit(X_train_cvec,y_train)

LogisticRegression(max_iter=1000)

In [39]:
logreg.predict(X_train_cvec)

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [40]:
logreg.score(X_train_cvec,y_train)

0.9507174048658765

In [41]:
logreg.score(X_test_cvec,y_test)

0.817047817047817