In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [2]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [3]:
astro_comm_df = pd.read_csv('./datasets/eda_astro_comment.csv')
quantum_comm_df = pd.read_csv('./datasets/eda_quantum_comment.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

In [4]:
astro_comm_df.head(2)

Unnamed: 0.1,Unnamed: 0,body,created_utc,subreddit,author,Category,Post_comment,word_count
0,0,Your mum says your pole is more like 2 inches ...,1617222727,astrophysics,moon-worshiper,Astrophysics,Comment,67
1,1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",1617220516,astrophysics,Lewri,Astrophysics,Comment,19


#### Dropping unnecessary columns

In [5]:
astro_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit'],inplace=True)
#astro_comm_df

In [6]:
quantum_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit'],inplace=True)
#quantum_comm_df

In [9]:
all_comm_df = pd.concat([astro_comm_df,quantum_comm_df])
#all_comm_df

In [12]:
all_comm_df = all_comm_df.rename(columns={'body':'combo_text'})
#all_comm_df

In [14]:
all_comm_df = all_comm_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count','author'])
#all_comm_df

In [16]:
all_comm_df['combo_text'] = all_comm_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
all_comm_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67,moon-worshiper
1,gt 760 mphJust to clarify thats the speed of s...,Astrophysics,Comment,19,Lewri
2,Threw this git repo together httpsgithubcomSha...,Astrophysics,Comment,40,physmathastro
3,oh okay thanks for clearing that,Astrophysics,Comment,6,AryanPandey
4,again the movement needs to propagate through ...,Astrophysics,Comment,24,Lewri
...,...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,Quantum,Comment,1,WorriedPurpose
9752,Yeah just thought that post was particularly f...,Quantum,Comment,33,SaltKick2
9753,Internships are rarely advertised Find profess...,Quantum,Comment,69,youngeverest
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23,rrtucci


In [17]:
all_comm_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_comm_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Your mum says your pole is more like 2 inches ...,1,Comment,67,moon-worshiper
1,gt 760 mphJust to clarify thats the speed of s...,1,Comment,19,Lewri
2,Threw this git repo together httpsgithubcomSha...,1,Comment,40,physmathastro
3,oh okay thanks for clearing that,1,Comment,6,AryanPandey
4,again the movement needs to propagate through ...,1,Comment,24,Lewri
...,...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,0,Comment,1,WorriedPurpose
9752,Yeah just thought that post was particularly f...,0,Comment,33,SaltKick2
9753,Internships are rarely advertised Find profess...,0,Comment,69,youngeverest
9754,I bet the quantum internet people at TUDelf an...,0,Comment,23,rrtucci


In [18]:
all_comm_df['Post_comment'].replace({'Post':1,'Comment':0},inplace=True)
all_comm_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Your mum says your pole is more like 2 inches ...,1,0,67,moon-worshiper
1,gt 760 mphJust to clarify thats the speed of s...,1,0,19,Lewri
2,Threw this git repo together httpsgithubcomSha...,1,0,40,physmathastro
3,oh okay thanks for clearing that,1,0,6,AryanPandey
4,again the movement needs to propagate through ...,1,0,24,Lewri
...,...,...,...,...,...
9751,httpswwwlanlgovprojectsnationalsecurityeducati...,0,0,1,WorriedPurpose
9752,Yeah just thought that post was particularly f...,0,0,33,SaltKick2
9753,Internships are rarely advertised Find profess...,0,0,69,youngeverest
9754,I bet the quantum internet people at TUDelf an...,0,0,23,rrtucci


#### Set up everything

In [19]:
# X and y
X = all_comm_df['combo_text']
y = all_comm_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [20]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer(min_df=2)

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [21]:
X_train_df = pd.DataFrame(X_train_cvec.todense(), columns=cvec.get_feature_names())

In [22]:
X_test_df = pd.DataFrame(X_test_cvec.todense(),columns=cvec.get_feature_names())

In [23]:
X_train_df

Unnamed: 0,00,000,000000000050000000i,000007,0005,000gt,001,001gt,005,00gt,...,zurich,zx,α0gt,β1gt,δt,δv,λcdm,ψgt,ﾟヮﾟnice,𝓷𝓲𝓬𝓮
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
rf = RandomForestClassifier(n_estimators=20)

In [25]:
cross_val_score(rf,X_train_df,y_train)

array([0.79487179, 0.78898129, 0.7965338 , 0.78370884, 0.79064125])

In [26]:
et = ExtraTreesClassifier(n_estimators=10)

In [27]:
cross_val_score(et,X_train_df,y_train)

array([0.78239778, 0.78620929, 0.79064125, 0.76880416, 0.79237435])

In [37]:
# params and grid search 
rf_params = {
    'n_estimators': [100],
    'max_depth': [None,1,2,3,4,5]
}

gs = GridSearchCV(rf,param_grid=rf_params,cv=5)
gs.fit(X_train_df,y_train)

print(gs.best_score_)
print(gs.best_params_)

0.8040476525051915
{'max_depth': None, 'n_estimators': 100}


In [38]:
gs.score(X_train_df,y_train)

0.9850973868441117

In [39]:
gs.score(X_test_df,y_test)

0.8047817047817047