In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [4]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [7]:
astro_comm_df = pd.read_csv('../datasets/eda_astro_comment.csv')
astro_post_df = pd.read_csv('../datasets/eda_astro_post.csv')
quantum_comm_df = pd.read_csv('../datasets/eda_quantum_comment.csv')
quantum_post_df = pd.read_csv('../datasets/eda_quantum_post.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

In [8]:
astro_comm_df.head(2)

Unnamed: 0.1,Unnamed: 0,body,created_utc,subreddit,author,Category,Post_comment,word_count
0,0,Your mum says your pole is more like 2 inches ...,1617222727,astrophysics,moon-worshiper,Astrophysics,Comment,67
1,1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",1617220516,astrophysics,Lewri,Astrophysics,Comment,19


In [9]:
#dropping columns for all DataFrames, uniformly
astro_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit'],inplace=True)
quantum_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit'],inplace=True)
astro_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)
quantum_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)

In [10]:
#combining all "Comments" into one DataFrame
all_comm_df = pd.concat([astro_comm_df,quantum_comm_df])

#combining all "Posts" together
all_post_df = pd.concat([astro_post_df,quantum_post_df])

#renaming the column header from 'body' to 'combo_text' to be identical
all_comm_df = all_comm_df.rename(columns={'body':'combo_text'})

# rearranging order of the columns to be identical
all_post_df = all_post_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count','author'])
all_comm_df = all_comm_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count','author'])

In [11]:
#combining "Posts" and "Comments" into one big "Text" DataFrame
all_text_df = pd.concat([all_comm_df,all_post_df])
all_text_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67,moon-worshiper
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19,Lewri
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40,physmathastro
3,oh okay thanks for clearing that.,Astrophysics,Comment,6,AryanPandey
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24,Lewri
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18,gravitypushes
4492,"Quick stupid question, struggling with the con...",Quantum,Post,114,Quantuum
4493,Quantum Computing Game Play this game. (link i...,Quantum,Post,70,akiel123


#### Cleaning text, binarizing categories

In [13]:
#this nifty piece of code cleans out a lot of punctuation, etc. from text
#but look at Samantha Chu's advice above and make your own best choice for cleaning text data
all_text_df['combo_text'] = all_text_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
#all_text_df

#binarizing categories
all_text_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_text_df['Post_comment'].replace({'Post':1,'Comment':0},inplace=True)
all_text_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Your mum says your pole is more like 2 inches ...,1,0,67,moon-worshiper
1,gt 760 mphJust to clarify thats the speed of s...,1,0,19,Lewri
2,Threw this git repo together httpsgithubcomSha...,1,0,40,physmathastro
3,oh okay thanks for clearing that,1,0,6,AryanPandey
4,again the movement needs to propagate through ...,1,0,24,Lewri
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,0,1,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,0,1,18,gravitypushes
4492,Quick stupid question struggling with the conc...,0,1,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,0,1,70,akiel123


In [15]:
#setting a baseline
print(all_text_df['Category'].value_counts(normalize=True))
print(all_text_df['Post_comment'].value_counts(normalize=True))

1    0.52062
0    0.47938
Name: Category, dtype: float64
0    0.6471
1    0.3529
Name: Post_comment, dtype: float64


A **52%** baseline expectation to choose *Astrophysics*
A **65%** baseline expectation to choose *Posts*

#### Set up everything

In [16]:
# X and y
X = all_text_df['combo_text']
y = all_text_df['Post_comment']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [17]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer(min_df=2)

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [18]:
len(cvec.get_feature_names())

16659

In [19]:
#import the DTC
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=20)
dtc.fit(X_train_cvec,y_train)

# score the model
print(f'DecisionTreeClassifier training score: {dtc.score(X_train_cvec,y_train)}')
print(f'DecisionTreeClassifier testing score: {dtc.score(X_test_cvec,y_test)}')

DecisionTreeClassifier training score: 0.9344725511302476
DecisionTreeClassifier testing score: 0.8240043057050592


In [20]:
#import the ABC
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=200)
abc.fit(X_train_cvec,y_train)

print(f'AdaBoost Classifier training score: {abc.score(X_train_cvec,y_train)}')
print(f'AdaBoost Classifier testing score: {abc.score(X_test_cvec,y_test)}')

AdaBoost Classifier training score: 0.8794402583423035
AdaBoost Classifier testing score: 0.875
