In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [3]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [4]:
astro_post_df = pd.read_csv('./datasets/eda_astro_post.csv')
quantum_post_df = pd.read_csv('./datasets/eda_quantum_post.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

#### Dropping unnecessary columns

In [5]:
astro_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)
#astro_post_df

In [6]:
quantum_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)
#quantum_post_df

In [7]:
all_post_df = pd.concat([astro_post_df,quantum_post_df])
#all_post_df

In [8]:
all_post_df = all_post_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count','author'])
#all_post_df

In [11]:
all_post_df['combo_text'] = all_post_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics removed,Astrophysics,Post,8,Previous-Ad-8137
1,Msc Astrophy after Bsc Biophysics removed,Astrophysics,Post,6,Previous-Ad-8137
2,Not an astrophysicist But say I have a polesti...,Astrophysics,Post,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,Astrophysics,Post,10,NiklasFiedler
4,Light bending around a black hole Ive been try...,Astrophysics,Post,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18,gravitypushes
4492,Quick stupid question struggling with the conc...,Quantum,Post,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,Quantum,Post,70,akiel123


In [12]:
all_post_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics removed,1,Post,8,Previous-Ad-8137
1,Msc Astrophy after Bsc Biophysics removed,1,Post,6,Previous-Ad-8137
2,Not an astrophysicist But say I have a polesti...,1,Post,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,1,Post,10,NiklasFiedler
4,Light bending around a black hole Ive been try...,1,Post,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,0,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,0,Post,18,gravitypushes
4492,Quick stupid question struggling with the conc...,0,Post,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,0,Post,70,akiel123


In [14]:
all_post_df['Post_comment'].replace({'Post':1,'Comment':0},inplace=True)
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics removed,1,1,8,Previous-Ad-8137
1,Msc Astrophy after Bsc Biophysics removed,1,1,6,Previous-Ad-8137
2,Not an astrophysicist But say I have a polesti...,1,1,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,1,1,10,NiklasFiedler
4,Light bending around a black hole Ive been try...,1,1,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,0,1,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,0,1,18,gravitypushes
4492,Quick stupid question struggling with the conc...,0,1,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,0,1,70,akiel123


In [16]:
all_post_df['Post_comment'].value_counts()

1    10491
Name: Post_comment, dtype: int64

#### Set up everything

In [17]:
# X and y
X = all_post_df['combo_text']
y = all_post_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [18]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer(min_df=2)

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [19]:
X_train_df = pd.DataFrame(X_train_cvec.todense(), columns=cvec.get_feature_names())

In [20]:
X_test_df = pd.DataFrame(X_test_cvec.todense(),columns=cvec.get_feature_names())

In [21]:
X_train_df

Unnamed: 0,00,000,0000000x,00gt,01,011,012,01gt,02,031,...,zealand,zero,zodiac,zone,zones,zoo,zoom,zubrin,zurich,แทงบอลออนไลน
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7866,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
rf = RandomForestClassifier(n_estimators=20)

In [23]:
cross_val_score(rf,X_train_df,y_train)

array([0.94536213, 0.9339263 , 0.93011436, 0.93324857, 0.92879847])

In [24]:
et = ExtraTreesClassifier(n_estimators=10)

In [25]:
cross_val_score(et,X_train_df,y_train)

array([0.93074968, 0.92121982, 0.91867853, 0.93197711, 0.92689129])

In [32]:
# params and grid search 
rf_params = {
    'n_estimators': [10,20,5,6,7],
    'max_depth': [None,1,2,3,4,5]
}

gs = GridSearchCV(rf,param_grid=rf_params,cv=5)
gs.fit(X_train_df,y_train)

print(gs.best_score_)
print(gs.best_params_)

0.9377220099987802
{'max_depth': None, 'n_estimators': 20}


In [33]:
gs.score(X_train_df,y_train)

0.9965683782409761

In [34]:
gs.score(X_test_df,y_test)

0.9386199008768585