In [20]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [21]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [22]:
astro_post_df = pd.read_csv('./datasets/eda_astro_post.csv')
quantum_post_df = pd.read_csv('./datasets/eda_quantum_post.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

In [23]:
astro_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)
astro_post_df

Unnamed: 0,author,Category,Post_comment,combo_text,word_count
0,Previous-Ad-8137,Astrophysics,Post,Msc in astrophy after bsc in biophysics? [remo...,8
1,Previous-Ad-8137,Astrophysics,Post,Msc. Astrophy after Bsc. Biophysics? [removed],6
2,BlueJ5,Astrophysics,Post,"Not an astrophysicist. But, say I have a pole/...",55
3,NiklasFiedler,Astrophysics,Post,4 Tiny Missions Answering the Biggest Question...,10
4,Saashiv01,Astrophysics,Post,Light bending around a black hole I've been tr...,64
...,...,...,...,...,...
5991,woodycanuck,Astrophysics,Post,"Saw Lawrence Krauss talk last night, have some...",364
5992,JunCTionS,Astrophysics,Post,Where can I a list of dim satellite flyovers? ...,90
5993,[deleted],Astrophysics,Post,Do mathematical models for a white hole model ...,314
5994,[deleted],Astrophysics,Post,Need some help for Astrophysics/Astronomy Home...,112


In [24]:
quantum_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','subreddit'],inplace=True)
quantum_post_df

Unnamed: 0,author,Category,Post_comment,combo_text,word_count
0,Chipdoc,Quantum,Post,Sandia National Laboratories : Rare open-acces...,11
1,factSciGuy,Quantum,Post,Some facts about Quantum physics the,6
2,zctppe5,Quantum,Post,Reasoning under uncertainty with a near-term q...,9
3,grams4days,Quantum,Post,Quantum Computing Resources Hey everyone — jus...,72
4,asm-us,Quantum,Post,What's the complexity of 3-sat algorithm on qu...,43
...,...,...,...,...,...
4490,jb_1988,Quantum,Post,Questions for a beginner in quantum computing ...,87
4491,gravitypushes,Quantum,Post,How does one observe the ourput of a QC if obs...,18
4492,Quantuum,Quantum,Post,"Quick stupid question, struggling with the con...",114
4493,akiel123,Quantum,Post,Quantum Computing Game Play this game. (link i...,70


In [25]:
all_post_df = pd.concat([astro_post_df,quantum_post_df])
all_post_df

Unnamed: 0,author,Category,Post_comment,combo_text,word_count
0,Previous-Ad-8137,Astrophysics,Post,Msc in astrophy after bsc in biophysics? [remo...,8
1,Previous-Ad-8137,Astrophysics,Post,Msc. Astrophy after Bsc. Biophysics? [removed],6
2,BlueJ5,Astrophysics,Post,"Not an astrophysicist. But, say I have a pole/...",55
3,NiklasFiedler,Astrophysics,Post,4 Tiny Missions Answering the Biggest Question...,10
4,Saashiv01,Astrophysics,Post,Light bending around a black hole I've been tr...,64
...,...,...,...,...,...
4490,jb_1988,Quantum,Post,Questions for a beginner in quantum computing ...,87
4491,gravitypushes,Quantum,Post,How does one observe the ourput of a QC if obs...,18
4492,Quantuum,Quantum,Post,"Quick stupid question, struggling with the con...",114
4493,akiel123,Quantum,Post,Quantum Computing Game Play this game. (link i...,70


In [26]:
all_post_df = all_post_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count','author'])
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics? [remo...,Astrophysics,Post,8,Previous-Ad-8137
1,Msc. Astrophy after Bsc. Biophysics? [removed],Astrophysics,Post,6,Previous-Ad-8137
2,"Not an astrophysicist. But, say I have a pole/...",Astrophysics,Post,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,Astrophysics,Post,10,NiklasFiedler
4,Light bending around a black hole I've been tr...,Astrophysics,Post,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18,gravitypushes
4492,"Quick stupid question, struggling with the con...",Quantum,Post,114,Quantuum
4493,Quantum Computing Game Play this game. (link i...,Quantum,Post,70,akiel123


In [32]:
all_post_df['combo_text'] = all_post_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics removed,Astrophysics,Post,8,Previous-Ad-8137
1,Msc Astrophy after Bsc Biophysics removed,Astrophysics,Post,6,Previous-Ad-8137
2,Not an astrophysicist But say I have a polesti...,Astrophysics,Post,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,Astrophysics,Post,10,NiklasFiedler
4,Light bending around a black hole Ive been try...,Astrophysics,Post,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18,gravitypushes
4492,Quick stupid question struggling with the conc...,Quantum,Post,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,Quantum,Post,70,akiel123


In [33]:
all_post_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count,author
0,Msc in astrophy after bsc in biophysics removed,1,Post,8,Previous-Ad-8137
1,Msc Astrophy after Bsc Biophysics removed,1,Post,6,Previous-Ad-8137
2,Not an astrophysicist But say I have a polesti...,1,Post,55,BlueJ5
3,4 Tiny Missions Answering the Biggest Question...,1,Post,10,NiklasFiedler
4,Light bending around a black hole Ive been try...,1,Post,64,Saashiv01
...,...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,0,Post,87,jb_1988
4491,How does one observe the ourput of a QC if obs...,0,Post,18,gravitypushes
4492,Quick stupid question struggling with the conc...,0,Post,114,Quantuum
4493,Quantum Computing Game Play this game link in ...,0,Post,70,akiel123


#### Set up everything

In [35]:
# X and y
X = all_post_df['combo_text']
y = all_post_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [36]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer(min_df=2)

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [37]:
len(cvec.get_feature_names())

9248

In [43]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=20)
dtc.fit(X_train_cvec,y_train)

dtc.score(X_train_cvec,y_train), dtc.score(X_test_cvec,y_test)

(0.9450940518556177, 0.936713686618376)

In [40]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=200)
abc.fit(X_train_cvec,y_train)
abc.score(X_train_cvec,y_train), abc.score(X_test_cvec,y_test)

(0.9635231316725978, 0.9428135722455204)