In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [26]:
# Per Samantha Chu:
# When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (e.g. Regular expression to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [3]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [4]:
astro_comm_df = pd.read_csv('./datasets/eda_astro_comment.csv')
astro_post_df = pd.read_csv('./datasets/eda_astro_post.csv')
quantum_comm_df = pd.read_csv('./datasets/eda_quantum_comment.csv')
quantum_post_df = pd.read_csv('./datasets/eda_quantum_post.csv')

Start with additional light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

**In hindsight this could have been done in the earlier EDA phase** but *c'est la vie*

In [9]:
astro_post_df.drop(columns = ['Unnamed: 0','created_utc','author','num_comments','title','selftext','subreddit','Post_comment'],inplace=True)
astro_post_df

Unnamed: 0,Category,combo_text,word_count
0,Astrophysics,Msc in astrophy after bsc in biophysics? [remo...,8
1,Astrophysics,Msc. Astrophy after Bsc. Biophysics? [removed],6
2,Astrophysics,"Not an astrophysicist. But, say I have a pole/...",55
3,Astrophysics,4 Tiny Missions Answering the Biggest Question...,10
4,Astrophysics,Light bending around a black hole I've been tr...,64
...,...,...,...
5991,Astrophysics,"Saw Lawrence Krauss talk last night, have some...",364
5992,Astrophysics,Where can I a list of dim satellite flyovers? ...,90
5993,Astrophysics,Do mathematical models for a white hole model ...,314
5994,Astrophysics,Need some help for Astrophysics/Astronomy Home...,112


In [10]:
quantum_post_df.drop(columns = ['Unnamed: 0','created_utc','author','num_comments','title','selftext','subreddit','Post_comment'],inplace=True)
quantum_post_df

Unnamed: 0,Category,combo_text,word_count
0,Quantum,Sandia National Laboratories : Rare open-acces...,11
1,Quantum,Some facts about Quantum physics the,6
2,Quantum,Reasoning under uncertainty with a near-term q...,9
3,Quantum,Quantum Computing Resources Hey everyone — jus...,72
4,Quantum,What's the complexity of 3-sat algorithm on qu...,43
...,...,...,...
4490,Quantum,Questions for a beginner in quantum computing ...,87
4491,Quantum,How does one observe the ourput of a QC if obs...,18
4492,Quantum,"Quick stupid question, struggling with the con...",114
4493,Quantum,Quantum Computing Game Play this game. (link i...,70


In [12]:
astro_post_df['combo_text'] = astro_post_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
astro_post_df

Unnamed: 0,Category,combo_text,word_count
0,Astrophysics,Msc in astrophy after bsc in biophysics removed,8
1,Astrophysics,Msc Astrophy after Bsc Biophysics removed,6
2,Astrophysics,Not an astrophysicist But say I have a polesti...,55
3,Astrophysics,4 Tiny Missions Answering the Biggest Question...,10
4,Astrophysics,Light bending around a black hole Ive been try...,64
...,...,...,...
5991,Astrophysics,Saw Lawrence Krauss talk last night have some ...,364
5992,Astrophysics,Where can I a list of dim satellite flyovers I...,90
5993,Astrophysics,Do mathematical models for a white hole model ...,314
5994,Astrophysics,Need some help for AstrophysicsAstronomy Homew...,112


In [13]:
quantum_post_df['combo_text'] = quantum_post_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
quantum_post_df

Unnamed: 0,Category,combo_text,word_count
0,Quantum,Sandia National Laboratories Rare openaccess ...,11
1,Quantum,Some facts about Quantum physics the,6
2,Quantum,Reasoning under uncertainty with a nearterm qu...,9
3,Quantum,Quantum Computing Resources Hey everyone just...,72
4,Quantum,Whats the complexity of 3sat algorithm on quan...,43
...,...,...,...
4490,Quantum,Questions for a beginner in quantum computing ...,87
4491,Quantum,How does one observe the ourput of a QC if obs...,18
4492,Quantum,Quick stupid question struggling with the conc...,114
4493,Quantum,Quantum Computing Game Play this game link in ...,70


In [14]:
all_post_df = pd.concat([astro_post_df,quantum_post_df])
all_post_df

Unnamed: 0,Category,combo_text,word_count
0,Astrophysics,Msc in astrophy after bsc in biophysics removed,8
1,Astrophysics,Msc Astrophy after Bsc Biophysics removed,6
2,Astrophysics,Not an astrophysicist But say I have a polesti...,55
3,Astrophysics,4 Tiny Missions Answering the Biggest Question...,10
4,Astrophysics,Light bending around a black hole Ive been try...,64
...,...,...,...
4490,Quantum,Questions for a beginner in quantum computing ...,87
4491,Quantum,How does one observe the ourput of a QC if obs...,18
4492,Quantum,Quick stupid question struggling with the conc...,114
4493,Quantum,Quantum Computing Game Play this game link in ...,70


In [15]:
all_post_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_post_df

Unnamed: 0,Category,combo_text,word_count
0,1,Msc in astrophy after bsc in biophysics removed,8
1,1,Msc Astrophy after Bsc Biophysics removed,6
2,1,Not an astrophysicist But say I have a polesti...,55
3,1,4 Tiny Missions Answering the Biggest Question...,10
4,1,Light bending around a black hole Ive been try...,64
...,...,...,...
4490,0,Questions for a beginner in quantum computing ...,87
4491,0,How does one observe the ourput of a QC if obs...,18
4492,0,Quick stupid question struggling with the conc...,114
4493,0,Quantum Computing Game Play this game link in ...,70


#### Set up everything

In [16]:
# X and y
X = all_post_df['combo_text']
y = all_post_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [17]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer()

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [18]:
len(cvec.get_feature_names())

24182

In [19]:
# trying a logreg model

# 1 - instantiate model
logreg = LogisticRegression(max_iter=1000)

# 2 - fit model
logreg.fit(X_train_cvec,y_train)

LogisticRegression(max_iter=1000)

In [21]:
logreg.predict(X_train_cvec)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [22]:
logreg.score(X_train_cvec,y_train)

0.9899593289273004

In [23]:
logreg.score(X_test_cvec,y_test)

0.9390011437285551