In [1]:
#When dealing with text data, there are common pre-processing steps. We won't necessarily use all of them every time we deal with text data.
#- Remove special characters (there was that Regular expression example he gave on how to just keep the words)
#- Tokenizing 
#- Lemmatizing/Stemming
#- Stop word removal

In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# remove special chars
#df['title'] = df['title'].replace('[^\w ]','',regex=True).astype(str) 
# awesome regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [4]:
astro_comm_df = pd.read_csv('./datasets/eda_astro_comment.csv')
astro_post_df = pd.read_csv('./datasets/eda_astro_post.csv')
quantum_comm_df = pd.read_csv('./datasets/eda_quantum_comment.csv')
quantum_post_df = pd.read_csv('./datasets/eda_quantum_post.csv')

I'm going to create an initial model that looks only at Comments for both Astrophysics and QuantumComputing.  I'll start by applying some light cleaning to each DataFrame and then combine them into one (before running train_test_split etc)

In [5]:
astro_comm_df.head(2)

Unnamed: 0.1,Unnamed: 0,body,created_utc,subreddit,author,Category,Post_comment,word_count
0,0,Your mum says your pole is more like 2 inches ...,1617222727,astrophysics,moon-worshiper,Astrophysics,Comment,67
1,1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",1617220516,astrophysics,Lewri,Astrophysics,Comment,19


In [13]:
astro_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit','author'],inplace=True)
astro_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40
3,oh okay thanks for clearing that.,Astrophysics,Comment,6
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24
...,...,...,...,...
9476,You can make similar conclusions about redshif...,Astrophysics,Comment,53
9477,Trust me -- you can usually busk it. Depends o...,Astrophysics,Comment,43
9478,Light has a frequency that can be measured. Wh...,Astrophysics,Comment,208
9479,The interplay between story details and the ph...,Astrophysics,Comment,48


In [14]:
quantum_comm_df.drop(columns=['Unnamed: 0','created_utc','subreddit','author'],inplace=True)
quantum_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Related question: there are cloud-based quantu...,Quantum,Comment,89
1,"3-SAT is NP-complete, and we don't expect to b...",Quantum,Comment,98
2,It is currently conjectured that quantum compu...,Quantum,Comment,34
3,"serious question, since i know nothing about t...",Quantum,Comment,57
4,"This was really an excellent podcast, thanks!",Quantum,Comment,7
...,...,...,...,...
9751,[https://www.lanl.gov/projects/national-securi...,Quantum,Comment,1
9752,"Yeah, just thought that post was particularly ...",Quantum,Comment,33
9753,‘Internships’ are rarely advertised. Find prof...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [15]:
astro_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','author','subreddit'],inplace=True)
astro_post_df

Unnamed: 0,Category,Post_comment,combo_text,word_count
0,Astrophysics,Post,Msc in astrophy after bsc in biophysics? [remo...,8
1,Astrophysics,Post,Msc. Astrophy after Bsc. Biophysics? [removed],6
2,Astrophysics,Post,"Not an astrophysicist. But, say I have a pole/...",55
3,Astrophysics,Post,4 Tiny Missions Answering the Biggest Question...,10
4,Astrophysics,Post,Light bending around a black hole I've been tr...,64
...,...,...,...,...
5991,Astrophysics,Post,"Saw Lawrence Krauss talk last night, have some...",364
5992,Astrophysics,Post,Where can I a list of dim satellite flyovers? ...,90
5993,Astrophysics,Post,Do mathematical models for a white hole model ...,314
5994,Astrophysics,Post,Need some help for Astrophysics/Astronomy Home...,112


In [16]:
quantum_post_df.drop(columns=['Unnamed: 0','created_utc','title','selftext','num_comments','author','subreddit'],inplace=True)
quantum_post_df

Unnamed: 0,Category,Post_comment,combo_text,word_count
0,Quantum,Post,Sandia National Laboratories : Rare open-acces...,11
1,Quantum,Post,Some facts about Quantum physics the,6
2,Quantum,Post,Reasoning under uncertainty with a near-term q...,9
3,Quantum,Post,Quantum Computing Resources Hey everyone — jus...,72
4,Quantum,Post,What's the complexity of 3-sat algorithm on qu...,43
...,...,...,...,...
4490,Quantum,Post,Questions for a beginner in quantum computing ...,87
4491,Quantum,Post,How does one observe the ourput of a QC if obs...,18
4492,Quantum,Post,"Quick stupid question, struggling with the con...",114
4493,Quantum,Post,Quantum Computing Game Play this game. (link i...,70


In [17]:
all_comm_df = pd.concat([astro_comm_df,quantum_comm_df])
all_comm_df

Unnamed: 0,body,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40
3,oh okay thanks for clearing that.,Astrophysics,Comment,6
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24
...,...,...,...,...
9751,[https://www.lanl.gov/projects/national-securi...,Quantum,Comment,1
9752,"Yeah, just thought that post was particularly ...",Quantum,Comment,33
9753,‘Internships’ are rarely advertised. Find prof...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [18]:
all_post_df = pd.concat([astro_post_df,quantum_post_df])
all_post_df

Unnamed: 0,Category,Post_comment,combo_text,word_count
0,Astrophysics,Post,Msc in astrophy after bsc in biophysics? [remo...,8
1,Astrophysics,Post,Msc. Astrophy after Bsc. Biophysics? [removed],6
2,Astrophysics,Post,"Not an astrophysicist. But, say I have a pole/...",55
3,Astrophysics,Post,4 Tiny Missions Answering the Biggest Question...,10
4,Astrophysics,Post,Light bending around a black hole I've been tr...,64
...,...,...,...,...
4490,Quantum,Post,Questions for a beginner in quantum computing ...,87
4491,Quantum,Post,How does one observe the ourput of a QC if obs...,18
4492,Quantum,Post,"Quick stupid question, struggling with the con...",114
4493,Quantum,Post,Quantum Computing Game Play this game. (link i...,70


In [23]:
all_post_df = all_post_df.reindex(columns=['combo_text','Category', 'Post_comment', 'word_count'])
all_post_df

Unnamed: 0,combo_text,Category,Post_comment,word_count
0,Msc in astrophy after bsc in biophysics? [remo...,Astrophysics,Post,8
1,Msc. Astrophy after Bsc. Biophysics? [removed],Astrophysics,Post,6
2,"Not an astrophysicist. But, say I have a pole/...",Astrophysics,Post,55
3,4 Tiny Missions Answering the Biggest Question...,Astrophysics,Post,10
4,Light bending around a black hole I've been tr...,Astrophysics,Post,64
...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18
4492,"Quick stupid question, struggling with the con...",Quantum,Post,114
4493,Quantum Computing Game Play this game. (link i...,Quantum,Post,70


In [25]:
all_comm_df = all_comm_df.rename(columns={'body':'combo_text'})
all_comm_df

Unnamed: 0,combo_text,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40
3,oh okay thanks for clearing that.,Astrophysics,Comment,6
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24
...,...,...,...,...
9751,[https://www.lanl.gov/projects/national-securi...,Quantum,Comment,1
9752,"Yeah, just thought that post was particularly ...",Quantum,Comment,33
9753,‘Internships’ are rarely advertised. Find prof...,Quantum,Comment,69
9754,I bet the quantum internet people at TUDelf an...,Quantum,Comment,23


In [27]:
all_text_df = pd.concat([all_comm_df,all_post_df])
all_text_df

Unnamed: 0,combo_text,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,"&gt; 760 mph\n\nJust to clarify, thats the spe...",Astrophysics,Comment,19
2,Threw this git repo together [https://github.c...,Astrophysics,Comment,40
3,oh okay thanks for clearing that.,Astrophysics,Comment,6
4,"again, the movement needs to propagate through...",Astrophysics,Comment,24
...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18
4492,"Quick stupid question, struggling with the con...",Quantum,Post,114
4493,Quantum Computing Game Play this game. (link i...,Quantum,Post,70


In [28]:
all_text_df['combo_text'] = all_text_df['combo_text'].replace('[^\w ]','',regex=True).astype(str)
all_text_df

Unnamed: 0,combo_text,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,Astrophysics,Comment,67
1,gt 760 mphJust to clarify thats the speed of s...,Astrophysics,Comment,19
2,Threw this git repo together httpsgithubcomSha...,Astrophysics,Comment,40
3,oh okay thanks for clearing that,Astrophysics,Comment,6
4,again the movement needs to propagate through ...,Astrophysics,Comment,24
...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,Quantum,Post,87
4491,How does one observe the ourput of a QC if obs...,Quantum,Post,18
4492,Quick stupid question struggling with the conc...,Quantum,Post,114
4493,Quantum Computing Game Play this game link in ...,Quantum,Post,70


In [29]:
all_text_df['Category'].replace({'Astrophysics':1,'Quantum':0},inplace=True)
all_text_df

Unnamed: 0,combo_text,Category,Post_comment,word_count
0,Your mum says your pole is more like 2 inches ...,1,Comment,67
1,gt 760 mphJust to clarify thats the speed of s...,1,Comment,19
2,Threw this git repo together httpsgithubcomSha...,1,Comment,40
3,oh okay thanks for clearing that,1,Comment,6
4,again the movement needs to propagate through ...,1,Comment,24
...,...,...,...,...
4490,Questions for a beginner in quantum computing ...,0,Post,87
4491,How does one observe the ourput of a QC if obs...,0,Post,18
4492,Quick stupid question struggling with the conc...,0,Post,114
4493,Quantum Computing Game Play this game link in ...,0,Post,70


#### Set up everything

In [30]:
# X and y
X = all_text_df['combo_text']
y = all_text_df['Category']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

In [31]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer()

# Fit the vectorizer on our corpus.
cvec.fit(X_train)

# Transform the corpus.
X_train_cvec = cvec.transform(X_train)

#transform the test data
X_test_cvec = cvec.transform(X_test)

In [32]:
len(cvec.get_feature_names())

46195

In [33]:
# trying a logreg model

# 1 - instantiate model
logreg = LogisticRegression(max_iter=1000)

# 2 - fit model
logreg.fit(X_train_cvec,y_train)

LogisticRegression(max_iter=1000)

In [34]:
logreg.predict(X_train_cvec)

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [36]:
logreg.score(X_train_cvec,y_train)

0.9610692500897022

In [37]:
logreg.score(X_test_cvec,y_test)

0.8657158234660925