<font size="7"> Modeling </font>

In [153]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [4]:
df = pd.read_csv("./dataframe", keep_default_na = False, index_col = 0)

In [5]:
df.head(10) #output first ten rows of dataframe

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink,textsum,0
0,Gorgeous NASA X-ray images of universe look li...,1602008143,,astrophysics,Sorin61,False,/r/astrophysics/comments/j6aaro/gorgeous_nasa_...,Gorgeous NASA X-ray images of universe look li...,0
1,Can someone please explain how to calculate th...,1602006466,,astrophysics,astrojosue,False,/r/astrophysics/comments/j69qx5/can_someone_pl...,Can someone please explain how to calculate th...,0
2,Can someone explain formula to calculate the m...,1602000784,,astrophysics,astrojosue,False,/r/astrophysics/comments/j67wz2/can_someone_ex...,Can someone explain formula to calculate the m...,0
3,Can someone elaborate?,1601999794,,astrophysics,astrojosue,False,/r/astrophysics/comments/j67lw5/can_someone_el...,Can someone elaborate?,0
4,Maybe astrophysics is not for me! Just watched...,1601987660,,astrophysics,Yugitonii,False,/r/astrophysics/comments/j6485v/maybe_astrophy...,Maybe astrophysics is not for me! Just watched...,0
5,Extended essay,1601966026,,astrophysics,annawithann,False,/r/astrophysics/comments/j608lt/extended_essay/,Extended essay,0
6,Extended essay,1601965817,"Hello everyone! I'm a high school student, and...",astrophysics,MartynaLaptev,False,/r/astrophysics/comments/j6079j/extended_essay/,Extended essay Hello everyone! I'm a high scho...,0
7,One of the strangest objects in space may be k...,1601956254,,astrophysics,microworlds,False,/r/astrophysics/comments/j5y9hv/one_of_the_str...,One of the strangest objects in space may be k...,0
8,Any worthwhile certificates?,1601955078,I have a bachelors in IT and am hoping to purs...,astrophysics,brain____dead,False,/r/astrophysics/comments/j5xzpz/any_worthwhile...,Any worthwhile certificates? I have a bachelor...,0
9,What are your thoughts on Gravitons being used...,1601954472,,astrophysics,TheScienceVerse,False,/r/astrophysics/comments/j5xunw/what_are_your_...,What are your thoughts on Gravitons being used...,0


In [6]:
df.rename(columns={"0": "category"}, inplace = True) # change column name of binary class

Baseline Models

In [7]:
X = df["title"] # assign feature space
y = df["category"] #assign target 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=60, stratify=y) # split data

In [9]:
cvect = CountVectorizer(stop_words = "english") #instantiate CountVectorizer

In [15]:
lr = LogisticRegression(max_iter=300) #Instantiate Logistic Regression

In [16]:
pipe = make_pipeline(cvect, lr) # make pipeline

In [17]:
pipe.fit(X_train, y_train) # fit pipeline

Pipeline(steps=[('countvectorizer', CountVectorizer(stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=300))])

In [144]:
pipe.score(X_train, y_train)

0.9933333333333333

In [145]:
pipe.score(X_test,y_test)

0.768

There is a sizeable discrepancy between the score on training data and testing data. The score of 0.768 is certainly better than 0.5 ( randomly choosing). 

In [125]:
X = df["textsum"]

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [149]:
cvect = CountVectorizer(stop_words = "english")

In [150]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=300))])

In [151]:
pipe.score(X_train, y_train)

0.9986666666666667

In [152]:
pipe.score(X_test, y_test)

0.816

The model performed better when using the sum of the title and selftext columns. 

In [159]:
word_coefs = pipe.named_steps["logisticregression"].coef_[0] #

words = pipe.named_steps['countvectorizer'].get_feature_names()

word_def = pd.DataFrame({'coefs': word_coefs, 'word': words})

word_def.nlargest(15, 'coefs')

#output 15  most signifcant words in accordance with coefficents 
#the higher the coefficent the more it contributed to the classification model

Unnamed: 0,coefs,word
1456,2.677758,particle
358,1.269378,cern
1602,1.189057,quantum
2106,0.987254,using
1876,0.980944,standard
980,0.970078,higgs
2209,0.916095,youtube
1458,0.909031,particles
142,0.90155,antimatter
1861,0.823442,spin


In [107]:
def pipe_in(transformer, estimator):
    pipe = make_pipeline(transformer, estimator)
    return pipe

def parameters(transformer):
    adjust = (str(transformer).replace('()', "")).lower()
    params = { adjust + '__max_features': [100, 500], adjust + '__stop_words': ['english', None], adjust + '__ngram_range': [(1, 1), (1, 2), (1, 3)]}
    return params


def gridsearch(transformer, estimator):

    pipe = pipe_in(transformer, estimator)
    params = parameters(transformer)
    gs = GridSearchCV(pipe,
                      param_grid=params,
                      cv=7,
                      n_jobs=-1)
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    
  
    print('Best model:', gs.best_params_)
    print('Best model:', best_model)
    print('Best model Score:', gs.best_score_)
    print('Training data Score:', best_model.score(X_train, y_train))
    print('Testing Data Score:', best_model.score(X_test, y_test))
#establish a gridsearch    

In [110]:
gridsearch(CountVectorizer(), LogisticRegression(max_iter=400)) #gridsearch for logistic regression and countvectorizer

Best model: {'countvectorizer__max_features': 500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english'}
Best model: Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=500, stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=400))])
Best model Score: 0.8293333333333333
Training data Score: 0.94
Testing Data Score: 0.764


In [78]:
gridsearch(CountVectorizer(), MultinomialNB()) # gridsearch with multinomial 

CountVectorizer
Best model: {'countvectorizer__max_features': 500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english'}

Best model: Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=500, stop_words='english')),
                ('multinomialnb', MultinomialNB())])

Cross-val score of the best model: 0.7666666666666667
Accuracy of best model on the training data: 0.8706666666666667
Accuracy of best model on the testing data: 0.736




In [79]:
gridsearch(CountVectorizer(), AdaBoostClassifier()) #adaboost 

CountVectorizer
Best model: {'countvectorizer__max_features': 400, 'countvectorizer__ngram_range': (1, 4), 'countvectorizer__stop_words': None}

Best model: Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=400, ngram_range=(1, 4))),
                ('adaboostclassifier', AdaBoostClassifier())])

Cross-val score of the best model: 0.772
Accuracy of best model on the training data: 0.8573333333333333
Accuracy of best model on the testing data: 0.748




In [123]:
toke = RegexpTokenizer('\w+') # establish tokenizer

In [124]:
lemmat = WordNetLemmatizer() #establish lemmatizer

In [120]:
def tokenize_lemmatize(column):
    list_of_tokens = []
    for string in column:
        list_of_tokens.append(toke.tokenize(string))
    e =  list_of_tokens
    
    list_of_lemmas = []
    for token in e:
        list_of_lemmas.append(lemmat.lemmatize(str(token)))
    return list_of_lemmas   

# establish a function that tokenize and lemmatizes

In [121]:
X_train = tokenize_lemmatize(X_train)
X_test = tokenize_lemmatize(X_test)

In [122]:
gridsearch(TfidfVectorizer(), LogisticRegression())

Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}
Best model: Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=500, stop_words='english')),
                ('logisticregression', LogisticRegression())])
Best model Score: 0.8453333333333333
Training data Score: 0.936
Testing Data Score: 0.82


In [127]:
port = PorterStemmer() #use a porter stemmer 

In [131]:
def tokenize_porter(column):
    list_of_tokens = []
    for string in column:
        list_of_tokens.append(toke.tokenize(string))
    e =  list_of_tokens

    list_of_stems = []
    for string in e:
        list_of_stems.append(port.stem(str(string)))
    return list_of_stems

In [132]:
X_train = tokenize_porter(X_train)
X_test = tokenize_porter(X_test)

In [148]:
gridsearch(TfidfVectorizer(), LogisticRegression()) 

Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}
Best model: Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=500, stop_words='english')),
                ('logisticregression', LogisticRegression(C=1.1))])
Best model Score: 0.8453333333333333
Training data Score: 0.936
Testing Data Score: 0.82


In [None]:
rm = RandomForestClassifier(max_depth=2, random_state=0) # random forests 

In [135]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
gridsearch(TfidfVectorizer(), RandomForestClassifier(max_depth=6))

Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}
Best model: Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=500, stop_words='english')),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=6))])
Best model Score: 0.7813333333333333
Training data Score: 0.84
Testing Data Score: 0.716


In [144]:
from sklearn.linear_model import SGDClassifier # stochastic Gradient Descent 

In [145]:
gridsearch(TfidfVectorizer(), SGDClassifier(loss="hinge", penalty="l2", max_iter=5))

Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 3), 'tfidfvectorizer__stop_words': 'english'}
Best model: Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=500, ngram_range=(1, 3),
                                 stop_words='english')),
                ('sgdclassifier', SGDClassifier(max_iter=5))])
Best model Score: 0.8280000000000001
Training data Score: 0.968
Testing Data Score: 0.768




Classification Metrics

In [None]:
gridsearch(TfidfVectorizer(), LogisticRegression())

In [151]:
pipe = make_pipeline(TfidfVectorizer(max_features=500),
                          LogisticRegression())
pipe.fit(X_train, y_train)


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=500)),
                ('logisticregression', LogisticRegression())])

In [154]:
preds = best_pipe.predict(X_test)
conf_matrix = confusion_matrix(y_test, preds)
conf_matrix
#Confusion Matrix 

array([[105,  20],
       [ 35,  90]])

In [155]:
tn, fp, fn, tp = conf_matrix.ravel() # True negative, False positive, false negative, True positive 

In [157]:
pd.DataFrame(conf_matrix, index=['actual Astrophysics', 'actual Particle Physics'], 
                 columns=['predicted Astrophysics', 'predicted Particle Physics'])

Unnamed: 0,predicted Astrophysics,predicted Particle Physics
actual Astrophysics,105,20
actual Particle Physics,35,90


The best model was logistic regression coupled with Tfidf.  It acquired an accuracy of 0.82.  The underlying limitations of this project was that only 500 posts were utilized for both the Astrophysics and Particle Physics Reddit. 