### This model uses the engineered "title_text" column as its feature, since it performed better than the "title" column alone, and a Random Forest classifier in an effort to reduce the variance seen in the logistic regression models.

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix
from sklearn.pipeline import Pipeline

In [5]:
# Read the scraped Reddit posts into a DataFrame and preview the first rows.
csv_path = '../../data/reddit_content_20220228-063053.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,title,selftext,subreddit,created_utc
0,0,I'm working on my first UX project about reduc...,"Hi, I know this is annoying but this is my fir...",sustainability,1646023113
1,1,I’m not quite sure what to say for this but I ...,,sustainability,1646021616
2,2,Self-Sustainable Communities,"Over the past few years, I've been learning a ...",sustainability,1646009146
3,3,Great Barrier Reef: New Huge Coral Found,,sustainability,1645999683
4,4,Drying Hands: Blow Drying or Paper Towels,[removed],sustainability,1645997735


In [6]:
# Drop the "Unnamed: 0*" columns, which look like leftover row indices
# (presumably from earlier CSV round-trips -- confirm). Reassigning with
# errors='ignore' instead of inplace=True keeps this cell safe to re-run.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

# Encode the target: 1 = sustainability, 0 = academia.
df['subreddit'] = df['subreddit'].replace({'sustainability':1, 'academia':0})

# Build the combined text feature. Both parts are NaN-filled: a missing title
# would otherwise turn the whole concatenation into NaN, which the
# CountVectorizer step cannot tokenize.
df['title_text'] = df['title'].fillna('') + ' ' + df['selftext'].fillna('')

# Feature (raw text) and target (encoded subreddit) used by the model cells.
X = df['title_text']
y = df['subreddit']

## Baseline

In [7]:
# Share of each class in the target; the larger share is the accuracy a
# majority-class baseline would reach.
class_share = y.value_counts(normalize=True)
class_share

0    0.503403
1    0.496597
Name: subreddit, dtype: float64

#### The baseline model predicts every post to be from the academia subreddit (the majority class) and would be correct 50.3% of the time.

In [8]:
# Hold out a test set (test_size left at the library default), stratified on
# the label so both partitions keep the same class balance; the seed is fixed
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [9]:
# Bag-of-words vectorizer feeding a random forest. The forest is seeded (same
# seed as the train/test split) so that scores are repeatable between runs;
# without it every fit draws a different set of trees.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Search vectorizer and forest settings by trial and error, hoping to beat the
# 94% accuracy from 02_model.
params = {
    'cvec__stop_words': [None, 'english'],
    'cvec__max_df': [1.0, .75, .5, .25],
    'cvec__binary': [True, False],
    'rf__n_estimators': [75, 100, 125, 150],
    'rf__max_features': [None, 'sqrt', 'log2', 3, 6, 9],
    # Not searched at the moment (previously commented out):
    #   'cvec__min_df': [0.01, 0.02, 0.03, 5, 10]
    #   'cvec__strip_accents': ['ascii', 'unicode', None]
}

# 2 * 4 * 2 * 4 * 6 = 384 parameter combinations, each cross-validated with the
# default number of folds. n_jobs=-1 spreads those fits over all available
# cores; it does not change what is searched.
gs = GridSearchCV(pipe,
                  param_grid=params,
                  n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_score_)  # mean cross-validated score of the best combination
gs.best_params_

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# arguments. It is reached through the already-imported `metrics` module.
# NOTE(review): the legacy `plot_confusion_matrix` import in the notebook's
# import cell must also be dropped before running on scikit-learn >= 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(
    gs,
    X_test,
    y_test,
    display_labels=['Academia', 'Sustainability'],
    cmap='cividis',
    colorbar=False,
);

In [None]:
#source: code from Tori Powers
# Summarize the fitted grid search on the training and held-out test sets.
trainscore = gs.score(X_train, y_train)
testscore = gs.score(X_test, y_test)

# Reuse the cross-validated score the search already computed. Passing `gs`
# itself to cross_val_score would clone it and re-run the entire grid search
# once per fold (nested CV), multiplying the runtime for a summary number.
crossval = gs.best_score_

preds = gs.predict(X_test)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()
recall = metrics.recall_score(y_test, preds)
precision = metrics.precision_score(y_test, preds)
# No `scoring` was given to the search, so gs.score falls back to the
# classifier's default score, which is accuracy.
accuracy = testscore
# True-negative rate: share of class-0 (academia) posts correctly identified.
specificity = tn / (tn + fp)

print (f'Train Score = {trainscore}')
print (f'Test Score = {testscore}')
print (f'Cross Val Score = {crossval}')
print (f'Accuracy Score = {accuracy}')
print (f'Recall Score = {recall}')
print (f'Precision Score = {precision}')
print (f'Specificity Score = {specificity}')

## Evaluations

Since this model's accuracy of 90% is lower than the 94% achieved by a previous model, no further work will be done with this model.