<span style="font-family:Trebuchet MS; font-size:2em;">Project 3 | NB4: Modeling</span>

Riley Robertson | Reddit Classification Project | 

## Imports

In [169]:
# Basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Custom
import utilities.densmore as dns


# Modeling
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score

# Plots
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

---

## Reading in CSV files from data collection

In [2]:
df_m = pd.read_csv('../data/reddit_posts_clean_modeling.csv')

In [3]:
df_m.shape

(12534, 57)

# Modeling

## Setup

### Train/Test Split

In [4]:
X = df_m.drop(columns=['subreddit', 'target'])
y = df_m['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=74, stratify=y)

In [6]:
# X_train.head()
# X_test.head()

### Vectorizing

In [7]:
# add_stop_words = []
stp_wds = text.ENGLISH_STOP_WORDS #.union(add_stop_words) # uncomment list and .union() to add stop words

In [8]:
tvec = TfidfVectorizer(stop_words=stp_wds)                # instantiate vectorizer

X_train_tvec = tvec.fit_transform(X_train['alltext'])     # fit/transform X_train
X_test_tvec = tvec.transform(X_test['alltext'])           # transform X_test

In [81]:
type(X_train_tvec)       # sparse matrix

scipy.sparse.csr.csr_matrix

In [80]:
X_train_tvec.todense()   # dense matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
X_train_df = pd.DataFrame(X_train_tvec.todense(), columns=tvec.get_feature_names())
# X_train_df.head()

In [12]:
X_test_df = pd.DataFrame(X_test_tvec.todense(), columns=tvec.get_feature_names())
# X_test_df.head()

In [13]:
X_train_df.reset_index(drop=True, inplace=True)      # reset train_df index after transform
X_train.reset_index(drop=True, inplace=True)         # reset original X_train to match for concat

X_test.reset_index(drop=True, inplace=True)          # reset test_df index after transform
X_test_df.reset_index(drop=True, inplace = True)     # reset original X_test to match for concat

In [70]:
X_train_all = pd.concat([X_train, X_train_df],axis = 1)    # concat train token columns with other features
# X_train_all.head()

In [15]:
X_train_all.shape

(9400, 29213)

In [16]:
X_test_all = pd.concat([X_test, X_test_df],axis = 1)       # concat test token columns with other features
# X_test_all.head()

## Baseline Model

In [17]:
y_test.value_counts(normalize=True)

0    0.557116
1    0.442884
Name: target, dtype: float64

The cleaned dataset contains about 56% NFL posts and 44% EPL posts. This is our baseline model.

## Logistic Regression

### X, y setup

In [18]:
X_train_mod = X_train_all.drop(columns=['alltext'])
y_train_mod = y_train
X_test_mod = X_test_all.drop(columns=['alltext'])
y_test_mod = y_test

### Custom quickmodel function

In [71]:
# dns.quickmod_logregsa_nlp(X_train=X_train_mod, y_train=y_train_mod, 
#                                X_test=X_test_mod, y_test=y_test_mod,
#                                alist=[.15, .1525, .155, .1575, .16], 
#                                penalty='l1', 
#                                random_state=74
#                                )

<span style="font-family:Trebuchet MS; display:block; text-align:center; font-size:2em;">Testing Values for ùõº</span>

`|         Batch 1          |          Batch 2           |           Batch 3           |         Batch 4         |`

| alpha     | C           | Train Acc.| Test Acc.     |‚îÉ| alpha     | C           | Train Acc.| Test Acc.   |‚îÉ| alpha       | C           | Train Acc.| Test Acc.     |‚îÉ| alpha     | C           | Train Acc.| Test Acc.   |
|:----------|:------------|:----------|:--------------| |:----------|:------------|:----------|:------------| |:----------|:------------|:----------|:--------------| |:----------|:------------|:----------|:------------|
| **0.1**   | **10**      | **1**     | **0.9872**    |‚îÉ| 0.05      | 20          | 1         | 0.9875      |‚îÉ| 0.04        | 25          | 1         | 0.9869        |‚îÉ| **0.15**  | **6.6667**  | **1**     | **0.9885**  |
| 0.2       | 5           | 1         | 0.9860        |‚îÉ| 0.8       | 1.25        | 1         | 0.9796      |‚îÉ| 0.06        | 16.6667     | 1         | 0.9879        |‚îÉ| 0.1525    | 6.5574      | 1         | 0.9879      |
| 0.3       | 3.3333      | 1         | 0.9866        |‚îÉ| 0.12      | 8.3333      | 1         | 0.9869      |‚îÉ| 0.14        | 7.1428      | 1         | 0.9860        |‚îÉ| 0.155     | 6.4516      | 1         | 0.9888      |
| 0.4       | 2.5         | 1         | 0.9818        |‚îÉ| **0.15**  | **6.6667**  | **1**     | **0.9885**  |‚îÉ| **0.16**    | **6.25**    | **1**     | **0.9888**    |‚îÉ| **0.16**  | **6.25**    | **1**     | **0.9888**  | 


### Using best value for alpha

In [19]:
sc = StandardScaler(with_mean=False)

Z_train = sc.fit_transform(X_train_mod)
Z_test = sc.transform(X_test_mod)

logreg = LogisticRegression(penalty='l1', C=(1/.155), solver='liblinear', random_state=74)

logreg.fit(Z_train, y_train_mod)

LogisticRegression(C=6.451612903225807, penalty='l1', random_state=74,
                   solver='liblinear')

### Accuracy

In [83]:
logreg.score(Z_train, y_train_mod)

(1.0, 0.9865985960433951)

In [84]:
logreg.score(Z_test, y_test_mod)

0.9865985960433951

### Predictions and Confusion Matrix

In [180]:
logreg_preds = logreg.predict(Z_test)

tn, fp, fn, tp = confusion_matrix(y_test, logreg_preds).ravel()

plot_confusion_matrix(logreg, Z_test, y_test, values_format='d', cmap='Greens', labels=['Predictions', 'Actual'], display_labels=['NFL', 'EPL']);

  elif np.all([l not in y_true for l in labels]):


ValueError: At least one label specified must be in y_true

In [108]:
print(f'  True Neg: {tn}    False Pos: {fp}')
print(f' False Neg: {fn}       True Pos: {tp}')

  True Neg: 1736    False Pos: 10
 False Neg: 32       True Pos: 1356


### Exponentiated Coefficients

In [None]:
logreg_coefs_exp = list(zip(X_train_mod.columns, (list(np.exp(logreg.coef_)[0]))))

In [None]:
# pd.DataFrame(logreg_coefs_exp, columns=['Feature Name', 'Odds']).sort_values(by=['Odds'], ascending = False)[161:201]

In [None]:
# pd.DataFrame(logreg_coefs_exp, columns=['Feature Name', 'Odds']).sort_values(by=['Odds'], ascending = True)[161:201]

**Top Words**

A one-unit change in [token] means that classification as 1 (Premier League) is [odds] times as likely.

<span style="font-family:Trebuchet MS; display:block; text-align:center; font-size:2em;">Token Importance</span>
<span style="font-family:Trebuchet MS; display:block; text-align:center; font-size:em;">Token Importance</span>

|| <span style="font-family:Trebuchet MS; font-size:2em;">English Premier League</span>|  | <span style="font-family:Trebuchet MS; font-size:2em;">National Football League</span> | |
|---|---------------------:|---:|:------------------|---|


| Odds     | EPL Tokens (21-40)      | ‚îÉ | Odds     | EPL Tokens (1-20)    | ‚îÉ | NFL Tokens (1-20) | Odds     | ‚îÉ | NFL Tokens (21-40) | Odds     |
|---------:|------------------------:|:-:|---------:|---------------------:|:-:|:------------------|:---------|:-:|:------------------|:---------|
| 1.449 | clubs                   | ‚îÉ | 9.527 | Discussion           | ‚îÉ | nfl               | 0.103 | ‚îÉ | division          | 0.786 |
| 1.44  | everton                 | ‚îÉ | 5.964 | premier              | ‚îÉ | comments          | 0.406 | ‚îÉ | college           | 0.793 |
| 1.428 | goals                   | ‚îÉ | 5.710 | Question             | ‚îÉ | qb                | 0.485 | ‚îÉ | rookie            | 0.797 |
| 1.406 |       Manchester United | ‚îÉ | 2.853   | liverpool            | ‚îÉ | bowl              | 0.573 | ‚îÉ | coaches           | 0.797 |
| 1.401 | fixtures                | ‚îÉ | 2.770 | pl                   | ‚îÉ | draft             | 0.583 | ‚îÉ | playoffs          | 0.804 |
| 1.381 | goal                    | ‚îÉ | 2.330 | club                 | ‚îÉ | field             | 0.635  | ‚îÉ | rodgers           | 0.80  |
| 1.380 | matches                 | ‚îÉ | 2.291 | league               | ‚îÉ | patriots          | 0.652 | ‚îÉ | conference        | 0.816 |
| 1.365 | manchester              | ‚îÉ | 2.274 | Premier League       | ‚îÉ | offense           | 0.656 | ‚îÉ | superbowl         | 0.821 |
| 1.365 |       Liverpool         | ‚îÉ | 2.154 | arsenal              | ‚îÉ | yards             | 0.673 | ‚îÉ | running           | 0.823 |
| 1.340 | wolves                  | ‚îÉ | 2.131 | epl                  | ‚îÉ | brady             | 0.682 | ‚îÉ | qbs               | 0.824 |
| 1.337  | prem                    | ‚îÉ | 2.081  | chelsea              | ‚îÉ | playoff           | 0.693 | ‚îÉ | pff               | 0.829 |
| 1.331 | cup                     | ‚îÉ | 1.841 | var                  | ‚îÉ | none              | 0.701 | ‚îÉ | practice          | 0.830 |
| 1.320 | southampton             | ‚îÉ | 1.702 | united               | ‚îÉ | cowboys           | 0.734 | ‚îÉ | guy               | 0.830 |
| 1.315  | mourinho                | ‚îÉ | 1.672 | match                | ‚îÉ | franchise         | 0.746 | ‚îÉ | nfc               | 0.834 |
| 1.315 | newcastle               | ‚îÉ | 1.655 | poll                 | ‚îÉ | eagles            | 0.752 | ‚îÉ | brown             | 0.834 |
| 1.290 | transfer                | ‚îÉ | 1.642  | spurs                | ‚îÉ | browns            | 0.753 | ‚îÉ | yard              | 0.835 |
| 1.280 | manager                 | ‚îÉ | 1.629 | tottenham            | ‚îÉ | ravens            | 0.768 | ‚îÉ | trade             | 0.837 |
| 1.277 | london                  | ‚îÉ | 1.571 | city                 | ‚îÉ | quarterback       | 0.771 | ‚îÉ | pass              | 0.840 |
| 1.272 | leicester               | ‚îÉ | 1.532 | english              | ‚îÉ | jets              | 0.774 | ‚îÉ | years             | 0.841 |
| 1.248 | brighton                | ‚îÉ | 1.469 | News                 | ‚îÉ | offensive         | 0.774 | ‚îÉ | like              | 0.841 |


| <span style="font-family:Trebuchet MS; font-size:2em;">English Premier League</span>     | ‚îÉ | <span style="font-family:Trebuchet MS; font-size:2em;">National Football League</span> | 
|---------------------:|---:|:------------------|

| Odds     | EPL Tokens (61-80)    | ‚îÉ | Odds     | EPL Tokens (41-60)      | ‚îÉ | NFL Tokens (41-60) | Odds     | ‚îÉ | NFL Tokens (61-80) | Odds     |
|---------:|------------------------:|:-:|---------:|---------------------:|:-:|:------------------|:---------|:-:|:------------------|:---------|
| 1.196975 | premierleague         | ‚îÉ | 1.247207 | palace                  | ‚îÉ | td                 | 0.842231 | ‚îÉ | year               | 0.859388 |
| 1.196412 | leeds                 | ‚îÉ | 1.246749 | offside                 | ‚îÉ | giants             | 0.842783 | ‚îÉ | falcons            | 0.859973 |
| 1.195304 | Chelsea               | ‚îÉ | 1.244287 | madrid                  | ‚îÉ | offseason          | 0.843461 | ‚îÉ | sacks              | 0.86044  |
| 1.191963 | join                  | ‚îÉ | 1.239036 | villa                   | ‚îÉ | chiefs             | 0.844952 | ‚îÉ | receiver           | 0.860556 |
| 1.185141 | sky                   | ‚îÉ | 1.238473 | american                | ‚îÉ | bay                | 0.844984 | ‚îÉ | dude               | 0.861166 |
| 1.184474 | burnley               | ‚îÉ | 1.236565 | hello                   | ‚îÉ | roster             | 0.845317 | ‚îÉ | broncos            | 0.861406 |
| 1.183534 | ham                   | ‚îÉ | 1.227556 | pogba                   | ‚îÉ | 16                 | 0.845399 | ‚îÉ | plays              | 0.862848 |
| 1.174342 | finish                | ‚îÉ | 1.219523 | today                   | ‚îÉ | steelers           | 0.846241 | ‚îÉ | cap                | 0.863694 |
| 1.17416  | Manchester City       | ‚îÉ | 1.217903 | sterling                | ‚îÉ | rams               | 0.846504 | ‚îÉ | cardinals          | 0.863955 |
| 1.173772 | stream                | ‚îÉ | 1.215974 | nbc                     | ‚îÉ | wr                 | 0.847066 | ‚îÉ | coach              | 0.866043 |
| 1.168381 | nbcsn                 | ‚îÉ | 1.215363 | salah                   | ‚îÉ | williams           | 0.847163 | ‚îÉ | contract           | 0.869072 |
| 1.167137 | title                 | ‚îÉ | 1.214193 | world                   | ‚îÉ | bears              | 0.84778  | ‚îÉ | lions              | 0.870203 |
| 1.166319 | watford               | ‚îÉ | 1.209385 | Tottenham Hotspur       | ‚îÉ | rules              | 0.84848  | ‚îÉ | hall               | 0.87065  |
| 1.165636 | bruyne                | ‚îÉ | 1.206445 | bissaka                 | ‚îÉ | 12                 | 0.853288 | ‚îÉ | decided            | 0.872295 |
| 1.164696 | fantasy               | ‚îÉ | 1.206409 | pitch                   | ‚îÉ | night              | 0.853389 | ‚îÉ | picks              | 0.872544 |
| 1.162007 | signings              | ‚îÉ | 1.204813 | thanks                  | ‚îÉ | packers            | 0.854285 | ‚îÉ | calls              | 0.872669 |
| 1.161203 | Leeds United          | ‚îÉ | 1.203586 | soccer                  | ‚îÉ | football           | 0.855693 | ‚îÉ | sb                 | 0.872748 |
| 1.15957  | podcasts              | ‚îÉ | 1.20307  | streams                 | ‚îÉ | numbers            | 0.856211 | ‚îÉ | smith              | 0.873245 |
| 1.158943 | ozil                  | ‚îÉ | 1.201184 | x200b                   | ‚îÉ | defense            | 0.856393 | ‚îÉ | pats               | 0.873918 |
| 1.158394 | subtitles             | ‚îÉ | 1.199226 | relegation              | ‚îÉ | chargers           | 0.857366 | ‚îÉ | te                 | 0.875945 |

## Random Forest

### Setup

In [60]:
rfc = RandomForestClassifier()

In [61]:
params = {
    'n_estimators': [100, 600],
    'max_depth': [5, 40],
    'max_features': [500, 3000]
}

### Gridsearch and Fit

In [62]:
rfc_gs = GridSearchCV(rfc, params, cv=3, verbose=1, n_jobs=-2)

In [63]:
rfc_gs.fit(X_train_mod, y_train_mod)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  24 out of  24 | elapsed: 21.8min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-2,
             param_grid={'max_depth': [5, 40], 'max_features': [500, 3000],
                         'n_estimators': [100, 600]},
             verbose=1)

In [28]:
# checks
# X_train_mod.shape, y_train_mod.shape

### Results

In [143]:
rfc_gs.best_params_

{'max_depth': 40, 'max_features': 500, 'n_estimators': 600}

In [66]:
rfc_gs.best_score_

0.9880849929519719

### Confusion Matrix

In [162]:
random_forest_preds = rfc_gs.predict(X_test_mod)

tn, fp, fn, tp = confusion_matrix(y_test_mod, random_forest_preds).ravel()

plot_confusion_matrix(rfc_gs, X_test_mod, y_test_mod, values_format='d', cmap='Greens');

  elif np.all([l not in y_true for l in labels]):


ValueError: At least one label specified must be in y_true

In [None]:
print(f'  True Neg: {tn}    False Pos: {fp}')
print(f' False Neg: {fn}       True Pos: {tp}')

In [None]:
d = {'Column Name':X_train_all.columns,'Feature Importance':pipe.best_estimator_.named_steps['rfc'].feature_importances_}
fi = pd.DataFrame(d)
#fi.head()
rfc_fi = fi.sort_values(by=['Feature Importance'], ascending = False).head(50)