# Summary of this notebook

## Imports

In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier



#Import functions and preprocessors from previous notebook
from processing_functions import url_preprocessor, stem_processor

## Data Import

In [2]:
df_all = pd.read_csv('../data/combined.csv', index_col='id')

#Since Pandas converts empty strings to NaN's, we need to fill these in again
df_all.fillna('', inplace=True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2325 entries, 107hfj5 to 10b33q8
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        2325 non-null   object 
 1   text         2325 non-null   object 
 2   utc          2325 non-null   float64
 3   subreddit    2325 non-null   object 
 4   title_words  2325 non-null   int64  
 5   text_words   2325 non-null   int64  
 6   title_chars  2325 non-null   int64  
 7   text_chars   2325 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 163.5+ KB


## Add a column that combines title and body text

In [3]:
df_all['post'] = df_all[['title','text']].apply((lambda x : ' '.join(x)), axis=1)
df_all.head(2)

Unnamed: 0_level_0,title,text,utc,subreddit,title_words,text_words,title_chars,text_chars,post
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
107hfj5,minature series,"Over 2022, I wrote about 30 little pieces for ...",1673279000.0,Composers,2,128,15,657,"minature series Over 2022, I wrote about 30 li..."
107hah0,How can I get my music published?,"I tried being ""my own"" publisher on ASCAP but ...",1673278000.0,Composers,8,142,33,613,How can I get my music published? I tried bein...


## Train/Test Split

We reserve 20% of our data in the dataframe `val_df`.  The remaining 80% of the data will be used to train our models; we will store this data in the dataframe `df`.  We will not use the `val_df` data until the validation stage at the end of the modeling process.

In [4]:
df, val_df = train_test_split(df_all,
                               random_state=123,
                               stratify=df_all['subreddit'])
df['subreddit'].value_counts()

Producers    930
Composers    813
Name: subreddit, dtype: int64

## Baseline accuracy

In [5]:
df['subreddit'].value_counts(normalize=True)

Producers    0.533563
Composers    0.466437
Name: subreddit, dtype: float64

So if we used a null model that just guessed the most common class ("Producers") in all cases, our accuracy score would be about 53.3%.

# Training models on title + body text

We'll begin by training models on both the titles and body texts of posts, but no other features.  Later we will try training the models on other sets of features, such as using just the titles instead of titles+body texts, and also including word counts.

In [6]:
X = df['post']
y = df['subreddit']

We will train the following types of classification models: Naive Bayes, Logistic Regression, $k$-Nearest Neighbors (kNN), Random Forest, and Gradient Boosted Decision Trees.  For each type of model, we will perform a hyperparameter grid search to find the model that achieves the highest cross-val Accuracy score.  Based on the results of previous grid searches, we'll adapt the hyperparameters over which we're grid searching.  Finally, we'll go back to any models that performed well and more finely tune their hyperparameters.

## Naive Bayes Classifier

In [7]:
nb_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'))),
    ('nb', MultinomialNB())
])

In [8]:
nb_pipe_params = {
    'cvec__preprocessor': [url_preprocessor, ( lambda x : stem_processor(url_preprocessor(x)) )],
    'cvec__max_features': list(range(1000,5001,1000)),
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.5, .7, 0.9],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

In [9]:
nb_gs = GridSearchCV(nb_pipe, param_grid = nb_pipe_params, n_jobs=-1)

In [10]:
nb_gs.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
                     

In [11]:
#Examine best hyperparameters but don't print the whole list of stopwords
nb_gs.best_params_

{'cvec__max_df': 0.5,
 'cvec__max_features': 4000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': <function processing_functions.url_preprocessor(text)>}

In [12]:
nb_gs.best_score_

0.9357375753384053

This is a good crossval score!

## Logistic Regression

In [13]:
lr_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'))),
    ('ss',StandardScaler(with_mean=False)),
    ('lr', LogisticRegression())
])

In [14]:
lr_pipe_params = {
    'lr__C' : np.logspace(-2,2,12),
    
    'cvec__preprocessor': [url_preprocessor, ( lambda x : stem_processor(url_preprocessor(x)) )],
    'cvec__max_features': list(range(1000,5001,1000)),
    'cvec__min_df': [2],
    'cvec__max_df': [.5, .7, 0.9],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

In [15]:
lr_gs = GridSearchCV(lr_pipe, lr_pipe_params, n_jobs=-1)

In [16]:
lr_gs.fit(X,y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
                     

In [17]:
lr_gs.best_params_

{'cvec__max_df': 0.5,
 'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': <function processing_functions.url_preprocessor(text)>,
 'lr__C': 0.01}

In [18]:
lr_gs.best_score_

0.9093419622566941

This crossval score is still pretty good, but not as good as the Naive Bayes classifier.

### What we've learned so far

It seems that `CountVectorizer`'s parameters `min_df` being 2 works better than it being 3.  We'll try 1 as well (below).  Also, it seems that an `ngram_range` of `(1,1)` - i.e., the default of considering only single words - is generally best.  So for now, we'll save some computation time and set these parameters to these values.  Similarly, adding stemming (on top of the URL preprocessor) didn't seem to help in any case so far, so we'll stop using it.

Finally, `max_df` seems to be too large to be relevant (perhaps because no word appears in more than 50% of posts): both of the above gridsearches found the optimal `max_df` to be .5.  Any value smaller than this certainly defeats the point of having a `max_df`, so we'll drop this parameter.  

# kNN

In [19]:
knn_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

In [20]:
knn_pipe_params = {
    'knn__n_neighbors':[3,5,7,9],
    'knn__weights':['uniform','distance'],
    
    'cvec__max_features': list(range(1000,5001,1000)),
    'cvec__min_df': [1,2],
}

In [21]:
knn_gs = GridSearchCV(knn_pipe, param_grid = knn_pipe_params, n_jobs=-1)

In [22]:
knn_gs.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [23]:
knn_gs.best_params_

{'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'knn__n_neighbors': 3,
 'knn__weights': 'uniform'}

In [24]:
knn_gs.best_score_

0.763648190231532

So again we find that `min_df=2` is best.  So next time, we'll drop `min_df=3` from consideration, but we'll also try `min_df=.01` (i.e., keep only those words that appear in at least 1% of posts).

This score is not nearly as good as the other models, so we won't bother with a more refined search.

# Random Forest

In [63]:
rf_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('rf', RandomForestClassifier(random_state=42))
])

In [64]:
rf_pipe_params = {
    'rf__n_estimators':[50,100,150],
    'rf__max_depth':[None,5],
    'rf__min_samples_split':[2,5,10],
    'rf__ccp_alpha':[0,.01,.1],
    
    'cvec__max_features': list(range(1000,5001,1000)),
    'cvec__min_df': [2,.01],
}

In [65]:
rf_gs = GridSearchCV(rf_pipe, param_grid = rf_pipe_params, n_jobs=-1)

In [66]:
rf_gs.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [67]:
rf_gs.best_params_

{'cvec__max_features': 2000,
 'cvec__min_df': 2,
 'rf__ccp_alpha': 0,
 'rf__max_depth': None,
 'rf__min_samples_split': 10,
 'rf__n_estimators': 150}

In [68]:
rf_gs.best_score_

0.9225290649804038

Again, this is a pretty good score.  And again we find that `min_df=2` seems to be working best.

# Gradient Boosted Decision Trees

In [69]:
boost_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('ss', StandardScaler(with_mean=False)),
    ('boost', GradientBoostingClassifier(random_state=42))
])

In [71]:
boost_pipe_params = {
    'boost__n_estimators': list(range(40,161,20)),
    'boost__max_depth': [3,4],

    
    'cvec__max_features': [1000,2000,3000,4000,5000],
    'cvec__min_df': [2],
}

In [72]:
boost_gs = GridSearchCV(boost_pipe, param_grid = boost_pipe_params, n_jobs=-1)

In [73]:
boost_gs.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [74]:
boost_gs.best_params_

{'boost__max_depth': 4,
 'boost__n_estimators': 160,
 'cvec__max_features': 1000,
 'cvec__min_df': 2}

In [75]:
boost_gs.best_score_

0.9087721898363139

# Refining the models

We'll refine the models in the reverse order from the order in which we originally trained them.

### Gradient Boosted Decision Trees

In [77]:
#Adjusting learning rate before further refining
boost_pipe_params0 = {
    'boost__n_estimators': [50,100,150,200],
    'boost__max_depth': [3,4,5],
    'boost__learning_rate': np.logspace(-3,-1,8),
    
    'cvec__max_features': [500,1000,1500,2000],
    'cvec__min_df': [2],
}

In [78]:
boost_gs0 = GridSearchCV(boost_pipe, param_grid = boost_pipe_params0, n_jobs=-1)

In [79]:
boost_gs0.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [80]:
boost_gs0.best_params_

{'boost__learning_rate': 0.1,
 'boost__max_depth': 5,
 'boost__n_estimators': 200,
 'cvec__max_features': 1000,
 'cvec__min_df': 2}

In [82]:
boost_gs0.best_score_

0.9139363699239207

This model did somewhat better than the last one, but playing with the `learning_rate` did not help.  We'll use this information to look for a more refined model:

#### Refining again

In [83]:
#Adjusting learning rate before further refining
boost_pipe_params1 = {
    'boost__n_estimators': list(range(150,221,10)),
    'boost__max_depth': [4,5,6,7],
    
    'cvec__max_features': [700,800,900,1000,1100,1200],
    'cvec__min_df': [2],
}

In [84]:
boost_gs1 = GridSearchCV(boost_pipe, param_grid = boost_pipe_params1, n_jobs=-1)

In [85]:
boost_gs1.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [86]:
boost_gs1.best_params_

{'boost__max_depth': 5,
 'boost__n_estimators': 220,
 'cvec__max_features': 1000,
 'cvec__min_df': 2}

In [87]:
boost_gs1.best_score_

0.9139380166650198

The improvement from the second round of refining was rather miniscule.

### Random Forest

Let's see if we can do any better.  We'll try adding back bigrams this time, as well as adding in `min_samples_leaf`.

In [38]:
rf_pipe_params1 = {
    'rf__n_estimators':[100, 150, 200, 250],
    'rf__max_depth':[None,7],
    'rf__min_samples_split':[5,10],
    'rf__min_samples_leaf':[1,2,5,10],
    
    'cvec__max_features': [2500,3000,3500],
    'cvec__min_df': [2],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [39]:
rf_gs1 = GridSearchCV(rf_pipe, param_grid = rf_pipe_params1, n_jobs=-1)

In [40]:
rf_gs1.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [41]:
rf_gs1.best_params_

{'cvec__max_features': 3500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'rf__max_depth': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 10,
 'rf__n_estimators': 150}

In [42]:
rf_gs1.best_score_

0.9225241247571058

This more refined search didn't improve the score all that much.

### Logistic Regression

In [44]:
lr_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('ss',StandardScaler(with_mean=False)),
    ('lr', LogisticRegression())
])

In [46]:
lr_pipe_params1 = {
    'lr__C' : np.logspace(-3,2,20),

    'cvec__max_features': [250, 500, 750, 1000, 1250, 1500, 1750, 2000],
    'cvec__min_df': [2],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

In [47]:
lr_gs1 = GridSearchCV(lr_pipe, lr_pipe_params1, n_jobs=-1)

In [48]:
lr_gs1.fit(X,y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [51]:
lr_gs1.best_params_

{'cvec__max_features': 1750,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'lr__C': 0.006158482110660267}

In [52]:
lr_gs1.best_score_

0.9110743338932252

Again, the more refined search did not improve the accuracy score much.

### Naive Bayes

In [54]:
nb_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('nb', MultinomialNB())
])

In [57]:
nb_pipe_params1 = {
    'nb__alpha': np.logspace(-2,2,9),
    
    'cvec__max_features': list(range(3250,4751,250)),
    'cvec__min_df': [2],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

In [58]:
nb_gs1 = GridSearchCV(nb_pipe, param_grid = nb_pipe_params1, n_jobs=-1)

In [59]:
nb_gs1.fit(X, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [60]:
#Examine best hyperparameters but don't print the whole list of stopwords
nb_gs1.best_params_

{'cvec__max_features': 3750,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'nb__alpha': 1.0}

In [61]:
nb_gs1.best_score_

0.9363106412409842

Again, the improvement is rather small.

# Summary of best models trained on titles + body text

So far, to avoid data leakage, we haven't touched the test data.  But to summarize our progress so far, we'll make a table of the crossval accuracy scores and test data accuracy scores of the best models so far.

In [98]:
names_dict = {'nb_gs1': 'Naive Bayes',
              'rf_gs1': 'Random Forest',
              'lr_gs1': 'Logistic Regression',
              'boost_gs1':'Gradient Boost'}

X_test = val_df['post']
y_test = val_df['subreddit']

results_dict = {}
for est in names_dict.keys():
    results_dict[names_dict[est]] = {
     'Crossval Score': locals()[est].best_score_,
     'Test Data Score': locals()[est].score(X_test, y_test)
    }
    
results_df = pd.DataFrame(results_dict).T
results_df.sort_values('Crossval Score', ascending=False)

Unnamed: 0,Crossval Score,Test Data Score
Naive Bayes,0.936311,0.917526
Random Forest,0.922524,0.914089
Gradient Boost,0.913938,0.908935
Logistic Regression,0.911074,0.90378


These models have performed rather well.  But would a stacked model, which combines their predictions, improve results even further?

## Stacked Model

In [101]:
names_dict.keys()

dict_keys(['nb_gs1', 'rf_gs1', 'lr_gs1', 'boost_gs1'])

In [100]:
locals()['nb_gs1'].best_estimator_

Pipeline(steps=[('cvec',
                 CountVectorizer(max_features=3750, min_df=2,
                                 preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('nb', MultinomialNB())])

In [103]:
print(list(names_dict.keys()))

['nb_gs1', 'rf_gs1', 'lr_gs1', 'boost_gs1']


In [106]:
#The following list comprehension won't work, as locals() is local to the most local scope (which is only the list comp here):
#level1_estimators = [ (name, locals()[name].best_estimator_) for name in names_dict.keys()]

level1_estimators = []
for name in names_dict.keys():
    level1_estimators.append((name, locals()[name].best_estimator_))

stacked_model = StackingClassifier(estimators=level1_estimators,
                                 final_estimator=LogisticRegression(penalty='none'))

In [109]:
cvs = cross_val_score(stacked_model, X, y)

In [111]:
stacked_model.fit(X,y)

StackingClassifier(estimators=[('nb_gs1',
                                Pipeline(steps=[('cvec',
                                                 CountVectorizer(max_features=3750,
                                                                 min_df=2,
                                                                 preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                                 stop_words=['i',
                                                                             'me',
                                                                             'my',
                                                                             'myself',
                                                                             'we',
                                                                             'our',
                                                                             'ours',
                              

In [114]:
results_dict['Stacked Model'] = {
     'Crossval Score': cvs.mean(),
     'Test Data Score': stacked_model.score(X_test, y_test)
    }

results_df = pd.DataFrame(results_dict).T
results_df.sort_values('Crossval Score', ascending=False)

Unnamed: 0,Crossval Score,Test Data Score
Stacked Model,0.940327,0.931271
Naive Bayes,0.936311,0.917526
Random Forest,0.922524,0.914089
Gradient Boost,0.913938,0.908935
Logistic Regression,0.911074,0.90378


# Training on Titles Only

For comparison, we'll train a Naive Bayes model on only the *titles* of the posts.

In [134]:
X_title = df['title']

In [135]:
nb_title_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor) ),
    ('nb', MultinomialNB())
])

In [136]:
nb_title_pipe_params = {
    'cvec__max_features': list(range(500,5001,500)),
    'cvec__min_df': [2, .1],
    'cvec__max_df': [.95, .9],
    'cvec__ngram_range': [(1, 1), (1, 2)]
}

In [137]:
nb_title_gs = GridSearchCV(nb_title_pipe, param_grid = nb_title_pipe_params, n_jobs=-1)

In [138]:
nb_title_gs.fit(X_title, y)

GridSearchCV(estimator=Pipeline(steps=[('cvec',
                                        CountVectorizer(preprocessor=<function url_preprocessor at 0x7fb4990e2af0>,
                                                        stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                          

In [139]:
#Examine best hyperparameters but don't print the whole list of stopwords
nb_title_gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 1500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1)}

In [140]:
#Cross-val score
nb_title_gs.best_score_

0.8468283766426243

In [141]:
#Test score
X_test_title = val_df['title']
nb_title_gs.score(X_test_title, y_test)

0.8642611683848798

# Including word counts

Do our models perform better if we give them the title and body text word counts as features?

In [130]:
X_words = df[['post','title_words','text_words']]
X_words_test = val_df[['post','title_words','text_words']]
X_words.head(2)

Unnamed: 0_level_0,post,title_words,text_words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
103osx2,halftime vs gross beat is there any audible di...,4,30
zoda66,"First Composition, what do you think ? Hi. I'm...",8,68


In [159]:
#Count vectorize only the 'post' column
counter = ColumnTransformer(
 transformers=[
     ('cvec', CountVectorizer(stop_words=stopwords.words('english'), preprocessor = url_preprocessor),  ['post']),
 ],
    remainder='passthrough'
)

In [152]:
nb_words_pipe = Pipeline([
    ('count', counter),
    ('nb', MultinomialNB())
])

In [153]:
nb_words_pipe_params = {
    'count__cvec__max_features': list(range(500,5001,500)),
    'count__cvec__min_df': [2, .1],
    #'count__cvec__max_df': [.95, .9],
    'count__cvec__ngram_range': [(1, 1), (1, 2)]
}

In [154]:
nb_words_gs = GridSearchCV(nb_words_pipe, param_grid = nb_words_pipe_params, n_jobs=-1)

In [155]:
nb_words_gs.fit(X_words, y)

ValueError: max_df corresponds to < documents than min_df

In [121]:
#Examine best hyperparameters but don't print the whole list of stopwords
nb_title_gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 1500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1)}

In [126]:
#Cross-val score
nb_title_gs.best_score_

0.8468283766426243

In [127]:
#Test score
X_test_title = val_df['title']
nb_title_gs.score(X_test_title, y_test)

0.8642611683848798