## Data Modeling (05) : Naive Bayes

**Import Libraries**

In [1]:
# standards
import pandas as pd
import numpy as np
from pprint import pprint
from sara import eda, eda_unique

# modeling
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# viz
import seaborn as sns

# stop future warnings
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

**Read in Processed Data**

In [2]:
df_reddit_process = pd.read_csv('./data/reddit_processed.csv')
print(df_reddit_process.shape)

(110378, 8)


**Check for NaNs**

In [3]:
df_reddit_process['title'].isna().sum()

625

**Drop NaNs**

In [4]:
df_reddit_process.dropna(subset=['title'],inplace=True)

**Explore Balance of Subreddits**

In [5]:
df_reddit_process['subreddit'].value_counts()

1    93393
0    16360
Name: subreddit, dtype: int64

**Select X and Y Features**

In [6]:
X = df_reddit_process['title']
y = df_reddit_process['subreddit']

print('y nulls: ', y.isna().sum())
print('X nulls: ', X.isna().sum())

y nulls:  0
X nulls:  0


**Train Test Split**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

**NLP MODEL FUCNTION**

In [12]:
def nlp_model(X_train, X_test, y_train, y_test,
              transform, model, params):

    # STEP 1: Select Transform
    if transform == 'cvec':
        nlp_transform = 'cvec', CountVectorizer()
    if transform == 'tvec':
        nlp_transform = 'tvec', TfidfVectorizer()
    else:
        pass
    
    # STEP 2: Select Estimator
    if model == 'mnb':
        nlp_model = 'mnb', MultinomialNB()
    if model == 'gnb':
        nlp_model = 'gnb', GaussianNB()
    if model == 'bnb':
        nlp_model = 'bnb', BernuolliNB()
    else:
        pass
    
    # STEP 3: Build Pipeline
    pipe = Pipeline([
        (nlp_transform),
        (nlp_model)
    ])
    
    # STEP 4: Run Model
    gs = GridSearchCV(pipe,
                      param_grid=params,
                      n_jobs=-1,
                      cv=5)
    gs.fit(X_train,y_train)
    train_score = gs.score(X_train,y_train)
    test_score = gs.score(X_test,y_test)
    model = gs
    
    # STEP 5: Extract Parameters as Dictionary
    best_parameters = gs.best_estimator_.get_params()
    param_dict = {}
    for param_name in params.keys():
        new_param = {
            param_name : best_parameters[param_name],
        }
        param_dict.update(new_param)
    
    # STEP 6: Extract Coefficients as Dictionary
    if model == 'lr':
        if transform == 'cvec':
            coef_values = gs.best_estimator_.named_steps.lr.coef_  
            coef_keys = gs.best_estimator_.named_steps.cvec.get_feature_names()
            coef_dict = {k : v for (k,v) in zip(coef_keys,list(coef_values[0]))}
        if transform == 'tvec':
            coef_values = gs.best_estimator_.named_steps.lr.coef_  
            coef_keys = gs.best_estimator_.named_steps.tvec.get_feature_names()
            coef_dict = {k : v for (k,v) in zip(coef_keys,list(coef_values[0]))} 
    else:
        coef_dict = None
    
    # STEP 7: Score Dictionary
    score_values = [model,transform,
              train_score,test_score,param_dict,coef_dict]
    score_keys = ['Model','Transform',
            'Train Score','Test Score','Parameters','Coefficients']
    score_dict = dict(zip(score_keys,score_values))
    
    # STEP 8: Score DataFrame
    data = {'Model': [model], 'Transform': [transform],
            'Train Score': [train_score], 'Test Score': [test_score]}
    df_new_row = pd.DataFrame(data)

    # STEP 9: update scores csv with new model
    df_old_scores = pd.read_csv('./data/all_scores.csv')
    merge_scores = [df_old_scores, df_new_row]
    df_all_scores = pd.concat(merge_scores, axis=0)
    df_all_scores.to_csv('./data/all_scores.csv', index=False)
    
    # STEP 10: Return Score Dictionary
    return df_new_row, score_dict, model

## Run Models

**Instantiate Model Score List**

**Model 1**

In [13]:
# Params 1
params1 = {
    'cvec__max_features' : [1000,5000,10000],
    'cvec__ngram_range'  : [(1,1),(1,2),(1,3)]
}

# Model 1
model1, model1_dict, model1_self = nlp_model(X_train,X_test,y_train,y_test,'cvec','mnb',params1)

**Model 2**

In [14]:
# Params 2
params2 = {  
    'tvec__max_features' : [1000,5000,10000],
    'tvec__ngram_range'  : [(1,1),(1,2),(1,3)]
}

# Model 2
model2, model2_dict, model2_self = nlp_model(X_train,X_test,y_train,y_test,'tvec','mnb',params2)

In [15]:
model2

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",tvec,0.901827,0.896826


**Model 3**

In [16]:
# Params 3
params3 = {  
    'cvec__max_features' : [1000,5000,10000],
    'cvec__ngram_range'  : [(1,1),(1,2),(1,3)]
}

# Model 3
model3, model3_dict, model3_self = nlp_model(X_train,X_test,y_train,y_test,'cvec','mnb',params3)

**Model 4**

In [17]:
# Params 4
params4 = {  
    'tvec__max_features' : [1000,5000,10000],
    'tvec__ngram_range'  : [(1,1),(1,2),(1,3)]
}

# Model 4
model4, model4_dict, model4_self = nlp_model(X_train,X_test,y_train,y_test,'tvec','mnb',params4)

**Model 5**

In [18]:
# Params 5
params5 = {  
    'tvec__max_features' : [1000,5000,10000],
    'tvec__ngram_range'  : [(1,1),(1,2),(1,3)]
}

# Model 5
model5, model5_dict, model5_self = nlp_model(X_train,X_test,y_train,y_test,'tvec','mnb',params5)

**Make Scores List**

In [19]:
model1

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",cvec,0.906735,0.901454


In [20]:
model2

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",tvec,0.901827,0.896826


In [21]:
model3

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",cvec,0.906735,0.901454


In [22]:
model4

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",tvec,0.901827,0.896826


In [23]:
model5

Unnamed: 0,Model,Transform,Train Score,Test Score
0,"GridSearchCV(cv=5, error_score=nan,\n ...",tvec,0.901827,0.896826
