# Towards Data Science Blog Posts: Bag of Words Frequency Models
### Classification Analysis 
#### *Attempt to model the relationship between the words in the corpus and the target variable: claps.*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch NLTK's stopword corpus (reported as already up-to-date when present locally).
# NOTE(review): nothing visible in this notebook reads the stopword list — confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelinsler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Relative path to the pre-processed corpus CSV loaded in the next cell.
filename = "../data/nlp_nltk_stemmed_preproc.csv"

In [4]:
# Read the corpus, then discard the 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved).
df = pd.read_csv(filename)
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Peek at the first five rows: raw clap counts next to the text column.
df.head(n=5)

Unnamed: 0,claps,text
0,489,ultim guid ace code interview data scientist d...
1,139,shakespear versu eminem— who’ better lyricist ...
2,133,implement visualttransform pytorch hi guy happ...
3,92,stock price analysi panda altair practic guid ...
4,58,optim threshold imbalanc classif handson tutor...


#### Transform the Target into a Binary Variable

In [6]:
# One standard deviation above the mean clap count.
clap_stats = df['claps'].agg(['mean', 'std'])
clap_stats['mean'] + clap_stats['std']

208.91182371361316

In [7]:
# Binarize the target: 1 = high-clap post (claps >= threshold), 0 = everything else.
# The comparison is vectorized, so it aligns on df's own index rather than
# relying on a freshly built Series with a default RangeIndex.
# NOTE: this overwrites the raw clap counts in place, so the cell is not
# idempotent — re-running it without reloading `df` maps every row to 0.
# NOTE(review): the cutoff is hard-coded and does not equal the mean + 1 std
# (~208.9) computed in the previous cell — TODO confirm the intended threshold.
CLAP_THRESHOLD = 195
df['claps'] = (df['claps'] >= CLAP_THRESHOLD).astype('int64')

In [8]:
# Share of each class in the binarized target; the majority share is the
# accuracy a constant "always predict 0" model would achieve.
class_balance = df['claps'].value_counts(normalize=True)
class_balance

0    0.865231
1    0.134769
Name: claps, dtype: float64

#### Classification Modeling

In [9]:
# Re-inspect the first five rows now that `claps` holds the 0/1 label.
df.head(n=5)

Unnamed: 0,claps,text
0,1,ultim guid ace code interview data scientist d...
1,0,shakespear versu eminem— who’ better lyricist ...
2,0,implement visualttransform pytorch hi guy happ...
3,0,stock price analysi panda altair practic guid ...
4,0,optim threshold imbalanc classif handson tutor...


In [10]:
# Features are the text column; the target is the binary clap label.
X, y = df['text'], df['claps']

In [11]:
#train-test split
X_train, X_test, y_train, y_test=train_test_split(X,
                                                 y,
                                                 test_size=.2,
                                                  stratify=y,
                                                 random_state=42)

#### Count Vectorizer

In [12]:
# Raw term counts over unigrams and bigrams, skipping terms that occur in
# fewer than 5 documents or in more than 98% of documents.
cvec = CountVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.98,
)

In [13]:
# Learn the vocabulary from the training documents only, then encode the test
# documents with that same vocabulary (the fit must happen first).
X_train_cvec, X_test_cvec = (
    cvec.fit_transform(X_train),
    cvec.transform(X_test),
)

In [14]:
# Candidate classifiers, all with default hyper-parameters. The estimators that
# draw random numbers are seeded so the table below is reproducible across
# kernel restarts (the train/test split above is seeded with 42 as well).
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    BaggingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    SVC(),
    BernoulliNB()
]

results_list = []

for model in model_list:
    #fit the model on the count-vectorized training documents
    model.fit(X_train_cvec, y_train)

    #predict once per split and reuse the predictions for every metric
    train_preds = model.predict(X_train_cvec)
    test_preds = model.predict(X_test_cvec)

    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}
    results_dict['model_name'] = str(model)
    #accuracy on each split (the same value model.score() reports for a classifier)
    results_dict['train_score'] = accuracy_score(y_train, train_preds)
    results_dict['test_score'] = accuracy_score(y_test, test_preds)
    #mean 5-fold accuracy on the training split
    #NOTE(review): the folds share a vocabulary fitted on all of X_train, so the
    #validation folds are not fully unseen — wrap vectorizer + model in a Pipeline
    #if a stricter estimate is needed
    results_dict['cv_score'] = cross_val_score(model, X_train_cvec, y_train, cv = 5).mean()
    #precision is measured on the test split
    results_dict['precision_score'] = precision_score(y_test, test_preds)
    results_dict['train_f1_score'] = f1_score(y_train, train_preds)
    results_dict['test_f1_score'] = f1_score(y_test, test_preds)

    #add the dictionary to the list
    results_list.append(results_dict)

In [15]:
# One row per model, one column per metric, for the CountVectorizer features.
cvec_results = pd.DataFrame(data=results_list)
cvec_results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),1.0,0.844995,0.845935,0.371622,1.0,0.276382
1,DecisionTreeClassifier(),1.0,0.813778,0.80032,0.26699,1.0,0.241228
2,BaggingClassifier(),0.979952,0.872443,0.869753,0.644444,0.919677,0.19661
3,RandomForestClassifier(),1.0,0.875135,0.870426,0.846154,1.0,0.15942
4,AdaBoostClassifier(),0.876211,0.863832,0.8602,0.476923,0.272152,0.196825
5,SVC(),0.895721,0.865447,0.86585,0.5,0.369406,0.007937
6,BernoulliNB(),0.908907,0.842842,0.818757,0.354167,0.658253,0.258883


With a baseline accuracy of 86.5% (the share of the majority class), none of these models is meaningfully predictive. The Random Forest and Bagging models appear to do *slightly* better than the baseline on test accuracy, but their F1 scores on the testing data are so low that they are not worth pursuing.

#### TfidfVectorizer

In [16]:
# TF-IDF weights over unigrams and bigrams, with the same document-frequency
# cut-offs as the CountVectorizer above (at least 5 documents, at most 98%).
tvec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.98,
)

In [17]:
# Fit the vocabulary and IDF weights on the training documents only, then
# apply them unchanged to the test documents (the fit must happen first).
X_train_tvec, X_test_tvec = (
    tvec.fit_transform(X_train),
    tvec.transform(X_test),
)

In [18]:
# Same candidate classifiers as the CountVectorizer run, default hyper-parameters.
# The estimators that draw random numbers are seeded so the table below is
# reproducible across kernel restarts (the train/test split is seeded with 42 too).
# NOTE: BernoulliNB binarizes its input at 0 by default, so it only sees whether
# a term is present; its scores should therefore match the count-based run.
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    BaggingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    SVC(),
    BernoulliNB()
]

results_list = []

for model in model_list:
    #fit the model on the TF-IDF training matrix
    model.fit(X_train_tvec, y_train)

    #predict once per split and reuse the predictions for every metric
    train_preds = model.predict(X_train_tvec)
    test_preds = model.predict(X_test_tvec)

    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}
    results_dict['model_name'] = str(model)
    #accuracy on each split (the same value model.score() reports for a classifier)
    results_dict['train_score'] = accuracy_score(y_train, train_preds)
    results_dict['test_score'] = accuracy_score(y_test, test_preds)
    #mean 5-fold accuracy on the training split
    #NOTE(review): the folds share a vocabulary and IDF weights fitted on all of
    #X_train, so the validation folds are not fully unseen — wrap vectorizer +
    #model in a Pipeline if a stricter estimate is needed
    results_dict['cv_score'] = cross_val_score(model, X_train_tvec, y_train, cv = 5).mean()
    #precision is measured on the test split
    results_dict['precision_score'] = precision_score(y_test, test_preds)
    results_dict['train_f1_score'] = f1_score(y_train, train_preds)
    results_dict['test_f1_score'] = f1_score(y_test, test_preds)

    #add the dictionary to the list
    results_list.append(results_dict)

In [19]:
# One row per model, one column per metric, for the TF-IDF features.
tvec_results = pd.DataFrame(data=results_list)
tvec_results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),0.869887,0.8676,0.867061,0.75,0.069297,0.046512
1,DecisionTreeClassifier(),1.0,0.815931,0.806915,0.285047,1.0,0.262931
2,BaggingClassifier(),0.979279,0.877826,0.870156,0.828571,0.916757,0.203509
3,RandomForestClassifier(),1.0,0.876211,0.871233,0.884615,1.0,0.166667
4,AdaBoostClassifier(),0.87944,0.864909,0.860334,0.493671,0.310769,0.237082
5,SVC(),0.922632,0.869214,0.867196,0.818182,0.597621,0.068966
6,BernoulliNB(),0.908907,0.842842,0.818757,0.354167,0.658253,0.258883


The conclusion is the same for TF-IDF. With a baseline accuracy of 86.5%, none of these models is meaningfully predictive. The Random Forest and Bagging Classifiers appear to do *slightly* better than the baseline model, but their F1 scores on the testing data are so low that they are not worth pursuing.