# Modeling: Bag of Words Classification Model
### *LogClaps as Target*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

import warnings
warnings.filterwarnings('ignore')

In [2]:
# nltk.download('stopwords')

In [4]:
filename =  'data/nlp_nltk_stemmed_preproc_log.csv'

In [5]:
df = pd.read_csv(filename).drop(columns = 'Unnamed: 0')

In [6]:
df.head()

Unnamed: 0,log_claps,text
0,6.907755,"['7', 'recommend', 'skill', 'learn', '2021', '..."
1,6.192362,"['ultim', 'guid', 'ace', 'code', 'interview', ..."
2,4.934474,"['shakespear', 'versu', 'eminem—', 'who’', 'be..."
3,5.068904,"['custom', 'segment', 'onlin', 'retail', 'deta..."
4,4.890349,"['implement', 'visualttransform', 'pytorch', '..."


#### Transform the Target into a Binary Variable

In [7]:
print(f"Num infinite: {len(df['log_claps']) - np.isfinite(df['log_claps']).sum()}")
print(f"Num null: {df['log_claps'].isnull().sum()}")
print(f'Shape is {df.shape}')

Num infinite: 62
Num null: 0
Shape is (9804, 2)


In [8]:
#remove inf and nan values

df = df[np.isfinite(df['log_claps'])]

In [9]:
#establish a threshold for 'high engagement' of mean plus one standard deviation
thresh = df['log_claps'].mean() + df['log_claps'].std()
thresh

5.258180875814285

In [10]:
#check how many posts meet that criterion
(df['log_claps'] >= thresh).sum()

1492

In [11]:
log_claps = [1 if clap >= thresh else 0 for clap in df['log_claps']]

In [12]:
np.isfinite(log_claps).sum()

9742

In [13]:
df['log_claps'] = log_claps

In [14]:
df['log_claps'].value_counts(normalize=True)

0    0.846849
1    0.153151
Name: log_claps, dtype: float64

#### Classification Modeling

In [15]:
df.isnull().sum()

log_claps    0
text         0
dtype: int64

In [16]:
X = df['text']
y = df['log_claps']

In [17]:
#train-test split
X_train, X_test, y_train, y_test=train_test_split(X,
                                                 y,
                                                 test_size=.2,
                                                  stratify=y,
                                                 random_state=42)

#### Count Vectorizer

In [18]:
cvec = CountVectorizer(min_df=5, max_df=.98, ngram_range=(1,2))

In [19]:
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [None]:
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    SVC(),
    BernoulliNB()
]

results_list = []

for model_obj in model_list:
    #instantiate each model 
    model = model_obj
   
    #fit the model
    model.fit(X_train_cvec, y_train) 
  
    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}    
    results_dict['model_name'] = str(model_obj)
    results_dict['train_score'] = model.score(X_train_cvec, y_train)
    results_dict['test_score'] = model.score(X_test_cvec, y_test)
    results_dict['cv_score'] = cross_val_score(model, X_train_cvec, y_train, cv = 5).mean()
    results_dict['precision_score'] = precision_score(y_test, model.predict(X_test_cvec))
    results_dict['train_f1_score'] = f1_score(y_train, model.predict(X_train_cvec))  
    results_dict['test_f1_score'] = f1_score(y_test, model.predict(X_test_cvec))  
    
    #add the dictionary to the list
    results_list.append(results_dict)

In [21]:
cvec_results = pd.DataFrame(results_list)
cvec_results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),1.0,0.830682,0.827408,0.423077,1.0,0.347826
1,DecisionTreeClassifier(),1.0,0.786044,0.783652,0.286738,1.0,0.277296
2,BaggingClassifier(),0.978314,0.855823,0.853073,0.616438,0.92384,0.242588
3,RandomForestClassifier(),1.0,0.85685,0.854869,0.827586,1.0,0.146789
4,AdaBoostClassifier(),0.864622,0.84197,0.840882,0.457627,0.33606,0.259615
5,SVC(),0.885153,0.851206,0.847556,0.9,0.400536,0.058442
6,BernoulliNB(),0.896574,0.812211,0.80662,0.339623,0.651685,0.282353


With a baseline accuracy of 84.6%, this model is not predictive.

#### TfidfVectorizer

In [22]:
tvec = TfidfVectorizer(min_df=5, max_df=.98, ngram_range=(1,2))

In [23]:
X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [24]:
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    SVC(),
    BernoulliNB()
]

results_list = []

for model_obj in model_list:
    #instantiate each model 
    model = model_obj
   
    #fit the model
    model.fit(X_train_tvec, y_train) 
  
    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}    
    results_dict['model_name'] = str(model_obj)
    results_dict['train_score'] = model.score(X_train_tvec, y_train)
    results_dict['test_score'] = model.score(X_test_tvec, y_test)
    results_dict['cv_score'] = cross_val_score(model, X_train_tvec, y_train, cv = 5).mean()
    results_dict['precision_score'] = precision_score(y_test, model.predict(X_test_tvec))
    results_dict['train_f1_score'] = f1_score(y_train, model.predict(X_train_tvec))  
    results_dict['test_f1_score'] = f1_score(y_test, model.predict(X_test_tvec))  
    
    #add the dictionary to the list
    results_list.append(results_dict)

In [25]:
tvec_results = pd.DataFrame(results_list)
tvec_results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),0.860516,0.849153,0.849609,0.583333,0.169595,0.086957
1,DecisionTreeClassifier(),1.0,0.779887,0.785834,0.276451,1.0,0.274112
2,BaggingClassifier(),0.977929,0.863007,0.854355,0.77193,0.922383,0.247887
3,RandomForestClassifier(),1.0,0.857363,0.855254,0.833333,1.0,0.152439
4,AdaBoostClassifier(),0.859489,0.84197,0.837931,0.45283,0.304762,0.237624
5,SVC(),0.926986,0.852745,0.85102,0.789474,0.687534,0.094637
6,BernoulliNB(),0.896574,0.812211,0.80662,0.339623,0.651685,0.282353


With a baseline accuracy of 84.6%, this model is not predictive.