In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

import warnings
warnings.filterwarnings('ignore')

In [1]:
# Relative path to the CSV of per-post jargon counts used throughout this notebook.
filename = "../data/vectorized_jargon.csv"

In [46]:
# Load the dataset at `filename` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filename)

In [47]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,claps,text,A/B testing,accuracy,action,activation function,active learning,AdaGrad,agent,agglomerative clustering,...,user matrix,validation,validation set,vanishing gradient problem,Wasserstein loss,weight,Weighted Alternating Least Squares (WALS),wide model,width,total_jargon_count
0,398,python builtin database — here’s use utilize b...,0,0,2,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,11
1,203,build deploy dashboard python google sheets vu...,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
2,360,magic python context managers getting started ...,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45
3,88,generating image segmentation masks — easy way...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,10
4,301,top 5 machine learning certifications opinion ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46


### Modeling Attempt I: Claps as Target

#### Remove Columns Not Needed For Modeling

In [29]:
# Drop the raw text column, which is not used as a model feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['text'], errors='ignore')

#### Transform the Target into a Binary Variable

In [30]:
# 'High engagement' cutoff: one standard deviation above the mean clap count.
clap_counts = df['claps']
thresh = clap_counts.mean() + clap_counts.std()
thresh

195.012762167029

In [31]:
# Number of posts whose clap count is at or above the cutoff.
df['claps'].ge(thresh).sum()

1168

In [32]:
# Binary labels as a plain list: 1 when the post meets the cutoff, otherwise 0.
claps = (df['claps'] >= thresh).astype(int).tolist()

In [33]:
# Overwrite the raw clap counts with the 0/1 labels, matched row-for-row to df.
df['claps'] = pd.Series(claps, index=df.index)

In [34]:
# Share of each class in the binary target.
claps_class_share = df['claps'].value_counts(normalize=True)
claps_class_share

0    0.872252
1    0.127748
Name: claps, dtype: float64

#### Classification Modeling

In [35]:
# Total number of missing cells across the whole frame (expected to be 0).
df.isna().to_numpy().sum()

0

In [36]:
# Target is the binary claps label; every remaining column is a feature.
# NOTE(review): if the CSV was saved with its index, an 'Unnamed: 0' column
# would be included in X here - confirm against df.columns.
y = df['claps']
X = df.drop(columns=['claps'])

In [37]:
# 80/20 train-test split, stratified on the target, with a fixed seed.
split_kwargs = dict(test_size=.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

#### Run Through Classifiers

In [38]:
# Candidate classifiers, all with library-default hyperparameters.
# random_state is pinned on every estimator that draws random numbers so the
# scores in the results table are reproducible across kernel restarts.
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    BaggingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    SVC(),
    BernoulliNB()
]

results_list = []

for model in model_list:
    #fit the (already instantiated) model on the training split
    model.fit(X_train, y_train)

    #predict once per split and reuse the predictions for every metric
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    #scores and evaluation metrics for this model
    results_dict = {
        'model_name': str(model),
        #for classifiers, accuracy on the predictions is what model.score() reports
        'train_score': accuracy_score(y_train, train_preds),
        'test_score': accuracy_score(y_test, test_preds),
        #cross_val_score refits clones of the model, so the fit above is untouched
        'cv_score': cross_val_score(model, X_train, y_train, cv=5).mean(),
        #zero_division=0: a model that never predicts the positive class scores 0
        #explicitly instead of relying on a (globally silenced) warning
        'precision_score': precision_score(y_test, test_preds, zero_division=0),
        'train_f1_score': f1_score(y_train, train_preds, zero_division=0),
        'test_f1_score': f1_score(y_test, test_preds, zero_division=0),
    }

    #add the dictionary to the list
    results_list.append(results_dict)

In [39]:
# One row per model, one column per recorded metric.
results = pd.DataFrame(data=results_list)
results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),0.872983,0.869328,0.869428,0.142857,0.033299,0.008299
1,DecisionTreeClassifier(),0.998769,0.791689,0.789993,0.164384,0.995159,0.15894
2,BaggingClassifier(),0.974706,0.870421,0.864642,0.411765,0.890338,0.055777
3,RandomForestClassifier(),0.998633,0.874248,0.872573,1.0,0.994618,0.033613
4,AdaBoostClassifier(),0.874214,0.870421,0.868609,0.285714,0.053498,0.016598
5,SVC(),0.87271,0.872061,0.8723,0.0,0.006403,0.0
6,BernoulliNB(),0.854389,0.839256,0.848784,0.134146,0.136253,0.06962


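None of these models is meaningfully better than the majority-class baseline (about 87% accuracy) at predicting whether a post will receive a high number of claps (at least one standard deviation above the mean); every test F1 score is below 0.16.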

### Modeling Attempt II: Log of Claps

In [48]:
# Reload the dataset from disk so this attempt starts from the unmodified columns.
df = pd.read_csv(filepath_or_buffer=filename)

#### Transform Claps into Log Claps

In [49]:
# Keep only posts with a positive clap count, then take the natural log.
# Filtering first avoids computing log(0) = -inf (and the RuntimeWarning that
# comes with it); .copy() makes df an independent frame so the column
# assignments and drops in later cells do not operate on a slice of the
# original frame.
has_positive_claps = df['claps'] > 0
df = df.loc[has_positive_claps].copy()
df['log_claps'] = np.log(df['claps'])

#### Remove Columns Not Needed For Modeling

In [50]:
# Drop the raw clap counts (replaced by log_claps) and the raw text column.
# Reassigning (instead of inplace=True) returns a fresh frame rather than
# mutating a filtered slice, and errors='ignore' makes a re-run of this cell a
# no-op rather than a KeyError.
df = df.drop(columns=['claps', 'text'], errors='ignore')

#### Transform the Target into a Binary Variable

In [51]:
# 'High engagement' cutoff: one standard deviation above the mean of log claps.
log_clap_values = df['log_claps']
thresh = log_clap_values.mean() + log_clap_values.std()
thresh

5.105927127864417

In [52]:
# Number of posts whose log clap count is at or above the cutoff.
df['log_claps'].ge(thresh).sum()

1433

In [53]:
# Binary labels as a plain list: 1 when the post meets the cutoff, otherwise 0.
log_claps = (df['log_claps'] >= thresh).astype(int).tolist()

In [55]:
# Overwrite the log values with the 0/1 labels, matched row-for-row to df.
df['log_claps'] = pd.Series(log_claps, index=df.index)

In [56]:
# Share of each class in the binary target.
log_claps_class_share = df['log_claps'].value_counts(normalize=True)
log_claps_class_share

0    0.84225
1    0.15775
Name: log_claps, dtype: float64

#### Classification Modeling

In [57]:
# Total number of missing cells across the whole frame (expected to be 0).
df.isna().to_numpy().sum()

0

In [58]:
# Target is the binary log_claps label; every remaining column is a feature.
# NOTE(review): if the CSV was saved with its index, an 'Unnamed: 0' column
# would be included in X here - confirm against df.columns.
y = df['log_claps']
X = df.drop(columns=['log_claps'])

In [59]:
# 80/20 train-test split, stratified on the target, with a fixed seed.
split_kwargs = dict(test_size=.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

#### Run Through Classifiers

In [60]:
# Candidate classifiers, all with library-default hyperparameters.
# random_state is pinned on every estimator that draws random numbers so the
# scores in the results table are reproducible across kernel restarts.
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    BaggingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    SVC(),
    BernoulliNB()
]

results_list = []

for model in model_list:
    #fit the (already instantiated) model on the training split
    model.fit(X_train, y_train)

    #predict once per split and reuse the predictions for every metric
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    #scores and evaluation metrics for this model
    results_dict = {
        'model_name': str(model),
        #for classifiers, accuracy on the predictions is what model.score() reports
        'train_score': accuracy_score(y_train, train_preds),
        'test_score': accuracy_score(y_test, test_preds),
        #cross_val_score refits clones of the model, so the fit above is untouched
        'cv_score': cross_val_score(model, X_train, y_train, cv=5).mean(),
        #zero_division=0: a model that never predicts the positive class scores 0
        #explicitly instead of relying on a (globally silenced) warning
        'precision_score': precision_score(y_test, test_preds, zero_division=0),
        'train_f1_score': f1_score(y_train, train_preds, zero_division=0),
        'test_f1_score': f1_score(y_test, test_preds, zero_division=0),
    }

    #add the dictionary to the list
    results_list.append(results_dict)

In [61]:
# One row per model, one column per recorded metric.
results = pd.DataFrame(data=results_list)
results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),0.842851,0.842598,0.838448,0.6,0.041946,0.020548
1,DecisionTreeClassifier(),0.998624,0.759494,0.743911,0.1875,0.995618,0.170778
2,BaggingClassifier(),0.973854,0.832691,0.834594,0.225806,0.909782,0.044025
3,RandomForestClassifier(),0.998624,0.841497,0.842026,0.333333,0.995626,0.006897
4,AdaBoostClassifier(),0.84409,0.840947,0.837897,0.416667,0.051883,0.033445
5,SVC(),0.842576,0.842047,0.842301,0.0,0.003484,0.0
6,BernoulliNB(),0.832943,0.819483,0.821797,0.211268,0.108664,0.083799


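None of these models is meaningfully better than the majority-class baseline (about 84% accuracy) at predicting whether a post's log clap count will be high (at least one standard deviation above the mean of log claps); every test F1 score is below 0.18.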