## Imports

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import utils

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.options.display.max_colwidth = 400

## Load Data

In [2]:
# Read the tweets CSV from the sibling data directory into a DataFrame.
tweets = pd.read_csv(filepath_or_buffer='../data/p_tweets.csv')

In [3]:
# Sanity check: (n_rows, n_columns) of the loaded frame.
tweets.shape

(32832, 4)

In [4]:
# Missing-value count per column (isna is the canonical name for isnull).
tweets.isna().sum()

datetime       0
p_text         0
hashtag        0
get_vaccine    0
dtype: int64

## Train/Test Split

In [13]:
# Features come from the `p_text` column, the target from `get_vaccine`.
X, y = tweets['p_text'], tweets['get_vaccine']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Baseline Accuracies

In [7]:
# Class proportions for the full target, then the train and test partitions.
for labels in (y, y_train, y_test):
    print(labels.value_counts(normalize=True))

0    0.509442
1    0.490558
Name: get_vaccine, dtype: float64
0    0.509423
1    0.490577
Name: get_vaccine, dtype: float64
0    0.509517
1    0.490483
Name: get_vaccine, dtype: float64


Baseline accuracy is about 0.51: always predicting the majority class (0) would be correct for roughly 51% of tweets.

## TF-IDF Vectorizer 

In [8]:
class LemmaTokenizer:
    """
    Callable tokenizer for use as TfidfVectorizer's `tokenizer` argument.

    Splits text with NLTK's `word_tokenize` and lemmatizes every token with
    a `WordNetLemmatizer` (called without a part-of-speech argument).
    """

    def __init__(self):
        # One lemmatizer instance is created up front and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the text in `articles`."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))

In [9]:
# This cell rebinds the name `stopwords` from the NLTK corpus reader to a plain
# list (the vectorizer cell consumes that list). Reading the corpus through an
# alias keeps the cell re-runnable: calling `stopwords.words(...)` on the
# already-rebound list would raise AttributeError on a second execution.
from nltk.corpus import stopwords as nltk_stopwords

# Extra tokens to drop on top of NLTK's English list.
# NOTE(review): these look like the fragments the tokenizer/lemmatizer produces
# from contractions and common words (e.g. "n't", "'ll", 'wa', 'ha') — confirm.
new_stop_words = ['amp', "'d", "'ll", "'re", "'s", "'ve", 'could', 'doe', 'ha', 'might', 'must', "n't", 'need', 'sha', 'wa', 'wo', 'would']

# Build a fresh list each run so repeated executions never append duplicates.
stopwords = list(nltk_stopwords.words('english')) + new_stop_words

In [14]:
# Word-level TF-IDF features: text is lowercased, tokenized and lemmatized by
# LemmaTokenizer, and the extended stop-word list is removed.
tvec = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    preprocessor=None,
    stop_words=stopwords,
    tokenizer=LemmaTokenizer(),
)

# Fit on the training text only, then apply the fitted vectorizer to the test
# text. NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices
# here, so re-run the train/test split cell before re-running this one.
X_train = tvec.fit_transform(X_train)
X_test = tvec.transform(X_test)

## Model Selection

In [22]:
# Candidate classifiers, keyed by the name used as the row label in the
# results table. Insertion order is the order the models are run in.
classifiers = dict(
    logreg=LogisticRegression(max_iter=1000, random_state=42),
    svc=SVC(random_state=42),
    random_forests=RandomForestClassifier(random_state=42),
    multinomialNB=MultinomialNB(),
    knearestneighbors=KNeighborsClassifier(),
    adaboost=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=42),
    xgboost=XGBClassifier(),
)

In [35]:
def run_models(model,
               X_train, y_train, X_test, y_test,
               verbose=True):
    """
    Fit a single classifier and score it on the train and test sets.

    For 2 classes, with the positive class labelled 1.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training data.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    verbose : bool, default True
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    dict
        Keys, in order: ``train_accuracy``, ``test_accuracy``, ``variance``
        (train accuracy minus test accuracy, i.e. the overfitting gap),
        ``test_precision``, ``test_recall``, ``test_specificity``, ``test_f1``.
        A metric whose denominator is zero is reported as 0.
    """
    import numpy as np

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Count negatives directly rather than unpacking
    # confusion_matrix(...).ravel(): that matrix collapses to 1x1 when only one
    # class appears across y_test and the predictions, and the 4-way unpack
    # then raises ValueError.
    actual_negative = np.asarray(y_test) != 1
    predicted_negative = np.asarray(y_pred_test) != 1
    n_negative = actual_negative.sum()
    n_true_negative = (actual_negative & predicted_negative).sum()

    results = {}
    results['train_accuracy'] = accuracy_score(y_train, y_pred_train)
    results['test_accuracy'] = accuracy_score(y_test, y_pred_test)
    results['variance'] = results['train_accuracy'] - results['test_accuracy']
    results['test_precision'] = precision_score(y_test, y_pred_test, pos_label=1, zero_division=0)
    # Same pos_label / zero_division handling as precision and F1.
    results['test_recall'] = recall_score(y_test, y_pred_test, pos_label=1, zero_division=0)
    # Specificity = TN / (TN + FP); report 0 when there are no actual negatives,
    # mirroring zero_division=0 above.
    results['test_specificity'] = n_true_negative / n_negative if n_negative else 0.0
    results['test_f1'] = f1_score(y_test, y_pred_test, pos_label=1, zero_division=0)

    return results

In [36]:
def model_results(models, X_train, y_train, X_test, y_test, verbose=False):
    """
    Score every classifier in `models` and tabulate the metrics.

    `models` maps a display name to an estimator. Each estimator is passed to
    `run_models` with the given train/test data, and the metric dicts it
    returns become the rows of a DataFrame indexed by model name. When
    `verbose` is true, a line is printed before each model is run.
    For 2 classes.
    """
    scores_by_model = {}

    for label in models:
        estimator = models[label]

        if verbose:
            print('\nRunning {} - {}'.format(label, estimator))

        scores_by_model[label] = run_models(
            estimator, X_train, y_train, X_test, y_test, verbose=False
        )

    return pd.DataFrame.from_dict(scores_by_model, orient='index')


In [37]:
model_results(classifiers,
              X_train,
              y_train,
              X_test,
              y_test
             )

Unnamed: 0,train_accuracy,test_accuracy,variance,test_precision,test_recall,test_specificity,test_f1
logreg,0.845193,0.783158,0.062035,0.806133,0.734554,0.829946,0.768681
svc,0.95534,0.786204,0.169136,0.815123,0.729587,0.840705,0.769987
random_forests,0.987283,0.761687,0.225596,0.764199,0.743558,0.779139,0.753737
multinomialNB,0.842566,0.776001,0.066565,0.793034,0.735175,0.815302,0.76301
knearestneighbors,0.752065,0.599056,0.15301,0.583051,0.640795,0.558876,0.610561
adaboost,0.987283,0.693315,0.293968,0.659529,0.774604,0.615063,0.71245
xgboost,0.813288,0.754987,0.058301,0.790765,0.680534,0.826659,0.73152


For business purposes, it is more helpful to optimize for correctly classifying the "yes_vax" population, who are the potential users or customers. This comes down to deciding how to trade off recall against precision for that class.

Recall vs precision tradeoff