# Tuning the NB classifier

Hyperparameter sweep for the **multinomial naive Bayes classifier**, which is used to produce results for various **degrees of dataset imbalance**. The hyperparameters are optimized for the worst-case imbalance configuration. The optimization is conducted by means of the tree-structured Parzen estimator (TPE).

## Import useful packages

In [None]:
# Generic packages
import numpy as np
import pandas as pd
import copy
import sys

In [None]:
# scikit-learn for machine learning tasks
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, roc_curve, auc, f1_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# NLTK for natural language processing
import nltk
# Download the NLTK data packages at runtime: the WordNet corpus, the
# stopword corpus, and the averaged-perceptron POS tagger model.
# NOTE(review): presumably these are required by uu.preprocess_text — confirm.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Stemming, lemmatization, and stopword utilities
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Hyperopt for Bayesian optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Custom helper-functions script (supplied to Colab manually)
import utils as uu

## Load and process data

In [None]:
# Read the training split from its CSV file into a DataFrame
train_set_file = "train_set_imb_0.csv"
train_set = pd.read_csv(filepath_or_buffer=train_set_file)

In [None]:
# Cast every entry of the text column to str, then run the custom
# preprocessing helper on each entry
train_set['text'] = train_set['text'].apply(str).apply(uu.preprocess_text)

In [None]:
# Pull the texts and the labels out of the DataFrame as independent deep
# copies, so that later steps cannot modify the original dataset by accident
texts_train, labels_train = (
    copy.deepcopy(train_set[column]) for column in ('text', 'is_about_cc')
)

## Define hyperparameter search space and objective function

In [None]:
# Search space from which hyperopt samples one configuration per trial
hyperparam_space = dict(
    # Smoothing parameter passed to MultinomialNB, uniform on [0.1, 0.7]
    alpha=hp.uniform('alpha', 0.1, 0.7),
    # Vocabulary size limit, integer-valued and log-uniform between 5e4 and 5e5
    max_features=scope.int(hp.qloguniform('max_features', np.log(5e4), np.log(5e5), q=1)),
    # Which text vectorizer the objective builds
    vectorizer=hp.choice('vectorizer', ['tfidf', 'count']),
    # Whether MultinomialNB learns class priors from the data
    fit_prior=hp.choice('fit_prior', [True, False]),
)

In [None]:
# Define the objective function for optimization
def objective(params):
    """Train and validate one multinomial naive Bayes configuration.

    Called by hyperopt's ``fmin`` once per trial.

    Parameters
    ----------
    params : dict
        One sample from the search space with the keys ``'alpha'``,
        ``'max_features'``, ``'vectorizer'`` (``'tfidf'`` or ``'count'``)
        and ``'fit_prior'``.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``, where ``f1`` is the F1 score
        on the validation split. The score is negated because ``fmin``
        minimizes the loss.

    Raises
    ------
    ValueError
        If ``params['vectorizer']`` is neither ``'tfidf'`` nor ``'count'``.
    """
    print('Params testing: ', params)

    # Split dataset into training and validation sets. The fixed random_state
    # gives every trial the same stratified split, so scores are comparable.
    inp_train, inp_valid, lbl_train, lbl_valid = train_test_split(texts_train, labels_train,
                                                                  test_size=0.15,
                                                                  random_state=12345,
                                                                  stratify=labels_train)

    # NOTE: no class re-weighting is applied. MultinomialNB has no
    # class_weight argument; class imbalance is addressed here only through
    # the tuned `fit_prior` and `alpha`. Re-weighting would have to be passed
    # to `model.fit` via `sample_weight`.

    # Set vectorizer. Both variants honor the sampled vocabulary size limit,
    # so `max_features` is an effective hyperparameter for either choice.
    vectorizer_kind = params['vectorizer']
    if vectorizer_kind == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=params['max_features'], lowercase=True, analyzer='word', dtype=np.float32)
    elif vectorizer_kind == 'count':
        vectorizer = CountVectorizer(max_features=params['max_features'], lowercase=True, analyzer='word')
    else:
        # Raise instead of sys.exit so a bad configuration surfaces as an
        # ordinary error rather than terminating the interpreter.
        raise ValueError("Unsupported vectorizer!")

    # Learn the vocabulary on the training texts only, then reuse it to
    # vectorize the validation texts
    features_train = vectorizer.fit_transform(inp_train)
    features_valid = vectorizer.transform(inp_valid)

    # Define the model
    model = MultinomialNB(alpha=params['alpha'], fit_prior=params['fit_prior'], class_prior=None)

    # Fit the model
    model.fit(features_train, lbl_train)

    # Compute performance of the trial. `predict` returns class labels (not
    # probabilities); the comparison only maps them to int32 0/1.
    # NOTE(review): assumes the labels are encoded as 0/1 — confirm.
    pred_labels = (model.predict(features_valid) > 0.5).astype("int32")
    f1 = f1_score(lbl_valid, pred_labels)
    print('DONE!      F1 = ' + str(f1))
    sys.stdout.flush()
    return {'loss': -f1, 'status': STATUS_OK}


## Run Bayesian optimization with the tree-structured Parzen estimator

In [None]:
# Imported here so the cell is self-contained
from hyperopt import space_eval

# Trials object records the result of every evaluated configuration
trials = Trials()
# max_evals is a number of trials, so pass an int rather than the float 1e3
best = fmin(objective, hyperparam_space, algo=tpe.suggest, max_evals=1000, trials=trials)
# fmin reports hp.choice parameters as indices into their option lists;
# space_eval maps the raw result back to the actual hyperparameter values
best_params = space_eval(hyperparam_space, best)
print('Best hyperparams: ', best_params)

Params testing: 
{'alpha': 0.21368187052301807, 'fit_prior': False, 'max_features': 76693, 'vectorizer': 'count'}
DONE!      F1 = 0.5063291139240507
Params testing: 
{'alpha': 0.6568232659715779, 'fit_prior': False, 'max_features': 494977, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.5510204081632653
Params testing: 
{'alpha': 0.6694769892855779, 'fit_prior': True, 'max_features': 63791, 'vectorizer': 'count'}
DONE!      F1 = 0.6122448979591837
Params testing: 
{'alpha': 0.3865344776474846, 'fit_prior': False, 'max_features': 68756, 'vectorizer': 'count'}
DONE!      F1 = 0.5270270270270271
Params testing: 
{'alpha': 0.5798302538595695, 'fit_prior': False, 'max_features': 280349, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.5631067961165047
Params testing: 
{'alpha': 0.49735690671659927, 'fit_prior': True, 'max_features': 374511, 'vectorizer': 'count'}
DONE!      F1 = 0.6481481481481481
Params testing: 
{'alpha': 0.16599202802888463, 'fit_prior': True, 'max_features': 104772, 'vectorizer': 'tfi