# Tuning the SVM classifier

Hyperparameter sweep for the **support vector machine classifier** used to produce results under various **degrees of dataset imbalance**. The hyperparameters are optimized for the worst-case imbalance configuration. The optimization is conducted by means of the tree-structured Parzen estimator.

## Import useful packages

In [None]:
# Generic packages
import numpy as np
import pandas as pd
import copy

In [None]:
# scikit-learn for machine learning tasks
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, roc_curve, auc, f1_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC

In [None]:
# NLTK for natural language processing
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# HyperOpt for Bayesian optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import sys

In [None]:
# Custom helper-functions script (supplied to Colab manually)
import utils as uu

## Load and process data

In [None]:
# Name of the CSV file holding the training set, then read it into a DataFrame
train_set_file = "train_set_imb_0.csv"
train_set = pd.read_csv(filepath_or_buffer=train_set_file)

In [None]:
# Cast every entry of the text column to str, then run the custom
# preprocessing helper on it
train_set['text'] = train_set['text'].apply(str).apply(uu.preprocess_text)

In [None]:
# Separate texts from labels; deep copies are taken so the original
# dataset is not modified accidentally
texts_train, labels_train = (
    copy.deepcopy(train_set[column]) for column in ('text', 'is_about_cc')
)

## Define hyperparameter search space and objective function

In [None]:
# Search space for the hyperparameters:
#   max_features - integer, log-uniform between 1e3 and 1e5 (quantized to 1)
#   c_param      - float, log-uniform between 1 and 1e3
#   vectorizer   - categorical choice between 'tfidf' and 'count'
hyperparam_space = dict(
    max_features=scope.int(
        hp.qloguniform('max_features', np.log(1e3), np.log(1e5), q=1)
    ),
    c_param=hp.loguniform('c_param', np.log(1), np.log(1e3)),
    vectorizer=hp.choice('vectorizer', ['tfidf', 'count']),
)

In [None]:
# Define the objective function for optimization
def objective(params):
    """Train and score one SVM configuration for a single optimization trial.

    The module-level ``texts_train`` / ``labels_train`` are split into
    training and validation subsets (stratified, fixed random seed), the
    texts are vectorized, a linear-kernel SVC with balanced class weights is
    fitted, and the F1 score on the validation subset is computed.

    Args:
        params: dict with the keys ``'max_features'`` (vocabulary size limit
            for the vectorizer), ``'c_param'`` (SVC regularization parameter
            ``C``) and ``'vectorizer'`` (``'tfidf'`` or ``'count'``).

    Returns:
        dict with ``'loss'`` set to the negated F1 score (the optimizer
        minimizes the loss) and ``'status'`` set to ``STATUS_OK``.

    Raises:
        SystemExit: if ``params['vectorizer']`` is not a supported name.
    """
    print('Params testing: ', params)

    # Split dataset into training and validation sets; the fixed seed makes
    # every trial use the same split, so trials are directly comparable
    inp_train, inp_valid, lbl_train, lbl_valid = train_test_split(
        texts_train,
        labels_train,
        test_size=0.15,
        random_state=12345,
        stratify=labels_train,
    )

    # Compute class weights to compensate for the imbalance in classes
    classes = np.unique(lbl_train)
    class_weights = class_weight.compute_class_weight(
        class_weight="balanced", classes=classes, y=lbl_train
    )
    class_weights = dict(zip(classes, class_weights))

    # Set vectorizer
    if params['vectorizer'] == 'tfidf':
        vectorizer = TfidfVectorizer(
            max_features=params['max_features'],
            lowercase=True,
            analyzer='word',
            dtype=np.float32,
        )
    elif params['vectorizer'] == 'count':
        # Apply the sampled vocabulary limit here as well; a bare
        # CountVectorizer() would silently ignore 'max_features' and make
        # all 'count' trials differ only in C
        vectorizer = CountVectorizer(
            max_features=params['max_features'],
            lowercase=True,
            analyzer='word',
        )
    else:
        sys.exit("Unsupported vectorizer!")

    # Fit the vectorizer on the training texts only, then transform both sets
    vectorizer.fit(inp_train)
    features_train = vectorizer.transform(inp_train)
    features_valid = vectorizer.transform(inp_valid)

    # Define the model
    model = SVC(kernel='linear', C=params['c_param'], gamma='auto', class_weight=class_weights)

    # Fit the model
    model.fit(features_train, lbl_train)

    # Compute performance of the trial.
    # NOTE(review): predict() already returns class labels; the 0.5 threshold
    # presumably assumes labels encoded as 0/1 - confirm against the dataset
    pred_scores = (model.predict(features_valid) > 0.5).astype("int32")
    f1 = f1_score(lbl_valid, pred_scores)
    print('DONE!      F1 = ' + str(f1))
    sys.stdout.flush()

    # Negate the score because the optimizer minimizes the returned loss
    return {'loss': -f1, 'status': STATUS_OK}


## Run Bayesian optimization with the tree-structured Parzen estimator

In [None]:
# Record of all evaluated trials, filled in by the optimizer
trials = Trials()

# Minimize the objective over the search space with the TPE algorithm
best = fmin(
    fn=objective,
    space=hyperparam_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)
print('Best hyperparams: ', best)

Params testing: 
{'c_param': 982.7619957471281, 'max_features': 1609, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.6065573770491803
Params testing: 
{'c_param': 130.75159230815453, 'max_features': 1971, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.576271186440678
Params testing: 
{'c_param': 86.3958885700212, 'max_features': 16385, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.6470588235294118
Params testing: 
{'c_param': 2.107326861843191, 'max_features': 5221, 'vectorizer': 'count'}
DONE!      F1 = 0.6799999999999999
Params testing: 
{'c_param': 408.3040682389376, 'max_features': 60024, 'vectorizer': 'count'}
DONE!      F1 = 0.6799999999999999
Params testing: 
{'c_param': 46.38355710652067, 'max_features': 34388, 'vectorizer': 'tfidf'}
DONE!      F1 = 0.6470588235294118
Params testing: 
{'c_param': 13.517291389782756, 'max_features': 2945, 'vectorizer': 'count'}
DONE!      F1 = 0.6799999999999999
Params testing: 
{'c_param': 24.18851575133561, 'max_features': 3297, 'vectorizer': 'count'}
DONE!   