# Tuning the KNN classifier

Hyperparameter sweep for the **$k$-nearest neighbors classifier**, used to produce results for various **degrees of dataset imbalance**. The hyperparameters are optimized for the worst-case imbalance configuration, and the optimization is conducted by means of the tree-structured Parzen estimator.

## Import useful packages

In [84]:
# Generic packages
import numpy as np
import pandas as pd
import copy
import sys

In [85]:
# Scikit-learn for machine learning tasks
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, roc_curve, auc, f1_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [86]:
# NLTK for natural language processing
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [87]:
# HyperOpt for Bayesian optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [88]:
# Custom helper-functions script (supplied to Colab manually)
import utils as uu

## Load and process data

In [89]:
# Read the training set from its CSV file into a DataFrame
train_set_file = "train_set_imb_0.csv"
train_set = pd.read_csv(filepath_or_buffer=train_set_file)

In [90]:
# Preprocess texts: cast every entry to a string first, then run the
# custom text-preprocessing helper on the result
train_set['text'] = train_set['text'].apply(str).apply(uu.preprocess_text)

In [91]:
# Separate texts from labels; each column is deep-copied so that later
# changes cannot accidentally modify the original dataset
texts_train, labels_train = (
    copy.deepcopy(train_set[column]) for column in ('text', 'is_about_cc')
)

## Define hyperparameter search space and objective function

In [92]:
# Search space for the hyperparameters (one entry per tunable quantity)
hyperparam_space = dict(
    # Integer-valued, log-uniform between 1e1 and 1e3
    leaf_size=scope.int(hp.qloguniform('leaf_size', np.log(1e1), np.log(1e3), q=1)),
    # Integer-valued, uniform between 3 and 150
    n_neighbors=scope.int(hp.quniform('n_neighbors', 3, 150, q=1)),
    # Integer-valued, log-uniform between 5e4 and 5e5
    max_features=scope.int(hp.qloguniform('max_features', np.log(5e4), np.log(5e5), q=1)),
    # Categorical choices
    vectorizer=hp.choice('vectorizer', ['tfidf', 'count']),
    weights=hp.choice('weights', ['uniform', 'distance']),
)

In [93]:
# Define the objective function for optimization
def objective(params):
    """Train and score one KNN configuration and return its loss.

    The module-level ``texts_train`` / ``labels_train`` are split into
    training and validation parts with a fixed seed, the texts are
    vectorized, a k-nearest-neighbors classifier is fitted, and the F1
    score on the validation part is computed.

    Args:
        params: dict of sampled hyperparameters with the keys
            'vectorizer' ('tfidf' or 'count'), 'max_features',
            'n_neighbors', 'leaf_size' and 'weights'.

    Returns:
        dict with 'loss' set to the negated validation F1 score (so that
        minimizing the loss maximizes F1) and 'status' set to STATUS_OK.
    """
    print('Params testing: ', params)

    # Split the dataset into training and validation sets (stratified,
    # fixed seed so that every trial is scored on the same split)
    inp_train, inp_valid, lbl_train, lbl_valid = train_test_split(
        texts_train,
        labels_train,
        test_size=0.15,
        random_state=12345,
        stratify=labels_train,
    )

    # NOTE: KNeighborsClassifier takes no class-weight argument, so no
    # class weights are computed here.

    # Set vectorizer; both variants honor the sampled vocabulary size
    if params['vectorizer'] == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=params['max_features'], lowercase=True, analyzer='word', dtype=np.float32)
    elif params['vectorizer'] == 'count':
        vectorizer = CountVectorizer(max_features=params['max_features'])
    else:
        sys.exit("Unsupported vectorizer!")

    # Fit the vectorizer on the training texts only, then transform both parts
    vectorizer.fit(inp_train)
    features_train = vectorizer.transform(inp_train)
    features_valid = vectorizer.transform(inp_valid)

    # Define the model; the sampled value k is mapped to 2k+1, which is
    # always odd (presumably to avoid tied votes between two classes)
    model = KNeighborsClassifier(
        n_neighbors=2 * params['n_neighbors'] + 1,
        leaf_size=params['leaf_size'],
        weights=params['weights'],
        algorithm='auto',
    )

    # Fit the model
    model.fit(features_train, lbl_train)

    # Compute performance of the trial.
    # NOTE(review): the 0.5 threshold assumes numeric 0/1 labels - confirm.
    pred_labels = (model.predict(features_valid) > 0.5).astype("int32")
    f1 = f1_score(lbl_valid, pred_labels)
    print('DONE!      F1 = ' + str(f1))
    sys.stdout.flush()
    return {'loss': -f1, 'status': STATUS_OK}


## Run Bayesian optimization with the Tree-structured Parzen Estimator

In [94]:
# Run the TPE search; the Trials object is handed to fmin to record the run
trials = Trials()
# max_evals is an evaluation count, so it is given as an integer
best = fmin(objective, hyperparam_space, algo=tpe.suggest, max_evals=1000, trials=trials)
# NOTE(review): for hp.choice parameters, hyperopt is expected to report the
# index of the selected option rather than the option itself - confirm before
# reading 'vectorizer' / 'weights' from this output.
print('Best hyperparams: ', best)

Params testing: 
{'leaf_size': 156, 'max_features': 149724, 'n_neighbors': 5, 'p': 2, 'vectorizer': 'count', 'weights': 'distance'}
DONE!      F1 = 0.03571428571428572
Params testing: 
{'leaf_size': 370, 'max_features': 169042, 'n_neighbors': 3, 'p': 2, 'vectorizer': 'count', 'weights': 'uniform'}
DONE!      F1 = 0.03571428571428572
Params testing: 
{'leaf_size': 135, 'max_features': 256031, 'n_neighbors': 3, 'p': 2, 'vectorizer': 'count', 'weights': 'uniform'}
DONE!      F1 = 0.03571428571428572
Params testing: 
{'leaf_size': 31, 'max_features': 66019, 'n_neighbors': 2, 'p': 2, 'vectorizer': 'tfidf', 'weights': 'uniform'}
DONE!      F1 = 0.19354838709677416
Params testing: 
{'leaf_size': 27, 'max_features': 209938, 'n_neighbors': 4, 'p': 2, 'vectorizer': 'tfidf', 'weights': 'uniform'}
DONE!      F1 = 0.25396825396825395
Params testing: 
{'leaf_size': 97, 'max_features': 361035, 'n_neighbors': 3, 'p': 2, 'vectorizer': 'tfidf', 'weights': 'distance'}
DONE!      F1 = 0.30303030303030304
