In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from scipy.sparse import vstack

from utils.pickle import load_pickles

In [2]:
# Unpack the six objects returned by load_pickles(): the x and y parts of the
# train, validation and test splits, in that order.
train_x, valid_x, test_x, train_y, valid_y, test_y = load_pickles()

### Vectorize features

In [3]:
# TF-IDF over 1- to 3-grams, vocabulary capped at 20,000 terms; the original
# casing of the text is preserved (lowercase=False).
word_vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3), max_features=20000)

# The vocabulary and IDF weights are learned from the training split only, so
# nothing from the validation split leaks into the features. fit_transform
# learns and applies them in one pass instead of tokenizing train_x twice;
# word_vectorizer is left fitted for the transform call below.
train_features = word_vectorizer.fit_transform(train_x)
validation_features = word_vectorizer.transform(valid_x)

### Grid search to find optimal hyperparameters

In [None]:
# One binary SVM is tuned per label column; `classes` is the number of columns
# of train_y / valid_y that are looped over.
classes = 6
gamma = [0.001, 0.01, 0.1, 1]
kernel = ['linear', 'rbf']
C = np.logspace(0, 4, 10)

# `gamma` is ignored by the linear kernel, so it is only searched for the other
# kernels. A list of sub-grids avoids refitting identical linear models once per
# gamma value (50 candidates instead of 80 per label).
hyperparameters = [
    dict(C=C, kernel=[k]) if k == 'linear' else dict(C=C, gamma=gamma, kernel=[k])
    for k in kernel
]

# probability=True is needed for predict_proba below. Its calibration uses an
# internal shuffled cross-validation, so the seed is fixed for reproducibility.
model = SVC(probability=True, random_state=0)

# Single predefined split: -1 keeps a row in the training part of every split,
# 0 puts it in the (only) test fold. Train rows come first in X, validation
# rows second, matching the vstack / concatenate order below.
ps = PredefinedSplit(test_fold=[-1]*train_features.shape[0] + [0]*validation_features.shape[0])

val_aucs = []
train_aucs = []
best_params = []

X = vstack((train_features, validation_features))
Y = np.concatenate((train_y, valid_y))

for i in range(classes):
    classifier = GridSearchCV(model, hyperparameters, scoring='roc_auc', cv=ps, verbose=0)
    classifier = classifier.fit(X, Y[:, i])

    # GridSearchCV refits the winning configuration on all of X, i.e. on train
    # AND validation rows. Scoring that refit model on validation_features would
    # measure performance on data it was trained on. For honest train/validation
    # AUCs, fit a model with the same hyperparameters on the training rows only.
    eval_model = SVC(**{**model.get_params(), **classifier.best_params_})
    eval_model.fit(train_features, train_y[:, i])

    # Column 1 of predict_proba is the probability of the second entry of
    # classes_ (the positive label when the labels are 0/1).
    valid_y_hat = eval_model.predict_proba(validation_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(valid_y[:, i], valid_y_hat)
    val_aucs.append(auc(fpr, tpr))

    train_y_hat = eval_model.predict_proba(train_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(train_y[:, i], train_y_hat)
    train_aucs.append(auc(fpr, tpr))

    best_params.append(classifier.best_params_)