In [1]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc

from utils.pickle import load_pickles

In [2]:
# Unpack the six objects returned by the project helper: inputs for the
# train/validation/test splits followed by the matching label sets.
# NOTE(review): presumably these are pre-split, pickled datasets — confirm
# against utils.pickle.load_pickles.
train_x, valid_x, test_x, train_y, valid_y, test_y = load_pickles()

### Vectorize features

In [3]:
# TF-IDF over 1- to 3-grams, keeping the original casing and capping the
# vocabulary at 20,000 features.
word_vectorizer = TfidfVectorizer(
    lowercase=False,
    ngram_range=(1, 3),
    max_features=20000,
)

# fit_transform learns the vocabulary / IDF weights and encodes the training
# texts in a single pass, instead of tokenizing the training corpus twice
# (once in fit, once in transform). The resulting matrix is the same.
train_features = word_vectorizer.fit_transform(train_x)

# The validation split is only transformed with the already-fitted vectorizer,
# so it never influences the learned vocabulary.
validation_features = word_vectorizer.transform(valid_x)

### Voting ensemble with more optimized hyperparameters

In [8]:
# Fit one soft-voting ensemble per label column and record its ROC AUC on the
# validation and training splits.
seed = 7
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError when random_state is set on an
# unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

val_aucs = []
train_aucs = []
# LogisticRegression settings, one dict per label column (entry i is used for
# column i of train_y / valid_y).
LR_hyper_params = [
    {'C': 2.7825594022071245, 'max_iter': 8, 'penalty': 'l2'},
    {'C': 1.0, 'max_iter': 20, 'penalty': 'l1'},
    {'C': 1.0, 'max_iter': 10, 'penalty': 'l1'},
    {'C': 1.0, 'max_iter': 40, 'penalty': 'l1'},
    {'C': 464.15888336127773, 'max_iter': 8, 'penalty': 'l2'},
    {'C': 1.0, 'max_iter': 20, 'penalty': 'l2'},
]

# Iterate over the parameter list itself rather than a hard-coded range(6), so
# the loop stays in sync if a label (and its parameter dict) is added/removed.
for i, lr_params in enumerate(LR_hyper_params):
    estimators = [
        # liblinear supports both the 'l1' and 'l2' penalties listed above; the
        # lbfgs default of newer scikit-learn releases rejects 'l1'.
        ('logistic', LogisticRegression(solver='liblinear', random_state=seed, **lr_params)),
        # The tree and the SVC probability calibration are both randomized, so
        # they get the seed as well to make the reported AUCs reproducible.
        ('cart', DecisionTreeClassifier(max_depth=80, min_samples_split=0.3,
                                        max_leaf_nodes=80, random_state=seed)),
        ('svm', SVC(probability=True, gamma='auto', random_state=seed))
    ]
    # Soft voting averages predicted probabilities; the logistic model counts
    # twice as much as each of the other two estimators.
    classifier = VotingClassifier(estimators, voting='soft', weights=[2, 1, 1])
    classifier = classifier.fit(train_features, train_y[:, i])

    # Column 1 of predict_proba is the probability of the second class
    # (presumably the positive label — assumes binary 0/1 targets).
    valid_y_hat = classifier.predict_proba(validation_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(valid_y[:, i], valid_y_hat)
    val_aucs.append(auc(fpr, tpr))

    # Training AUC is tracked alongside to expose over-fitting.
    train_y_hat = classifier.predict_proba(train_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(train_y[:, i], train_y_hat)
    train_aucs.append(auc(fpr, tpr))



In [9]:
# Report the AUC averaged over all label columns for each split.
mean_val_auc = np.mean(val_aucs)
mean_train_auc = np.mean(train_aucs)
print("Final Validation AUC with optimized hyperparameters: {}".format(mean_val_auc))
print("Final Training AUC with optimized hyperparameters: {}".format(mean_train_auc))

Final Validation AUC with optimized hyperparameters: 0.9780675616595312
Final Training AUC with optimized hyperparameters: 0.9879593529235565


In [None]:
# Final evaluation: refit one soft-voting ensemble per label column on the
# training split and score it on the held-out test split.
train_features = word_vectorizer.transform(train_x)
test_features = word_vectorizer.transform(test_x)
seed = 7
# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise ValueError when random_state is set on an
# unshuffled KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Test scores get their own list: storing them in `val_aucs` would silently
# overwrite the validation results and mislabel them in any later report.
test_aucs = []
train_aucs = []
# LogisticRegression settings, one dict per label column (entry i is used for
# column i of train_y / test_y).
LR_hyper_params = [
    {'C': 2.7825594022071245, 'max_iter': 8, 'penalty': 'l2'},
    {'C': 1.0, 'max_iter': 20, 'penalty': 'l1'},
    {'C': 1.0, 'max_iter': 10, 'penalty': 'l1'},
    {'C': 1.0, 'max_iter': 40, 'penalty': 'l1'},
    {'C': 464.15888336127773, 'max_iter': 8, 'penalty': 'l2'},
    {'C': 1.0, 'max_iter': 20, 'penalty': 'l2'},
]

# Iterate over the parameter list itself rather than a hard-coded range(6), so
# the loop stays in sync if a label (and its parameter dict) is added/removed.
for i, lr_params in enumerate(LR_hyper_params):
    estimators = [
        # liblinear supports both the 'l1' and 'l2' penalties listed above; the
        # lbfgs default of newer scikit-learn releases rejects 'l1'.
        ('logistic', LogisticRegression(solver='liblinear', random_state=seed, **lr_params)),
        # The tree and the SVC probability calibration are both randomized, so
        # they get the seed as well to make the reported AUCs reproducible.
        ('cart', DecisionTreeClassifier(max_depth=80, min_samples_split=0.3,
                                        max_leaf_nodes=80, random_state=seed)),
        ('svm', SVC(probability=True, gamma='auto', random_state=seed))
    ]
    # Soft voting averages predicted probabilities; the logistic model counts
    # twice as much as each of the other two estimators.
    classifier = VotingClassifier(estimators, voting='soft', weights=[2, 1, 1])
    classifier = classifier.fit(train_features, train_y[:, i])

    # Column 1 of predict_proba is the probability of the second class
    # (presumably the positive label — assumes binary 0/1 targets).
    test_y_hat = classifier.predict_proba(test_features)[:, 1]
    # test_y is read through `.values` here, unlike train_y which is indexed
    # directly — NOTE(review): the two label sets appear to have different
    # container types; confirm against load_pickles.
    fpr, tpr, thresholds = roc_curve(test_y.values[:, i], test_y_hat)
    test_aucs.append(auc(fpr, tpr))

    train_y_hat = classifier.predict_proba(train_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(train_y[:, i], train_y_hat)
    train_aucs.append(auc(fpr, tpr))

# Report the AUC averaged over all label columns so the cell's result is
# visible without relying on another cell.
print("Final Test AUC with optimized hyperparameters: {}".format(np.mean(test_aucs)))
print("Final Training AUC with optimized hyperparameters: {}".format(np.mean(train_aucs)))

