In [2]:
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc

from utils.pickle import load_pickles

In [3]:
# Unpack the six objects returned by load_pickles(): inputs (x) and targets (y)
# for the train, validation and test partitions, in that order.
# NOTE(review): pickle files execute code on load — only use trusted artifacts.
train_x, valid_x, test_x, train_y, valid_y, test_y = load_pickles()

### Vectorize features

In [4]:
# TF-IDF over 1- to 3-grams, case preserved, vocabulary capped at the
# 20,000 most frequent terms.
word_vectorizer = TfidfVectorizer(
    lowercase=False,
    ngram_range=(1, 3),
    max_features=20000,
)
# Learn the vocabulary / IDF weights from the training split only, and
# vectorize it in the same pass (fit_transform avoids analysing the training
# corpus twice, which fit() followed by transform() would do).
train_features = word_vectorizer.fit_transform(train_x)
# The validation split is projected with the already-fitted vectorizer so no
# validation statistics leak into the features.
validation_features = word_vectorizer.transform(valid_x)

### Create estimators to ensemble

In [11]:
# One seed shared by every stochastic component so that a fresh
# "Restart & Run All" reproduces the same scores.
seed = 7
# KFold only consumes random_state when shuffle=True; recent scikit-learn
# releases raise ValueError if random_state is set while shuffle is False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# create the sub models
# Each (name, estimator) pair is one member of the voting ensemble.
estimators = [
    # random_state only matters for solvers that shuffle the data; it is
    # passed anyway so the configuration is seeded uniformly.
    ('logistic', LogisticRegression(random_state=seed)),
    ('cart', DecisionTreeClassifier(random_state=seed)),
    # probability=True makes SVC expose predict_proba (needed for soft
    # voting); the internal calibration is randomized, hence the seed.
    ('svm', SVC(probability=True, random_state=seed)),
]

### Train and evaluate the soft-voting ensemble for each label

In [12]:
# ROC AUC of the soft-voting ensemble, one entry per label column, measured on
# the validation split and on the training split.
val_aucs = []
train_aucs = []
for i in range(6):
    # Column i of the target matrix is treated as its own binary problem and
    # gets a freshly constructed ensemble.
    classifier = VotingClassifier(estimators=estimators, voting='soft').fit(
        train_features, train_y[:, i]
    )

    # predict_proba column 1 is the score for the second class
    # (presumably the positive label — confirm targets are 0/1).
    valid_y_hat = classifier.predict_proba(validation_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=valid_y[:, i], y_score=valid_y_hat)
    val_aucs.append(auc(fpr, tpr))

    # Same metric on the data the model was fitted on, for comparison with
    # the validation score.
    train_y_hat = classifier.predict_proba(train_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=train_y[:, i], y_score=train_y_hat)
    train_aucs.append(auc(fpr, tpr))



In [14]:
# Unweighted average of the per-label validation ROC AUCs.
np.asarray(val_aucs).mean()

0.9750659269869545

### Voting ensemble with more optimized hyperparameters

In [17]:
# One seed shared by every stochastic component so that a fresh
# "Restart & Run All" reproduces the same scores.
seed = 7
# KFold only consumes random_state when shuffle=True; recent scikit-learn
# releases raise ValueError if random_state is set while shuffle is False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# create the sub models
estimators = [
    # NOTE(review): penalty='l2' is scikit-learn's default, so this member is
    # configured the same as a plain LogisticRegression().
    ('logistic', LogisticRegression(penalty='l2', random_state=seed)),
    # Constrained tree: depth and leaf count capped at 80, and a node is only
    # split when it holds at least 30% of the samples (float = fraction).
    ('cart', DecisionTreeClassifier(
        max_depth=80,
        min_samples_split=0.3,
        max_leaf_nodes=80,
        random_state=seed,
    )),
    # probability=True makes SVC expose predict_proba (needed for soft
    # voting); the internal calibration is randomized, hence the seed.
    ('svm', SVC(probability=True, random_state=seed)),
]

# ROC AUC per label column for this configuration, on validation and training
# data.
val_aucs2 = []
train_aucs2 = []
for i in range(6):
    # Column i of the target matrix is treated as its own binary problem.
    classifier = VotingClassifier(estimators, voting='soft')
    classifier = classifier.fit(train_features, train_y[:, i])

    # predict_proba column 1 is the score for the second class
    # (presumably the positive label — confirm targets are 0/1).
    valid_y_hat = classifier.predict_proba(validation_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(valid_y[:, i], valid_y_hat)
    val_aucs2.append(auc(fpr, tpr))

    # Training-set score, for comparison with the validation score.
    train_y_hat = classifier.predict_proba(train_features)[:, 1]
    fpr, tpr, thresholds = roc_curve(train_y[:, i], train_y_hat)
    train_aucs2.append(auc(fpr, tpr))



In [18]:
# Unweighted average of the per-label validation ROC AUCs for the second
# ensemble configuration.
np.asarray(val_aucs2).mean()

0.9758543717969387