In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.pipeline import make_pipeline
import pickle
import xgboost as xgb
from sklearn.decomposition import PCA



In [2]:
# Load the pickled training set (presumably written by the previous notebook).
# The loaded object is used below through its `.data`, `.target` and
# `.feature_names` attributes.
# NOTE(review): unpickling runs arbitrary code -- only load trusted files.
with open('output/2/train.pickle', 'rb') as train_file:
    train = pickle.load(train_file)

In [None]:
# Sanity check: shape of the feature matrix, shape of the label vector and the
# mean of the labels (0.5 means balanced classes, assuming 0/1 labels).
print(train.data.shape, train.target.shape, train.target.mean())

(10444, 146) (10444,) 0.5


In [None]:
# Randomised hyper-parameter search over the PCA -> XGBoost pipeline.
# Keys follow the `<step name>__<parameter>` convention; `make_pipeline` names
# the classifier step 'xgbclassifier'.
search_space = {
    'xgbclassifier__max_depth': [2, 3, 4, 5],
    'xgbclassifier__gamma': [0, 1, 2, 3],
    'xgbclassifier__min_child_weight': [0, 1, 2, 3],
    'xgbclassifier__subsample': [0.5, 0.75, 1.0],
    'xgbclassifier__colsample_bytree': [0.25, 0.5, 0.75, 1.0],
    'xgbclassifier__colsample_bylevel': [0.5, 0.75, 1.0],
}
search_pipeline = make_pipeline(
    PCA(n_components=50),
    xgb.XGBClassifier(n_estimators=100),
)
tuner = RandomizedSearchCV(
    search_pipeline,
    search_space,
    scoring='roc_auc',
    n_iter=30,
    refit=False,  # only the best score/parameters are read afterwards
    verbose=5,
)
tuner.fit(train.data, train.target)

In [None]:
# Best cross-validated ROC AUC found by the search and the parameter
# combination that produced it (keys are pipeline names, 'xgbclassifier__...').
tuner.best_score_, tuner.best_params_

In [None]:
# Hold-out experiment: fit the tuned PCA -> XGBoost pipeline on one split and
# record the error on the training and validation splits per boosting round.
#
# `tuner.best_params_` is keyed by pipeline parameter names
# ('xgbclassifier__max_depth', ...), so it must be applied through
# `Pipeline.set_params`; unpacking it into the XGBClassifier constructor would
# not set the tuned hyper-parameters.
model = make_pipeline(PCA(n_components=50), xgb.XGBClassifier(n_estimators=100))
model.set_params(**tuner.best_params_)

# Fixed seed so the split (and everything derived from it) is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train.data, train.target, test_size=1000, random_state=0)

# `Pipeline.fit` only forwards keyword arguments named `<step>__<argument>` and
# never transforms an `eval_set`, whereas the classifier is trained on PCA
# scores. Therefore fit the PCA once, project both splits with that same
# projection, and fit the classifier step directly. `model` holds these very
# step objects, so `model.predict` / `model.predict_proba` work afterwards.
pca, classifier = (step for _, step in model.steps)
X_train_pca = pca.fit_transform(X_train)
eval_set = [(X_train_pca, y_train), (pca.transform(X_val), y_val)]
# NOTE(review): newer XGBoost releases expect `eval_metric` on the constructor
# rather than on `fit` -- adjust if the environment is upgraded.
classifier.fit(X_train_pca, y_train, eval_set=eval_set, eval_metric='error')

In [None]:
def plot_error(model):
    """Plot the classification error recorded during boosting.

    Parameters
    ----------
    model : fitted XGBoost estimator, or a pipeline whose last step is one
        The estimator must have been fitted with an ``eval_set`` and the
        'error' metric. The first evaluation set is drawn as the training
        curve; a second one, if present, as the validation curve.
    """
    # A Pipeline does not expose the attributes of its steps, so look the
    # evaluation history up on the final estimator when one is passed in.
    estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
    history = estimator.evals_result_

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(history['validation_0']['error'], c='r', label='Training error')
    if 'validation_1' in history:
        ax.plot(history['validation_1']['error'], c='g', label='Validation error')
    ax.set_xlabel('Number of trees')
    ax.set_ylabel('Error')
    ax.legend()
    ax.grid()

# Error curves of the hold-out model fitted above.
plot_error(model)

In [None]:
def plot_precision_recall(y_true, y_probas):
    """Draw the precision-recall curve of the positive class on the current axes.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_probas : 2-D array
        Class scores; column 1 is used as the score of the positive class.
    """
    positive_scores = y_probas[:, 1]
    precision, recall, _ = precision_recall_curve(y_true, positive_scores)

    ax = plt.gca()
    ax.plot(recall, precision)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.0])
    ax.grid()

def report(name, y_true, y_probas):
    """Print accuracy and a classification report, then show the PR curve.

    Parameters
    ----------
    name : str
        Label of the data set, used as a prefix of the accuracy line.
    y_true : array-like
        True labels.
    y_probas : 2-D array
        Per-class scores; the predicted label is the index of the highest
        score in each row.
    """
    y_pred = np.argmax(y_probas, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    headline = '{} accuracy: {:.3f}'.format(name, accuracy)
    print(headline)
    print(classification_report(y_true, y_pred))
    plot_precision_recall(y_true, y_probas)
    plt.show()

# Report the hold-out model on the split it was fitted on and on the
# validation split.
for split_name, split_X, split_y in (
        ('training', X_train, y_train),
        ('validation', X_val, y_val)):
    report(split_name, split_y, model.predict_proba(split_X))

In [None]:
# Importances (in percent) of the inputs the classifier was trained on.
# A Pipeline does not expose `feature_importances_`, so read it from the final
# step. Behind a pipeline the classifier sees the outputs of the preceding
# PCA step rather than the original columns, so the rows are labelled as
# principal components instead of with `train.feature_names`.
if hasattr(model, 'steps'):
    importance_source = model.steps[-1][1]
    importances = importance_source.feature_importances_ * 100
    labels = ['PC{}'.format(i + 1) for i in range(len(importances))]
else:
    importances = model.feature_importances_ * 100
    labels = list(train.feature_names)

feature_importances = pd.DataFrame(
    {'Feature': labels, 'Importance': importances},
    columns=['Feature', 'Importance'],
)
feature_importances.sort_values(by='Importance', ascending=False)

# Simple neural network

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline for comparison: a five-hidden-layer MLP on the same 50 PCA components.
# NOTE(review): per the scikit-learn docs `learning_rate` is only used by the
# 'sgd' solver and `validation_fraction` only when `early_stopping=True`; with
# the defaults used here both look like no-ops -- confirm the intent.
mlp = MLPClassifier(
    hidden_layer_sizes=[300, 200, 100, 50, 50],
    learning_rate='adaptive',
    validation_fraction=0.2,
    verbose=True,
)
nn = make_pipeline(PCA(n_components=50), mlp)
nn.fit(train.data, train.target)

In [None]:
# Training loss of the MLP per iteration. `nn` is a Pipeline, which does not
# expose the attributes of its steps, so `loss_curve_` is read from the final
# (MLPClassifier) step.
mlp_step = nn.steps[-1][1]

plt.figure(figsize=(10, 5))
plt.plot(mlp_step.loss_curve_, c='r', label='training')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.grid()

In [None]:
# Evaluated on the same data the network was fitted on, so these numbers
# describe how well it fits the training set, not how well it generalises.
report('train', train.target, nn.predict_proba(train.data))

# Final model

In [None]:
# Final model: the tuned PCA -> XGBoost pipeline fitted on the full training set.
#
# `tuner.best_params_` is keyed by pipeline parameter names
# ('xgbclassifier__max_depth', ...), so it must be applied through
# `Pipeline.set_params`; unpacking it into the XGBClassifier constructor would
# not set the tuned hyper-parameters.
model = make_pipeline(PCA(n_components=50), xgb.XGBClassifier(n_estimators=100))
model.set_params(**tuner.best_params_)

# `Pipeline.fit` only forwards keyword arguments named `<step>__<argument>` and
# never transforms an `eval_set`, whereas the classifier is trained on PCA
# scores. Fit the PCA once, project the data, and fit the classifier step
# directly; `model` holds these very step objects, so `model.predict` /
# `model.predict_proba` work afterwards.
pca, classifier = (step for _, step in model.steps)
train_pca = pca.fit_transform(train.data)
eval_set = [(train_pca, train.target)]
# NOTE(review): newer XGBoost releases expect `eval_metric` on the constructor
# rather than on `fit` -- adjust if the environment is upgraded.
classifier.fit(train_pca, train.target, eval_set=eval_set, eval_metric='error')

In [None]:
# Only a training curve is drawn here: the final model was fitted with a
# single evaluation set (the full training data).
plot_error(model)

In [None]:
# Evaluate the final model on the pickled test set named 'test_balanced'.
# NOTE(review): unpickling runs arbitrary code -- only load trusted files.
with open('output/2/test_balanced.pickle', 'rb') as test_file:
    test = pickle.load(test_file)
print(test.data.shape, test.target.shape, test.target.mean())
test_probas = model.predict_proba(test.data)
report('balanced test', test.target, test_probas)

In [None]:
# Re-weight the per-class accuracies: equally for the balanced figure, and by
# `ratio1` / `ratio0` for the "stratified" figure.
# NOTE(review): `ratio1` is presumably the share of class 1 in the unbalanced
# data -- confirm against the previous notebook.
ratio1 = 0.0161864681127  # See the output of `train_test_split` in the previous notebook.
ratio0 = 1 - ratio1
index1 = test.target == 1
index0 = ~index1


def _subset_accuracy(mask):
    """Accuracy of `model` on the test rows selected by the boolean `mask`."""
    return accuracy_score(test.target[mask], model.predict(test.data[mask]))


accuracy1 = _subset_accuracy(index1)
accuracy0 = _subset_accuracy(index0)
balanced_accuracy = accuracy1 * 0.5 + accuracy0 * 0.5
stratified_accuracy = accuracy1 * ratio1 + accuracy0 * ratio0
print('balanced test accuracy: {:.3f}'.format(balanced_accuracy))
print('stratified test accuracy: {:.3f}'.format(stratified_accuracy))