In [30]:
import numpy as np
import pandas as pd
import os
import cPickle as pickle
from pprint import pprint
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import interp
from itertools import cycle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize

from mlxtend.classifier import StackingClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import seaborn as sns

### Define the cross validation scorer

In [17]:
def cross_validated_scorer(X_train_list, y_train_list, X_val_list, y_val_list, model_class):
    """Fit and score one estimator on every pre-computed fold.

    Parameters
    ----------
    X_train_list, y_train_list : sequences
        Per-fold training features and labels; element ``i`` of each
        belongs to fold ``i``.
    X_val_list, y_val_list : sequences
        Per-fold validation features and labels, aligned the same way.
    model_class : estimator
        Despite the name this is an estimator *instance* exposing
        ``fit(X, y)`` and ``score(X, y)``. The same object is re-fitted on
        each fold, so after the call it holds the fit from the last fold.

    Returns
    -------
    numpy.ndarray
        One validation score per fold, in fold order (empty when no folds
        are supplied).

    Raises
    ------
    ValueError
        If the four fold lists do not all have the same length.
    """
    n_folds = len(X_train_list)
    if not (len(y_train_list) == len(X_val_list) == len(y_val_list) == n_folds):
        # Fail loudly instead of raising a bare IndexError mid-run or
        # silently ignoring surplus folds.
        raise ValueError(
            "Fold lists must have equal length, got %d, %d, %d and %d"
            % (n_folds, len(y_train_list), len(X_val_list), len(y_val_list)))

    mod = model_class
    scores = []
    t0 = time()
    for train_text, train_label, val_text, val_label in zip(
            X_train_list, y_train_list, X_val_list, y_val_list):
        mod.fit(train_text, train_label)
        scores.append(mod.score(val_text, val_label))

    # Call form of print works on Python 2 and 3; the double space keeps the
    # text identical to what ``print "Duration ", x`` used to emit.
    print("Duration  %s" % (time() - t0))
    return np.array(scores)

### Import data 

In [8]:
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# this path must only ever point at a trusted, locally produced artifact.
# The file contains two consecutive pickles: the corpus first, then the labels.
with open("../../data/train_corpus_labels_full_reports.pkl", "rb") as filein:
    train_corpus = pickle.load(filein)
    labels = pickle.load(filein)

# Every document needs exactly one label; catch a corrupt or mismatched dump
# here rather than deep inside the fold-splitting code.
assert train_corpus.shape[0] == labels.shape[0], "corpus/label length mismatch"

# Call form of print emits the same text on Python 2 and also runs on Python 3.
print(train_corpus.shape)
print(labels.shape)

(1208L,)
(1208L,)


### Preprocess and cache the folds 

In [37]:
# Stand-alone transformers. Only `pca` is wired into the pipeline below;
# `countVect` and `sfmlr` are defined here but not used in this cell.
countVect = CountVectorizer(ngram_range=(1,3), min_df=0.05, max_df=0.9, analyzer='word', stop_words="english")
pca = PCA(n_components=150)
# The solver is named explicitly: liblinear supports the L1 penalty and was the
# historical default, whereas the newer default (lbfgs) rejects penalty='l1'.
sfmlr = SelectFromModel(LogisticRegression(penalty='l1', C=0.01, solver='liblinear'))
combined_features = FeatureUnion([("pca", pca)])

# Text -> word 1-3-gram counts -> dense array -> 150 principal components.
preprocess2 = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1,3), analyzer='word', stop_words="english", min_df=0.05, max_df=0.9)),
        # PCA needs dense input. toarray() yields a plain ndarray; todense()
        # would yield np.matrix, which recent scikit-learn versions refuse.
        ('ToDense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
        ('features', combined_features)
    ])

In [None]:
# Per-fold caches: element i of each list belongs to fold i. The features are
# already preprocessed so that every classifier is scored on identical inputs.
train_text_list = []
train_label_list = []
val_text_list = []
val_label_list = []

k_fold = StratifiedKFold(5)

# split() yields *positional* indices. Converting to ndarrays once guarantees
# they are applied positionally even if the loaded objects are pandas Series
# with a non-default index (where obj[indices] would be label-based).
corpus_array = np.asarray(train_corpus)
label_array = np.asarray(labels)

for train_indices, val_indices in k_fold.split(corpus_array, label_array):
    train_text = corpus_array[train_indices]
    train_y    = label_array[train_indices]

    val_text  = corpus_array[val_indices]
    val_y     = label_array[val_indices]

    start = time()
    # Fit the preprocessing on the training part only, so nothing about the
    # validation documents leaks into the vocabulary or the PCA basis.
    preprocessor = preprocess2.fit(train_text, train_y)

    train_text_processed = preprocessor.transform(train_text)
    print("Train shape  %s" % (train_text_processed.shape,))
    val_text_processed = preprocessor.transform(val_text)
    print("Val shape  %s" % (val_text_processed.shape,))

    train_text_list.append(train_text_processed)
    train_label_list.append(train_y)
    val_text_list.append(val_text_processed)
    val_label_list.append(val_y)

    print("Duration  %s" % (time() - start))

### Base classifiers 

In [None]:
# --- Level-0 learners -------------------------------------------------------
# Instance-based and margin-based models.
knnClassifier = KNeighborsClassifier()
svmClassifier = SVC(
    C=7.0,
    kernel='linear',
    probability=True,  # ask SVC for probability estimates, not just labels
)

# Tree ensembles from scikit-learn.
rfClassifier = RandomForestClassifier(n_estimators=100)
etClassifier = ExtraTreesClassifier(n_estimators=100)
gbmClassifier = GradientBoostingClassifier()

# Gradient-boosted trees from xgboost, configured for multi-class
# probability output.
xgbClassifier = XGBClassifier(
    objective='multi:softprob',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=12,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=0.7,
    nthread=-1,
)

# Plain logistic regression with library defaults.
lr = LogisticRegression()

### Meta classifier

In [None]:
# Both stacked ensembles share the same six level-0 learners and differ only
# in the meta-classifier that combines their predicted probabilities.
_stack_members = [
    knnClassifier,
    svmClassifier,
    gbmClassifier,
    etClassifier,
    rfClassifier,
    xgbClassifier,
]

# Logistic regression as the combiner. Each ensemble receives its own copy
# of the member list.
sclfLr = StackingClassifier(
    classifiers=list(_stack_members),
    meta_classifier=lr,
    use_probas=True,
    average_probas=False,
)

# XGBoost as the combiner (the same estimator also appears among the members).
sclfXGB = StackingClassifier(
    classifiers=list(_stack_members),
    meta_classifier=xgbClassifier,
    use_probas=True,
    average_probas=False,
)

### Run each classifier

In [None]:
# Score every base learner, then the LR-stacked ensemble, on the cached folds.
lr_stack_lineup = [
    (knnClassifier, 'KNN Classifier'),
    (svmClassifier, 'SVM'),
    (rfClassifier, 'Random forest'),
    (gbmClassifier, "Gradient Boosted Trees"),
    (etClassifier, "Extra trees classifier"),
    (xgbClassifier, 'XGB Classifier'),
    (sclfLr, 'StackingClassifier LR'),
]

print('5-fold cross validation:\n')

for clf, label in lr_stack_lineup:
    scores = cross_validated_scorer(
        train_text_list, train_label_list, val_text_list, val_label_list, clf)
    # Mean and standard deviation of the per-fold scores.
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)

In [None]:
# Score every base learner, then the XGB-stacked ensemble, on the cached folds.
xgb_stack_lineup = [
    (knnClassifier, "KNN"),
    (svmClassifier, 'SVM'),
    (rfClassifier, 'Random forest'),
    (gbmClassifier, 'GBM Classifier'),
    (etClassifier, 'ET Classifier'),
    (xgbClassifier, 'XGB Classifier'),
    (sclfXGB, 'StackingClassifier XGB'),
]

print('5-fold cross validation:\n')

for clf, label in xgb_stack_lineup:
    scores = cross_validated_scorer(
        train_text_list, train_label_list, val_text_list, val_label_list, clf)
    # Mean and standard deviation of the per-fold scores.
    summary = (scores.mean(), scores.std(), label)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % summary)