In [84]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from IPython.core.debugger import Tracer

In [85]:
df = pd.read_csv('train.csv', encoding='iso-8859-1')
df = pd.get_dummies(data=df, columns=['InterviewerId', 'Interviewee'])

In [86]:
# Dropping the text columns
df = df.drop(['Desc_A','Desc_B'], axis=1)

In [87]:
# Some useful parameters which will come in handy later on
ntrain = df.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 6 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)


In [92]:
def get_oof(clf, x_train, y_train):
    oof_train = np.zeros((ntrain,))
    meanAccuracyScore = 0.0
    
    for i, (train_index, valid_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_va = x_train[valid_index]
        y_va = y_train[valid_index]
        
        clf.train(x_tr, y_tr)

        oof_train[valid_index] = clf.predict(x_va)
        accuracy = accuracy_score(y_va, oof_train[valid_index])
        print(accuracy)
        meanAccuracyScore = meanAccuracyScore + accuracy

    print("Accuracy Score", meanAccuracyScore / 6)
    #oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1)

In [93]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [94]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [95]:
y_train = df['Hire_call_interviewer_1'].ravel()
train = df.drop(['Hire_call_interviewer_1'], axis=1)
x_train = df.values # Creates an array of the train data

# Create our OOF train and test predictions. These base results will be used as new features
print("Extra Trees")
et_oof_train = get_oof(et, x_train, y_train) # Extra Trees
print("Random Forest")
rf_oof_train = get_oof(rf,x_train, y_train) # Random Forest
print("Ada boost")
ada_oof_train = get_oof(ada, x_train, y_train) # AdaBoost 
print("Gradient Boost")
gb_oof_train = get_oof(gb,x_train, y_train) # Gradient Boost
print("SVC")
svc_oof_train = get_oof(svc,x_train, y_train) # Support Vector Classifier

print("Training is complete")

Extra Trees
1.0
1.0
1.0
1.0
1.0
1.0
Accuracy Score 1.0
Random Forest
0.0
1.0


  warn("Warm-start fitting without increasing n_estimators does not "


1.0
1.0
0.0
0.0
Accuracy Score 0.5
Ada boost
0.0
1.0
1.0
1.0
1.0
1.0
Accuracy Score 0.8333333333333334
Gradient Boost
0.0
1.0
1.0
1.0
1.0
1.0
Accuracy Score 0.8333333333333334
SVC
0.0
0.0
0.0
0.0
0.0
0.0
Accuracy Score 0.0
Training is complete


In [96]:
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(x_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [97]:
import pickle
pickle_out = open("extra_trees.pickle", "wb")
pickle.dump(forest, pickle_out)
pickle_out.close()

In [102]:
pickle_in = open("extra_trees.pickle", "rb")
classifier = pickle.load(pickle_in)
classifier.predict(x_train.iloc[0].head(1))

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'