In [251]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from IPython.core.debugger import Tracer

In [252]:
df = pd.read_csv('sample_data.csv')

In [253]:
# Drop final call since we need it later
df = df.drop(["Final Call"], axis=1)

In [254]:
# df = pd.get_dummies(data=df, columns=['intervieweeId', 'interviewerId'])

In [255]:
# Some useful parameters which will come in handy later on
ntrain = df.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [256]:
def get_oof(clf, x_train, y_train):
    oof_train = np.zeros((ntrain,))
    meanAccuracyScore = 0.0
    
    for i, (train_index, valid_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_va = x_train[valid_index]
        y_va = y_train[valid_index]
        
        clf.train(x_tr, y_tr)
        oof_train[valid_index] = clf.predict(x_va)
        #print(clf.predict_proba(x_va))
        accuracy = accuracy_score(y_va, oof_train[valid_index])
        print(accuracy)
        meanAccuracyScore = meanAccuracyScore + accuracy

    print("Accuracy Score", meanAccuracyScore / NFOLDS)
    #oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1)

In [257]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

In [258]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)

In [259]:
y_train = df['Interviewer Call'].ravel()
train = df.drop(['Interviewer Call','Score'], axis=1)
print(df.info())
print(train.info())
x_train = train.values # Creates an array of the train data
print(x_train.shape)
print(y_train.shape)

# Create our OOF train and test predictions. These base results will be used as new features
print("Extra Trees")
et_oof_train = get_oof(et, x_train, y_train) # Extra Trees
#print("Random Forest")
#rf_oof_train = get_oof(rf,x_train, y_train) # Random Forest
print("Ada boost")
ada_oof_train = get_oof(ada, x_train, y_train) # AdaBoost 
print("Gradient Boost")
gb_oof_train = get_oof(gb,x_train, y_train) # Gradient Boost

print("Training is complete")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Problem Solving     1000 non-null int64
Design              1000 non-null int64
CS Skills           1000 non-null int64
Test Enumeration    1000 non-null int64
Communication       1000 non-null int64
intervieweeId       1000 non-null int64
interviewerId       1000 non-null int64
Score               1000 non-null float64
Interviewer Call    1000 non-null int64
dtypes: float64(1), int64(8)
memory usage: 70.4 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Problem Solving     1000 non-null int64
Design              1000 non-null int64
CS Skills           1000 non-null int64
Test Enumeration    1000 non-null int64
Communication       1000 non-null int64
intervieweeId       1000 non-null int64
interviewerId       1000 non-null int64
dtypes: int64(7)
memory usage: 54.8 KB
None
(1000, 7)
(1000,)
Extra Trees
0.81
0.83
0.87
0.8

In [260]:
import pickle
pickle_out = open("extra_trees.pickle", "wb")
pickle.dump(forest, pickle_out)
pickle_out.close()

In [261]:
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(x_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [262]:
import pickle
pickle_out = open("extra_trees.pickle", "wb")
pickle.dump(forest, pickle_out)
pickle_out.close()

In [263]:
pickle_in = open("extra_trees.pickle", "rb")
classifier = pickle.load(pickle_in)
classifier.predict_proba(x_train[1].reshape(1, -1))[0][1]

0.0

In [264]:
y_train[1]

0

In [265]:
x_train[1]

array([ 4,  7, -1,  6,  4,  1, 83], dtype=int64)

In [266]:
from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier()
ada_classifier.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [267]:
import pickle
import pandas as pd
import numpy as np
pickle_in = open("extra_trees.pickle", "rb")
classifier = pickle.load(pickle_in)
# df = pd.DataFrame(np.random.randint(low=0, high=12, size=(1, 5)), columns=['Problem Solving', 'Design', 'CS Skills' , 'Test Enumeration' , 'Communication'])
# df = pd.DataFrame([9,9,9,7,8])
data = {'Problem Solving' : [3] , 'Design' : [3], 'CS Skills' : [4], 'Test Enumeration' : [9] , 'Communication' : [3] }
df = pd.DataFrame(data=data)
print(df.head())
df = df.applymap(lambda x : (-1 if x in (0, 1, 10, 11) else x))
intervieweeId = []
for x in range (0, 1):
    intervieweeId.append(np.ceil(x/4))
print(intervieweeId)
df['intervieweeId'] = [5]
#Populate interviewerId
df['interviewerId'] = np.random.randint(1, 101, df.shape[0])
print(df.values)
dummY_row = np.array([9, 9, 9, 9, 9, 7, 9])
print(classifier.predict_proba(dummY_row.reshape(1, -1))[0][1])

   CS Skills  Communication  Design  Problem Solving  Test Enumeration
0          4              3       3                3                 9
[0.0]
[[ 4  3  3  3  9  5 15]]
0.948
