In [147]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_curve
from sklearn.metrics import make_scorer

In [148]:
#Load the three encoded versions of the dataset:
# 1) label encoding for every variable
# 2) label encoding for ordinal variables, one-hot encoding for categorical variables
# 3) one-hot encoding for all categorical variables

tot_data1,tot_data2,tot_data3=[
    pd.read_csv(csv_name)
    for csv_name in ("final_label_encoding.csv","one_hot+label.csv","final_one_hot.csv")
]

In [149]:
#Preprocessing tot_data1
#Rows before position 18359 become the training part, the remaining rows the test part.
#'target' and 'enrollee_id' are kept aside and removed from both feature frames.

ids=tot_data1.iloc[18359:]['enrollee_id']
target1=tot_data1.iloc[0:18359]['target']
train_data1=tot_data1.iloc[0:18359].drop(['target','enrollee_id'],axis=1)
test_data1=tot_data1.iloc[18359:].drop(['target','enrollee_id'],axis=1)

In [150]:
#Preprocessing tot_data2
#Rows before position 18359 become the training part, the remaining rows the test part.
#'target' and 'enrollee_id' are kept aside and removed from both feature frames.

ids=tot_data2.iloc[18359:]['enrollee_id']
target2=tot_data2.iloc[0:18359]['target']
train_data2=tot_data2.iloc[0:18359].drop(['target','enrollee_id'],axis=1)
test_data2=tot_data2.iloc[18359:].drop(['target','enrollee_id'],axis=1)

In [151]:
#Preprocessing tot_data3
#Rows before position 18359 become the training part, the remaining rows the test part.
#'target' and 'enrollee_id' are kept aside and removed from both feature frames.

ids=tot_data3.iloc[18359:]['enrollee_id']
target3=tot_data3.iloc[0:18359]['target']
train_data3=tot_data3.iloc[0:18359].drop(['target','enrollee_id'],axis=1)
test_data3=tot_data3.iloc[18359:].drop(['target','enrollee_id'],axis=1)

In [None]:
#Grid Search

run_gs = True
#Set run_gs to True to run the grid search again; False fits a forest with fixed parameters

#Built-in ROC AUC scorer. The previous make_scorer(score_func=roc_auc_score) referenced
#roc_auc_score, which this notebook never imports, and a plain make_scorer scores the
#hard class labels from predict() instead of ranking scores.
my_scorer='roc_auc'

#The visible cells define no notebook-level train_data/target (only the numbered
#variants), so the search is bound explicitly to the label-encoded dataset.
#NOTE(review): confirm dataset 1 is the one intended for tuning.
gs_train_data,gs_target=train_data1,target1

if run_gs:
    parameter_grid = {
                 'criterion':['gini','entropy'],
                 'bootstrap':[True,False],
                 'max_depth':[7,8],
                 'min_samples_split':[2,3,5],
                 'min_samples_leaf':[1,3],
                 'max_leaf_nodes':[50,100]
        }
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(n_splits=5)

    grid_search = GridSearchCV(forest,
                               scoring=my_scorer,
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               verbose=1
                              )

    grid_search.fit(gs_train_data,gs_target)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
    
else: 
    #Fixed parameter set used when the search is skipped
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 100, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    
    model = RandomForestClassifier(**parameters)
    model.fit(gs_train_data, gs_target)

In [152]:
#Random Forest Classifier model
#max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was deprecated and
#later removed from scikit-learn, so the explicit spelling keeps the same forest on every version.

model1=RandomForestClassifier(bootstrap=False,max_depth=8,criterion='entropy',max_features='sqrt',min_samples_leaf=2,
                             min_samples_split=5,n_estimators=500,max_leaf_nodes=50)

In [153]:
#Random Forest Regressor model
#500 trees grown without bootstrap sampling, depth capped at 8, half of the features per split.

model2=RandomForestRegressor(
    n_estimators=500,
    bootstrap=False,
    max_depth=8,
    max_features=0.5,
    min_samples_split=5,
    min_samples_leaf=5,
)

In [154]:
#KFold split
#Folds are built without shuffling. random_state only has an effect together with
#shuffle=True, and recent scikit-learn raises ValueError when it is passed without it,
#so it is left out here. Pass shuffle=True and a random_state to get seeded shuffled folds.

skf=StratifiedKFold(n_splits=5)

In [155]:
#Empty (all-NaN) frame that will hold one column of test predictions per model run;
#it has one row per row of test_data1 and is filled later for stacking.

stacking_model_preds=pd.DataFrame(
    data=np.nan,
    columns=['c1','c2','c3','c4','c5','c6'],
    index=range(len(test_data1)),
    dtype='float',
)

In [156]:
#Function for running KFold on all datasets

def kfold_predictions(train_data,target,test_data,model,model_no):
    """Refit a forest on every CV training fold and average its test-set predictions.

    For each split produced by the notebook-level ``skf``, the chosen
    notebook-level estimator is refit on that split's training rows and used
    to predict ``test_data``. The per-fold predictions are averaged row-wise
    and written into column ``model_no`` of the notebook-level
    ``stacking_model_preds`` frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training features.
    target : pandas.Series or array-like
        Training labels, aligned with ``train_data`` by row position.
    test_data : pandas.DataFrame
        Rows to predict; expected to have as many rows as
        ``stacking_model_preds``.
    model : str
        ``'classifier'`` fits ``model1`` and keeps ``predict_proba(...)[:,1]``;
        any other value fits ``model2`` and keeps ``predict(...)``.
    model_no : int
        Positional column of ``stacking_model_preds`` that receives the result.

    Returns
    -------
    None
        The result is stored in ``stacking_model_preds`` as a side effect.
    """
    #Labels as a plain array so they are selected by position, exactly like the
    #train_data.iloc[...] lookup below (a label-based lookup breaks on a non-default index)
    target_values=np.asarray(target)

    fold_preds=[]
    for train_index, test_index in skf.split(train_data,target):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train=train_data.iloc[train_index]
        y_train=target_values[train_index]

        if model=='classifier':
            model1.fit(X=X_train,y=y_train)
            pred=model1.predict_proba(test_data)[:,1]
        else:
            model2.fit(X=X_train,y=y_train)
            pred=model2.predict(test_data)

        #Keep each fold's predictions 1-D; the list grows with however many folds skf yields
        fold_preds.append(np.asarray(pred,dtype='float').ravel())

    #One column per fold, averaged across folds for every test row
    test_kfolds=np.mean(np.column_stack(fold_preds),axis=1)

    stacking_model_preds.iloc[:,model_no]=test_kfolds

In [157]:
#Run both model kinds on each of the three encoded datasets.
#Classifier runs fill columns 0-2 of stacking_model_preds, regressor runs fill columns 3-5.

encoded_datasets=[
    (train_data1,target1,test_data1),
    (train_data2,target2,test_data2),
    (train_data3,target3,test_data3),
]

for kind_pos,model_kind in enumerate(('classifier','regressor')):
    for data_pos,(fold_train,fold_target,fold_test) in enumerate(encoded_datasets):
        kfold_predictions(fold_train,fold_target,fold_test,model_kind,
                          kind_pos*len(encoded_datasets)+data_pos)


TRAIN: [ 3477  3488  3504 ..., 18356 18357 18358] TEST: [   0    1    2 ..., 3695 3696 3697]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [3477 3488 3504 ..., 7350 7351 7352]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [ 7286  7297  7315 ..., 11021 11022 11024]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [10957 10970 10979 ..., 14691 14692 14693]
TRAIN: [    0     1     2 ..., 14691 14692 14693] TEST: [14647 14648 14674 ..., 18356 18357 18358]
TRAIN: [ 3477  3488  3504 ..., 18356 18357 18358] TEST: [   0    1    2 ..., 3695 3696 3697]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [3477 3488 3504 ..., 7350 7351 7352]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [ 7286  7297  7315 ..., 11021 11022 11024]
TRAIN: [    0     1     2 ..., 18356 18357 18358] TEST: [10957 10970 10979 ..., 14691 14692 14693]
TRAIN: [    0     1     2 ..., 14691 14692 14693] TEST: [14647 14648 14674 ..., 18356 18357 18358]
TRAIN: [ 3477  3488  3504 ..., 183

In [158]:
#Preview the first rows of the per-model test predictions collected for stacking
stacking_model_preds.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6
0,0.583913,0.570295,0.595526,0.618597,0.658254,0.643062
1,0.072815,0.07403,0.074693,0.064489,0.067017,0.070965
2,0.353955,0.355345,0.355125,0.348376,0.348719,0.350436
3,0.068545,0.067629,0.065346,0.062225,0.062659,0.063898
4,0.058761,0.059198,0.063405,0.062875,0.067552,0.069399


In [159]:
#Stacking - final prediction per test row is the row-wise mean over all model columns

predictions=stacking_model_preds.mean(axis=1).tolist()

In [160]:
#Submission frame: the kept-aside enrollee ids next to their averaged predictions

sub=pd.DataFrame(
    data={'enrollee_id':ids,'target':predictions},
    columns=['enrollee_id','target'],
)

In [161]:
#Write the submission file without the frame's index column
sub.to_csv(path_or_buf="predictions_rf.csv",index=False)