In [19]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.ensemble import (AdaBoostClassifier,RandomForestClassifier,ExtraTreesClassifier,
                              GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
full_data = [train_df, test_df]
Passenger_Id_train = train_df['PassengerId']
Passenger_Id_test = test_df['PassengerId']
train_df.drop(['PassengerId'], axis=1, inplace=True)
test_df.drop(['PassengerId'], axis=1, inplace=True)

In [4]:
for dataset in full_data:
    dataset['Embarked'].fillna('S', inplace= True)
    dataset['Has_Cabin'] = dataset['Cabin'].apply(lambda x : 0 if type(x) == float else 1)
    dataset['Family_Size'] = dataset['Parch'] + dataset['SibSp'] + 1
    dataset['Is_Alone'] = dataset['Family_Size'].apply(lambda x: 1 if x == 1 else 0)    
    dataset.drop(['Cabin', 'Ticket'], inplace=True, axis=1)
    
    #Dividing the Age and the Fare column into Numerical Categories
    dataset['Age'].fillna(dataset['Age'].mean(),inplace = True)
    dataset['Categorical_Age'] = pd.cut(dataset['Age'],5)
    
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    dataset['Categorical_Fare'] = pd.cut(dataset['Fare'], 4)
    
    dataset.loc[(dataset['Fare']< 128.082), 'Fare'] = 0
    dataset.loc[(dataset['Fare'] >=128.082) & (dataset['Fare']< 256.165), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] >=256.165) & (dataset['Fare']< 384.247), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] >=384.247) & (dataset['Fare']< 513), 'Fare'] = 3   
    dataset.loc[(dataset['Age']< 16.336), 'Age'] = 0
    dataset.loc[(dataset['Age'] >=16.336) & (dataset['Age']< 32.252), 'Age'] = 1
    dataset.loc[(dataset['Age'] >=32.252) & (dataset['Age']< 48.168), 'Age'] = 2
    dataset.loc[(dataset['Age'] >=48.168) & (dataset['Age']< 64.084), 'Age'] = 3
    dataset.loc[(dataset['Age'] >=64.084) & (dataset['Age']< 81), 'Age'] = 4
    dataset.drop(['Categorical_Age', "Categorical_Fare"], axis= 1, inplace= True)
    
    #Extracting the different titles from Name column and assigning categories to them

    saldict2 = {1: [' Mr', ' Master', ' Don', ' Sir'], 2: [' Mrs', ' Miss', ' Ms', ' Lady', ' the Countess'], 
                3: [' Dr', ' Major', ' Col', ' Capt',]}

    dataset['Title'] = [i.split(',')[1] for i in [i[0] for i in dataset['Name'].apply(str.split, args=('.'))]]
    for title, index in zip(dataset['Title'].values, np.arange(len(dataset['Title']))):
        for (key, value) in saldict2.items():
            if title in value:
                dataset['Title'][index] = key
                break

    dataset['Title'] = dataset['Title'].apply(lambda x: 4 if type(x) == str else x)
    dataset.drop('Name', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
train_df = pd.get_dummies(train_df,columns=['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'Has_Cabin', 'Is_Alone'], drop_first=True)
test_df = pd.get_dummies(test_df,columns=['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'Has_Cabin', 'Is_Alone'], drop_first=True)

In [62]:
class SklearnHelper():
    
    def __init__(self, clf, x_train, y_train, param_grid= None):
        grid = GridSearchCV(clf, scoring='accuracy', cv=10, param_grid=param_grid, n_jobs=-1)
        grid.fit(x_train, y_train)
        print(grid.best_score_, grid.best_params_)
        self.clf = clf(**grid.best_params_)
        
    def fit(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [63]:
from sklearn.model_selection import KFold
n_split = 5
kfold = KFold(n_split)

In [64]:
def oof_Cal(clf, x_train, y_train, test_df):
    n_split = 5
    oof_train = np.zeros(x_train.shape[0],)
    oof_test = np.zeros(test_df.shape[0],)
    oof_test_svf = np.empty((n_split, test_df.shape[0]))
    
    for i, (train_index, test_index) in enumerate(kfold.split(x_train)):
        
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
    
        clf.fit(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_svf[i, :] =  clf.predict(test_df)
        oof_test = oof_test_svf.mean(axis=0)
        
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)

In [65]:
# Random Forest parameters
rf_params = {
    'n_jobs': [-1],
    'n_estimators': [500, 600, 700],
     'warm_start': [True],
    'max_depth': [6],
    'min_samples_leaf': [2],
    'max_features' : ['sqrt'],
    'verbose': [0]
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}


# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}



In [66]:
x_train = train_df.drop('Survived', axis= True).values
y_train = train_df['Survived'].values
x_test = test_df.values

In [67]:
#adb = SklearnHelper(AdaBoostClassifier, rf_params)
rf = SklearnHelper(RandomForestClassifier,x_train, y_train, rf_params)
#et = SklearnHelper(ExtraTreesClassifier)
#gb = SklearnHelper(GradientBoostingClassifier)
#sv = SklearnHelper(SVC)

TypeError: get_params() missing 1 required positional argument: 'self'

In [53]:
rf.get_hyperparameters(rf_params, x_train, y_train)

0.8002244668911336 {'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 700, 'n_jobs': -1, 'verbose': 0, 'warm_start': True}


In [None]:
def ini

In [14]:
adb_oof_train, adb_oof_test = oof_Cal(adb, x_train, y_train, x_test)
rf_oof_train, rf_oof_test = oof_Cal(rf, x_train, y_train, x_test)
et_oof_train, et_oof_test = oof_Cal(et, x_train, y_train, x_test)
gb_oof_train, gb_oof_test = oof_Cal(gb, x_train, y_train, x_test)
sv_oof_train, sv_oof_test = oof_Cal(sv, x_train, y_train, x_test)

  warn("Warm-start fitting without increasing n_estimators does not "


In [15]:
x_train = np.concatenate([adb_oof_train, rf_oof_train, et_oof_train, gb_oof_train, sv_oof_train], axis=1)
x_test = np.concatenate([adb_oof_test, rf_oof_test, et_oof_test, gb_oof_test, sv_oof_test], axis=1)

In [16]:
from xgboost import XGBClassifier

In [17]:
xg = XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1)

In [18]:
xg.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.9, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=2, missing=None,
       n_estimators=2000, n_jobs=1, nthread=-1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)