In [214]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
import xgboost as xgb
%matplotlib inline

In [169]:
%%time
test = pd.read_csv('test.csv', index_col = 0)
train = pd.read_csv('train.csv', index_col = 0)
# train.head()
# train.info()
# test.info()
### encode values: Sex
train.Sex = train.Sex.apply(lambda x: 1 if x=='male' else 0)
test.Sex = test.Sex.apply(lambda x: 1 if x=='male' else 0)

### impute missing or zero Fare values with median Fares for each Pclass
train.loc[(train.Fare==0)&(train.Pclass==1),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][1]
train.loc[(train.Fare==0)&(train.Pclass==2),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][2]

train.loc[(train.Fare==0)&(train.Pclass==3),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][3]

test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==1),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][1]
test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==2),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][2]

test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==3),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][3]

# extract titles from names
def title_extractor(row):
    return row.split(',')[1].strip().split('.')[0]
titles_train = train.Name.apply(title_extractor)
titles_test = test.Name.apply(title_extractor)
titles_train = titles_train.map({'Mr':'Mr','Mrs':'Mrs','Miss':'Miss','Master':'Master','Dr':'Dr','Rev':'Rev',
                                   'Don':'Mr', 'Mlle':'Miss', 'Lady':'Mrs', 'Ms':'Mrs', 
                                   'Mme':'Mrs', 'the Countess': 'Mrs', 'Col':'Mr','Major':'Mr',
                                  'Sir':'Mr','Jonkheer':'Mr','Capt':'Mr'})
titles_test = titles_test.map({'Mr':'Mr','Mrs':'Mrs','Miss':'Miss','Master':'Master','Dr':'Dr','Rev':'Rev',
                                   'Don':'Mr', 'Mlle':'Miss', 'Lady':'Mrs', 'Ms':'Mrs', 
                                   'Mme':'Mrs', 'the Countess': 'Mrs', 'Col':'Mr','Major':'Mr',
                                  'Sir':'Mr','Jonkheer':'Mr','Capt':'Mr',
                              'Dona':'Mrs'})

if 'Title' not in train.columns:
    train['Title'] = titles_train
    title_encoded_train = pd.get_dummies(train.Title, prefix_sep = '_', drop_first = True)
    train = pd.concat([train, title_encoded_train], axis = 1)
if 'Title' not in test.columns:
    test['Title'] = titles_test
    title_encoded_test = pd.get_dummies(test.Title, prefix_sep = '_', drop_first = True)
    test = pd.concat([test, title_encoded_test], axis = 1)

#encode port of embarkation
train = pd.concat([train, pd.get_dummies(train.Embarked, prefix_sep = '_', drop_first = True)], axis = 1)
test = pd.concat([test, pd.get_dummies(test.Embarked, prefix_sep = '_', drop_first = True)], axis = 1)

# add Family size, IsAlone and Has_Cabin

train['FamilySize'] = train['Parch'] + train['SibSp'] + 1
test['FamilySize'] = test['Parch'] + test['SibSp'] + 1
train['IsAlone'] = (train['FamilySize'] == 1).astype(np.int8)
test['IsAlone'] = (test['FamilySize'] == 1).astype(np.int8)
train['HasCabin'] = train['Cabin'].apply(lambda x: 0 if type(x) == float else 1)
test['HasCabin'] = test['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

Wall time: 117 ms


In [170]:
    #categorical columns
col_types = train.dtypes.to_frame().reset_index()
cat_cols = col_types.loc[col_types[0]=='object']['index']
cat_cols

2         Name
7       Ticket
9        Cabin
10    Embarked
11       Title
Name: index, dtype: object

In [171]:
### ### impute Age by prediction with random forest regressor
#y and X sets for Age
y_age = train[pd.isnull(train['Age'])==False]['Age']
X_age = train[pd.isnull(train['Age'])==False].drop(cat_cols,axis=1).drop(['Age','Survived'],axis=1)

#train and val split for Age 
X_age_train, X_age_val, y_age_train, y_age_val = train_test_split(X_age, y_age, test_size = 0.25, random_state = 21)

# select entries with missing Age
X_nullage = train[pd.isnull(train['Age'])==True].drop(cat_cols,axis=1).drop(['Age','Survived'],axis=1)

# instantiate random forest regressor
age_reg = RandomForestRegressor(max_depth=4, random_state=21)
age_reg.fit(X_age_train, y_age_train)

# predict age for train and val sets
y_age_train_pred = age_reg.predict(X_age_train)
y_age_val_pred = age_reg.predict(X_age_val)

# check mean square erorrs and compare to simple median age imputer
print(f'train age MSE: {mean_squared_error(y_age_train, y_age_train_pred)}')
print(f'valid age MSE: {mean_squared_error(y_age_val, y_age_val_pred)}')
print(f'median age MSE: {mean_squared_error(y_age_val, np.asarray([y_age.median() for i in range(len(y_age_val))]))}')

# predict misisng Age values
age_imputed = age_reg.predict(X_nullage)
# replace null Age with predictions
X_nullage['Age'] = age_imputed
train.Age.loc[X_nullage['Age'].index] = X_nullage['Age'].copy()

### test set age imputation
test_nullage = test[pd.isnull(test['Age'])==True].drop(cat_cols,axis=1).drop('Age',axis=1)
age_test_imputed = age_reg.predict(test[pd.isnull(test['Age'])==True].drop(cat_cols,axis=1).drop('Age',axis=1))
test_nullage['Age'] = age_test_imputed
test.Age.loc[test_nullage['Age'].index] = test_nullage['Age'].copy()

train age MSE: 108.90607454508152
valid age MSE: 116.78580854916186
median age MSE: 236.75963240223462


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [195]:
drop_cols.extend(['SibSp', 'Parch'])
train = train.drop(cat_cols, axis = 1)
test = test.drop(cat_cols, axis = 1)

In [184]:
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 21 # for reproducibility
n_splits = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits = n_splits, shuffle = True, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=21, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X):
        return self.clf.predict(X)
    
    def predict_proba(self, X):
        return self.clf.predict_proba(X)
    
    def fit(self,X,y):
        return self.clf.fit(X,y)
    
    def feature_importances(self,X,y):
        print(self.clf.fit(X,y).feature_importances_)
    
# Class to extend XGboost classifer

In [185]:
def get_meta_features(clf, X_train, y_train, X_test, kf):
    '''
    stacking using fits on folds to create meta features for test set
    '''
    
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(test.index, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(kf.split(X_train, y_train)):
        
        clf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind]) #fit on train
        meta_train[test_ind] = clf.predict_proba(X_train.iloc[test_ind])[:, 1] #create meta features on train
        meta_test += clf.predict_proba(X_test)[:, 1] #create meta features on test
    
    return meta_train, meta_test / kf.n_splits

In [199]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025,
    'probability': True,
    }

In [200]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [201]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['Survived']
X_train = train.drop(['Survived'], axis=1)
X_test = test.copy() # Creats an array of the test data

In [202]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_meta_features(et, X_train, y_train, X_test, kf = kf) # Extra Trees
rf_oof_train, rf_oof_test = get_meta_features(rf, X_train, y_train, X_test, kf = kf) # Random Forest
ada_oof_train, ada_oof_test = get_meta_features(ada, X_train, y_train, X_test, kf = kf) # AdaBoost 
gb_oof_train, gb_oof_test = get_meta_features(gb, X_train, y_train, X_test, kf = kf) # Gradient Boost
svc_oof_train, svc_oof_test = get_meta_features(svc, X_train, y_train, X_test, kf = kf) # Support Vector Classifier

print("Training is complete")

  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [231]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train,
     'ExtraTrees': et_oof_train,
     'AdaBoost': ada_oof_train,
      'GradientBoost': gb_oof_train,
        'SVC': svc_oof_train
    })

base_predictions_test = pd.DataFrame( {'RandomForest': rf_oof_test,
     'ExtraTrees': et_oof_test,
     'AdaBoost': ada_oof_test,
      'GradientBoost': gb_oof_test,
        'SVC': svc_oof_test
    })

base_predictions_train.head()

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost,SVC
0,0.120591,0.077373,0.497292,0.000618,0.124649
1,0.9857,0.990899,0.503169,0.999962,0.884937
2,0.44515,0.544296,0.50115,0.621773,0.761258
3,0.976006,0.985545,0.503348,0.999989,0.832512
4,0.102511,0.112781,0.498439,0.000443,0.138896


In [278]:
gbm = xgb.XGBClassifier()

params_xgb = {
    #learning_rate = 0.02,
 'n_estimators': [10,100,1000],
 'max_depth': [5],
 'min_child_weight': np.arange(1,10,2),
 'gamma': [0.8],                        
 'subsample': [0.8],
 'colsample_bytree': [0.8],
 'objective': ['binary:logistic'],
 'nthread': [-1],
 'scale_pos_weight': [1],
 'random_state': [21],
}

grid = GridSearchCV(gbm, param_grid=params_xgb, scoring='accuracy', n_jobs = -1, )
grid.fit(base_predictions_train, y_train)
# predictions = gbm. predict(base_predictions_test)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.8], 'gamma': [0.8],
                         'max_depth': [5],
                         'min_child_weight': array([1, 3, 5, 7, 9]),
                         'n_estimators': [10, 100, 1000], 'nt

In [279]:
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True)[:3].T

Unnamed: 0,13,14,4
mean_fit_time,0.153343,1.17176,0.204428
std_fit_time,0.00650259,0.249367,0.0260644
mean_score_time,0.00238814,0.0060039,0.00398583
std_score_time,0.00221696,0.0037472,0.00199293
param_colsample_bytree,0.8,0.8,0.8
param_gamma,0.8,0.8,0.8
param_max_depth,5,5,5
param_min_child_weight,9,9,3
param_n_estimators,100,1000,100
param_nthread,-1,-1,-1


In [281]:
predictions_train = grid.best_estimator_.predict(base_predictions_train)
predictions_test = grid.best_estimator_.predict(base_predictions_test)
np.mean(y_train == predictions_train)

0.9461279461279462

In [283]:
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'PassengerId': test.index,
                            'Survived': predictions_test })
StackingSubmission.to_csv("StackingSubmission.csv", index=False)