In [110]:
import pandas as pd
from pandas import Series,DataFrame

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [111]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

PassengerId = test_df['PassengerId']
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [112]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [113]:
train_df = train_df.drop(['PassengerId','Name','Ticket'],axis = 1)
test_df = test_df.drop(['Name','Ticket','PassengerId'],axis =1 )

In [114]:
 #------------- Feature Engineering for 'Embarked' --------------

train_df['Embarked'] = train_df['Embarked'].fillna("S")
train_df.drop(['Embarked'],axis=1 ,inplace=True)
test_df.drop(['Embarked'],axis =1 ,inplace=True)


# ------------- Feature Engineering for 'Fare' --------------

train_df['Fare'].fillna(train_df['Fare'].median(),inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(),inplace=True)

train_df['Fare'] =  train_df['Fare'].astype(int)
test_df['Fare'] = train_df['Fare'].astype(int)


# ------------- Feature Engineering for 'Age' --------------

train_df['Age'].fillna(train_df['Age'].median(),inplace=True)
test_df['Age'].fillna(test_df['Age'].median(),inplace=True)

train_df['Age'] = train_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)


# ------------- Feature Engineering for 'Cabin' --------------


train_df['Cabin'] = train_df.drop(['Cabin'],axis=1)
test_df['Cabin'] = test_df.drop(['Cabin'],axis = 1)


# ------------- Feature Engineering for 'Family' --------------

#check if the passenger had any family member
# if he had any family member , make it 1 else 0
train_df['Family'] = train_df['Parch'] + train_df['SibSp']
train_df['Family'].loc[train_df['Family']>0] = 1
train_df['Family'].loc[train_df['Family'] == 0 ] = 0

test_df['Family'] =test_df['Parch'] + test_df['SibSp']
test_df['Family'].loc[test_df['Family']>0] = 1
test_df['Family'].loc[test_df['Family'] == 0] = 0

train_df.drop(['SibSp','Parch'] , axis = 1 , inplace= True)
test_df.drop(['SibSp','Parch'] , axis = 1 , inplace= True)


# ------------- Feature Engineering for 'Sex' --------------

train_df['Sex'] = train_df['Sex'].map({'female' : 0 , 'male' : 1}).astype(int)
test_df['Sex'] = test_df['Sex'].map({'female' : 0 , 'male' : 1}).astype(int)

# ------------- Feature Engineering for 'Pclass' --------------

# Segment the three classes into three separate classes 
pclass_dummies_train = pd.get_dummies(train_df['Pclass'])
pclass_dummies_train.columns = ['Class_1', 'Class_2' , 'Class_3']
pclass_dummies_train.drop(['Class_3'],axis = 1 ,inplace = True)

pclass_dummies_test = pd.get_dummies(train_df['Pclass'])
pclass_dummies_test.columns = ['Class_1', 'Class_2' , 'Class_3']
pclass_dummies_test.drop(['Class_3'],axis = 1 ,inplace = True)

train_df =  train_df.join(pclass_dummies_train)
test_df  =  test_df.join(pclass_dummies_test)

train_df.drop(['Pclass'], axis = 1 ,inplace = True)
test_df.drop(['Pclass'] , axis = 1 , inplace = True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [115]:
train_df.head()

Unnamed: 0,Survived,Sex,Age,Fare,Cabin,Family,Class_1,Class_2
0,0,1,22,7,0,1,0,0
1,1,0,38,71,1,1,1,0
2,1,0,26,7,1,0,0,0
3,1,0,35,53,1,1,1,0
4,0,1,35,8,0,0,0,0


In [116]:
# Pearson Coefficient Matrix Heat Map

train_df.columns


Index(['Survived', 'Sex', 'Age', 'Fare', 'Cabin', 'Family', 'Class_1',
       'Class_2'],
      dtype='object')

In [117]:
train_df.corr()

Unnamed: 0,Survived,Sex,Age,Fare,Family,Class_1,Class_2
Survived,1.0,-0.543351,-0.064909,0.257482,0.203367,0.285904,0.093349
Sex,-0.543351,1.0,0.08075,-0.182331,-0.303646,-0.098013,-0.064746
Age,-0.064909,0.08075,1.0,0.097064,-0.171807,0.32409,0.015628
Fare,0.257482,-0.182331,0.097064,1.0,0.272355,0.591693,-0.116346
Family,0.203367,-0.303646,-0.171807,0.272355,1.0,0.113364,0.03907
Class_1,0.285904,-0.098013,0.32409,0.591693,0.113364,1.0,-0.288585
Class_2,0.093349,-0.064746,0.015628,-0.116346,0.03907,-0.288585,1.0


In [118]:
# Start with makin a skeleton code for training and predicting on sklearn models

from sklearn.cross_validation import KFold;

ntrain = train_df.shape[0]
ntest = test_df.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(ntrain, n_folds = NFOLDS , random_state = SEED )
# Provides train/test indices to split data in train test sets. Split dataset into k consecutive folds (without shuffling by default).
# Each fold is then used a validation set once while the k - 1 remaining fold form the training set.
# desing a class which helps in training adn crossvalidating baseline models

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        
len(kf)

5

In [119]:
# Stacking uses predictions of base classifies as input for training to a second-level model
# Here , we simply do not train the base models on the full training data, generate predictions on the full 
# test set a test set 

# For the k-folds developed out of the first level models , the get_off generate predictions

def get_oof(clf, x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test  = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i, (train_index,test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    

In [120]:
# ---- For the first level classification , we use the foloowing classifiers
#Note the following parameters have not been optimized . They have been arbitrarily chosen 

rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [121]:
# Creat 5 objects through above mentioned classes that representative of the above mentioned learning models

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [122]:
# Data prepration for first level classification 

y_train = train_df['Survived'].ravel()
train_df.drop(['Survived'],axis =1,inplace=True)
x_train = train_df.values
x_test = test_df.values 

In [123]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

  warn("Warm-start fitting without increasing n_estimators does not "


In [124]:
#--------------Second level learning model via XGBoost -----------------------------------#
# Comnbine the training and testing sets from the above model ----------------------------#

x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

import xgboost as xgb
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)


In [125]:
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
StackingSubmission.to_csv("TitanicSubmission.csv", index=False)