In [249]:
# Import all libraries needed
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import tree
# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import KFold;
from sklearn.model_selection import cross_val_score


In [250]:
# Import data needed and separate them into training and test dataset
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

# Just to be careful, we will copy them here
original_train =  train.copy()
original_test = test.copy()
PassengerId = test['PassengerId']

# Check the columns
print(train.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [251]:
# Check the sample data
print(train.head())
print(test.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [252]:
# Get the statistics
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [253]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [254]:
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return "Unknown"

data = [train, test]

# Create new features
for partial in data: 
    partial["Title"] = "Unknown"
    partial["Title"] = partial["Name"].apply(get_title)



In [255]:
# Find the proportion of survived based on features
train["Survived"][train["Age"] < 18].value_counts(normalize = True, dropna = True)
train["Survived"][train["Age"] >= 18].value_counts(normalize = True, dropna = True)

# Adult tends to not survived

0    0.618968
1    0.381032
Name: Survived, dtype: float64

In [256]:
import numpy as np

data = [train, test]

# Remove NA values
for partial in data:
    partial["Fare"] = partial["Fare"].fillna(train.Fare.median())
    partial["Embarked"] = partial["Embarked"].fillna('S')
    # Fill age by using random number
    avg_age = partial["Age"].mean()
    var_age = partial["Age"].std()
    null_count = partial["Age"].isnull().sum()
    age_null_random_list = np.random.randint(avg_age - var_age, avg_age + var_age, size=null_count)
    partial.loc[np.isnan(partial['Age']), 'Age'] = age_null_random_list
    partial['Age'] = partial['Age'].astype(int)
    
    # Group data
    partial['Title'] = partial['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    partial['Title'] = partial['Title'].replace('Mlle', 'Miss')
    partial['Title'] = partial['Title'].replace('Ms', 'Miss')
    partial['Title'] = partial['Title'].replace('Mme', 'Mrs')
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    partial['Title'] = partial['Title'].map(title_mapping)
    partial['Title'] = partial['Title'].fillna(0)
    
    # Mapping Embarked
    partial['Embarked'] = partial['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Sex
    partial['Sex'] = partial['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
     # Mapping Fare
    partial.loc[ partial['Fare'] <= 7.91, 'Fare'] 						        = 0
    partial.loc[(partial['Fare'] > 7.91) & (partial['Fare'] <= 14.454), 'Fare'] = 1
    partial.loc[(partial['Fare'] > 14.454) & (partial['Fare'] <= 31), 'Fare']   = 2
    partial.loc[ partial['Fare'] > 31, 'Fare'] 							        = 3
    partial['Fare'] = partial['Fare'].astype(int)
    
    # Mapping Age
    partial.loc[ partial['Age'] <= 16, 'Age'] 					       = 0
    partial.loc[(partial['Age'] > 16) & (partial['Age'] <= 32), 'Age'] = 1
    partial.loc[(partial['Age'] > 32) & (partial['Age'] <= 48), 'Age'] = 2
    partial.loc[(partial['Age'] > 48) & (partial['Age'] <= 64), 'Age'] = 3
    partial.loc[ partial['Age'] > 64, 'Age'] = 4 ;

    
train.isnull().sum()
    
    


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [257]:
# Feature engineering
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [258]:
test.isnull().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Title            0
FamilySize       0
IsAlone          0
Has_Cabin        0
dtype: int64

In [259]:
# Convert categorical variables to factor
train["Embarked"] = train["Embarked"].astype("category")
train["Title"] = train["Title"].astype("category")
train["Sex"] = train["Sex"].astype("category")

In [260]:
# Cleaning the data
# Preparing the data
# Drop unused columns
train = train.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'])
test = test.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'])

In [261]:
# Calculate Gini impurity
def get_gini_impurity(survived_count, total):
    survived_prob = survived_count / total
    survived_prob_c  = (1 -  survived_prob)
    rand_survived_prob = survived_prob
    rand_survived_prob_c = (1 - rand_survived_prob)
    mislabelling_survived_prob = survived_prob_c * rand_survived_prob
    mislabelling_not_survived_prob = survived_prob * rand_survived_prob_c
    gini_impurity = mislabelling_not_survived_prob + mislabelling_survived_prob
    return gini_impurity
    

In [262]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).agg(['mean', 'count', 'sum'])

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,mean,count,sum
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.742038,314,233
1,0.188908,577,109


In [263]:
# Let's use our 'original_train' dataframe to check the sex distribution for each title.
# We use copy() again to prevent modifications in out original_train dataset
title_and_sex = original_train.copy()[['Name', 'Sex']]

# Create 'Title' feature
title_and_sex['Title'] = title_and_sex['Name'].apply(get_title)

# Map 'Sex' as binary feature
title_and_sex['Sex'] = title_and_sex['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Table with 'Sex' distribution grouped by 'Title'
title_and_sex[['Title', 'Sex']].groupby(['Title'], as_index=False).agg(['mean', 'count', 'sum'])


Unnamed: 0_level_0,Sex,Sex,Sex
Unnamed: 0_level_1,mean,count,sum
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,1.0,1,1
Col,1.0,2,2
Don,1.0,1,1
Dr,0.857143,7,6
Jonkheer,1.0,1,1
Lady,0.0,1,0
Major,1.0,2,2
Master,1.0,40,40
Miss,0.0,182,0
Mlle,0.0,2,0


In [264]:
# Gini Impurity of starting node
gini_impurity_starting_node = get_gini_impurity(342, 891)
gini_impurity_starting_node

0.47301295786144265

In [265]:
gini_impurity_men = get_gini_impurity(109, 577)
gini_impurity_men

0.3064437162277843

In [266]:
# Gini Impurity decrease if node splited for 'female' observations
gini_impurity_women = get_gini_impurity(233, 314)
gini_impurity_women

0.3828350034484158

In [267]:
# Gini Impurity decrease if node splited by Sex
men_weight = 577/891
women_weight = 314/891
weighted_gini_impurity_sex_split = (gini_impurity_men * men_weight) + (gini_impurity_women * women_weight)

sex_gini_decrease = weighted_gini_impurity_sex_split - gini_impurity_starting_node
sex_gini_decrease


-0.13964795747285214

In [268]:
# Gini Impurity decrease of node for observations with Title == 1 == Mr
gini_impurity_title_1 = get_gini_impurity(81, 517)
gini_impurity_title_1

0.26425329886377663

In [269]:
# Gini Impurity decrease if node splited for observations with Title != 1 != Mr
gini_impurity_title_others = get_gini_impurity(261, 374)
gini_impurity_title_others

0.42170207898424317

In [270]:
# Gini Impurity decrease if node splited for observations with Title == 1 == Mr
title_1_weight = 517/891
title_others_weight = 374/891
weighted_gini_impurity_title_split = (gini_impurity_title_1 * title_1_weight) + (gini_impurity_title_others * title_others_weight)

title_gini_decrease = weighted_gini_impurity_title_split - gini_impurity_starting_node
title_gini_decrease

-0.14267004758907514

In [271]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
Parch         0
Fare          0
Embarked      0
Title         0
FamilySize    0
IsAlone       0
Has_Cabin     0
dtype: int64

In [272]:
# List of our model
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
ntest = test.shape[0]
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)

In [273]:


# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
        

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['Survived'].ravel()
train = train.drop(['Survived'], axis=1)
x_train = train.values # Creates an array of the train data
x_test = test.values # Create an array of the test data

# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [277]:
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [278]:
# Generate Submission File 
StackingSubmission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
StackingSubmission.to_csv("submission.csv", index=False)