# Analysis of survivors of Titanic

In [56]:
import random

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate,train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

## Load DataSet

In [57]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/gender_submission.csv", index_col="PassengerId")
data = train.append(test) # The entire data: train + test.
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Preprocessing

### Encode Sex

In [58]:
train.loc[train['Sex'] == 'male', 'Sex_encode'] = 0
train.loc[train['Sex'] == 'female', 'Sex_encode'] = 1
train.Sex_encode = train.Sex_encode.astype('int32')

test.loc[test['Sex'] == 'male', 'Sex_encode'] = 0
test.loc[test['Sex'] == 'female', 'Sex_encode'] = 1
test.Sex_encode = test.Sex_encode.astype('int32')

### Find null data

In [59]:
print("train's null data")
for x in train.columns:
    null_value = train[pd.isnull(train[x])]
    print(x, np.shape(null_value))
print("\ntest's null data")
for x in test.columns:
    null_value = test[pd.isnull(test[x])]
    print(x, np.shape(null_value))

train's null data
PassengerId (0, 13)
Survived (0, 13)
Pclass (0, 13)
Name (0, 13)
Sex (0, 13)
Age (177, 13)
SibSp (0, 13)
Parch (0, 13)
Ticket (0, 13)
Fare (0, 13)
Cabin (687, 13)
Embarked (2, 13)
Sex_encode (0, 13)

test's null data
PassengerId (0, 12)
Pclass (0, 12)
Name (0, 12)
Sex (0, 12)
Age (86, 12)
SibSp (0, 12)
Parch (0, 12)
Ticket (0, 12)
Fare (1, 12)
Cabin (327, 12)
Embarked (0, 12)
Sex_encode (0, 12)


### Fill Fare null value

In [60]:
mean_fare = train['Fare'].mean()
max_fare = train['Fare'].max()
print("max fare: {0:.3f}$, mean fare: {1:.3f}$".format(mean_fare, max_fare))

max fare: 32.204$, mean fare: 512.329$


In [61]:
mean_fare = test['Fare'].mean()
max_fare = test['Fare'].max()
print("max fare: {0:.3f}$, mean fare: {1:.3f}$".format(mean_fare, max_fare))

max fare: 35.627$, mean fare: 512.329$


In [62]:
train.loc[pd.isnull(train["Fare"]), "Fare"] = mean_fare
test.loc[pd.isnull(test["Fare"]), "Fare"] = mean_fare

### Fill Age null value

In [63]:
mean_age = train['Age'].mean()
max_age = train['Age'].max()
print("mean_age: {0:.3f}, max_age: {1:.3f}".format(mean_age, max_age))

mean_age: 29.699, max_age: 80.000


In [64]:
mean_age = test['Age'].mean()
max_age = test['Age'].max()
print("mean_age: {0:.3f}, max_age: {1:.3f}".format(mean_age, max_age))

mean_age: 30.273, max_age: 76.000


In [65]:
train.loc[pd.isnull(train["Age"]), "Age"] = int(mean_age)
test.loc[pd.isnull(test["Age"]), "Age"] = int(mean_age)

### Embarked

In [66]:
# temp_pd = pd.get_dummies(train.Embarked, prefix="Embarked")
# train = pd.concat([train, temp_pd], axis = 1)
# temp_pd = pd.get_dummies(test.Embarked, prefix="Embarked")
# test = pd.concat([test, temp_pd], axis = 1)
# train

# null_value = train[pd.isnull(train["Embarked"])]
# null_value

### FareBand feature

In [67]:
# #fill in missing Fare value in training set based on mean fare for that Pclass 
# for x in range(len(train["Fare"])):
#     if pd.isnull(train["Fare"][x]):
#         pclass = train["Pclass"][x] #Pclass = 3
#         train["Fare"][x] = round(train[train["Pclass"] == pclass]["Fare"].mean(), 8)
        
# #fill in missing Fare value in test set based on mean fare for that Pclass
# for x in range(len(test["Fare"])):
#     if pd.isnull(test["Fare"][x]):
#         pclass = test["Pclass"][x] #Pclass = 3
#         test["Fare"][x] = round(test[test["Pclass"] == pclass]["Fare"].mean(), 8)
        
# #map Fare values into groups of numerical values
# data["FareBand"] = pd.qcut(data['Fare'], 8, labels = [1, 2, 3, 4,5,6,7,8]).astype('int')
# train["FareBand"] = pd.qcut(train['Fare'], 8, labels = [1, 2, 3, 4,5,6,7,8]).astype('int')
# test["FareBand"] = pd.qcut(test['Fare'], 8, labels = [1, 2, 3, 4,5,6,7,8]).astype('int')
# train[["FareBand", "Survived"]].groupby(["FareBand"], as_index=False).mean()
# print('FareBand feature created')

### Title Feature

In [68]:
#Get titles
data["Title"] = data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

#Unify common titles. 
data["Title"] = data["Title"].replace('Mlle', 'Miss')
data["Title"] = data["Title"].replace('Master', 'Master')
data["Title"] = data["Title"].replace(['Mme', 'Dona', 'Ms'], 'Mrs')
data["Title"] = data["Title"].replace(['Jonkheer','Don'],'Mr')
data["Title"] = data["Title"].replace(['Capt','Major', 'Col','Rev','Dr'], 'Millitary')
data["Title"] = data["Title"].replace(['Lady', 'Countess','Sir'], 'Honor')

# Age in df_train and df_test:
train["Title"] = data['Title'][:891]
test["Title"] = data['Title'][891:]

# convert Title categories to Columns
titledummies=pd.get_dummies(train[['Title']], prefix_sep='_') #Title
train = pd.concat([train, titledummies], axis=1) 
ttitledummies=pd.get_dummies(test[['Title']], prefix_sep='_') #Title
test = pd.concat([test, ttitledummies], axis=1) 
print('Title categories added')

Title categories added


### Title Cetegory

In [69]:
# Mapping titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Millitary": 5, "Honor": 6}
data["TitleCat"] = data['Title'].map(title_mapping)
data["TitleCat"] = data["TitleCat"].astype(int)
train["TitleCat"] = data["TitleCat"][:891]
test["TitleCat"] = data["TitleCat"][891:]
print('Title Category created')

Title Category created


### Fill age based on title

In [70]:
titles = ['Master', 'Miss', 'Mr', 'Mrs', 'Millitary','Honor']
for title in titles:
    age_to_impute = data.groupby('Title')['Age'].median()[title]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute
# Age in df_train and df_test:
train["Age"] = data['Age'][:891]
test["Age"] = data['Age'][891:]
print('Missing Ages Estimated')

Missing Ages Estimated


### Create AgeBands

In [71]:
# sort Age into band categories
# bins = [0,7,12,18,35,np.inf]
bins = [0,12,24,45,50,np.inf]
labels = ['Child', 'Young Adult', 'Adult','Older Adult','Senior']
train["AgeBand"] = pd.cut(train["Age"], bins, labels = labels)
test["AgeBand"] = pd.cut(test["Age"], bins, labels = labels)
print('Age Feature created')

# convert AgeGroup categories to Columns
dummies=pd.get_dummies(train[["AgeBand"]], prefix_sep='_')
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test[["AgeBand"]], prefix_sep='_')
test = pd.concat([test, dummies], axis=1)
print('AgeBand feature created')

# pd.cut(train["Age"], bins, labels = labels)

Age Feature created
AgeBand feature created


### Lone Travellers Feature

In [72]:
train["Alone"] = np.where(train['SibSp'] + train['Parch'] + 1 == 1, 1,0) # People travelling alone
test["Alone"] = np.where(test['SibSp'] + test['Parch'] + 1 == 1, 1,0) # People travelling alone
print('Lone traveller feature created')

Lone traveller feature created


### Mother

In [73]:
data['Mother'] = (data['Title'] == 'Mrs') & (data['Parch'] > 0)
data['Mother'] = data['Mother'].astype(int)

train["Mother"] = data["Mother"][:891]
test["Mother"] = data["Mother"][891:]
print('Mother Category created')

Mother Category created


### Family Size Feature

In [74]:
train["Family Size"] = (train['SibSp'] + train['Parch'] + 1)
test["Family Size"] = test['SibSp'] + test['Parch'] + 1
print('Family size feature created')

Family size feature created


### Family Survival

In [75]:
# get last name
data["Last_Name"] = data['Name'].apply(lambda x: str.split(x, ",")[0])
# Set survival value
DEFAULT_SURVIVAL_VALUE = 0.5
data["Family_Survival"] = DEFAULT_SURVIVAL_VALUE

# Find Family groups by Fare
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      data.loc[data['Family_Survival']!=0.5].shape[0])

# Find Family groups by Ticket
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(data[data['Family_Survival']!=0.5].shape[0]))

# Family_Survival in train and test:
train["Family_Survival"] = data['Family_Survival'][:891]
test["Family_Survival"] = data['Family_Survival'][891:]

Number of passengers with family survival information: 420
Number of passenger with family/group survival information: 546


In [76]:
train["SibSp_has"] = np.where(train['SibSp'] != 0 , 1,0)
test["SibSp_has"] = np.where(test['SibSp'] != 0 , 1,0)
print('SibSp_has feature created')

train

SibSp_has feature created


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encode,Title,Title_Honor,Title_Master,Title_Millitary,Title_Miss,Title_Mr,Title_Mrs,TitleCat,AgeBand,AgeBand_Child,AgeBand_Young Adult,AgeBand_Adult,AgeBand_Older Adult,AgeBand_Senior,Alone,Mother,Family Size,Family_Survival,SibSp_has
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,Mr,0,0,0,0,1,0,1,Young Adult,0,1,0,0,0,0,0,2,0.5,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,0,0,0,0,0,1,3,Adult,0,0,1,0,0,0,0,2,0.5,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1,Miss,0,0,0,1,0,0,2,Adult,0,0,1,0,0,1,0,1,0.5,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,Mrs,0,0,0,0,0,1,3,Adult,0,0,1,0,0,0,0,2,0.0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,Mr,0,0,0,0,1,0,1,Adult,0,0,1,0,0,1,0,1,0.5,0
5,6,0,3,"Moran, Mr. James",male,29.0,0,0,330877,8.4583,,Q,0,Mr,0,0,0,0,1,0,1,Adult,0,0,1,0,0,1,0,1,0.5,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0,Mr,0,0,0,0,1,0,1,Senior,0,0,0,0,1,1,0,1,0.5,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,0,Master,0,1,0,0,0,0,4,Child,1,0,0,0,0,0,0,5,0.0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1,Mrs,0,0,0,0,0,1,3,Adult,0,0,1,0,0,0,1,3,1.0,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1,Mrs,0,0,0,0,0,1,3,Young Adult,0,1,0,0,0,0,0,2,0.0,1


### Cabin feature

In [77]:
# check if cabin inf exists
data["HadCabin"] = (data["Cabin"].notnull().astype('int'))
# split Embanked into df_train and df_test:
train["HadCabin"] = data["HadCabin"][:891]
test["HadCabin"] = data["HadCabin"][891:]
print('Cabin feature created')

Cabin feature created


### Deck feature

In [78]:
# Extract Deck
data["Deck"] = data.Cabin.str.extract('([A-Za-z])', expand=False)
data["Deck"] = data["Deck"].fillna("N")
# Map Deck
deck_mapping = {"N":0,"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
data['Deck'] = data['Deck'].map(deck_mapping)
#Split to training and test
train["Deck"] = data["Deck"][:891]
test["Deck"] = data["Deck"][891:]
print('Deck feature created')

#Map and Create Deck feature for training
data["Deck"] = data.Cabin.str.extract('([A-Za-z])', expand=False)
deck_mapping = {"0":0,"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
data['Deck'] = data['Deck'].map(deck_mapping)
data["Deck"] = data["Deck"].fillna("0")
data["Deck"]=data["Deck"].astype('int')

train["Deck"] = data['Deck'][:891]
test["Deck"] = data['Deck'][891:]
print('Deck feature created')

# convert categories to Columns
dummies=pd.get_dummies(train['Deck'].astype('category'), prefix='Deck') #Gender
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test['Deck'].astype('category'), prefix='Deck') #Gender
test = pd.concat([test,dummies], axis=1)
print('Deck Categories created')

Deck feature created
Deck feature created
Deck Categories created


### Ticket feature

In [79]:
## Treat Ticket by extracting the ticket prefix. When there is no prefix it returns X. 

Ticket = []
for i in list(data.Ticket):
    if not i.isdigit() :
        Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0]) #Take prefix
    else:
        Ticket.append("X")
        
data["Ticket"] = Ticket
data["Ticket"].head()

train["Ticket"] = data["Ticket"][:891]
test["Ticket"] = data["Ticket"][891:]
print('Ticket feature created')

Ticket feature created


### Ticket Type Feature

In [80]:
# ticket prefix

data['TicketRef'] = data['Ticket'].apply(lambda x: str(x)[0])
data['TicketRef'].value_counts()
#data["ticketBand"] = pd.qcut(data['ticket_ref'], 5, labels = [1, 2, 3, 4,5]).astype('int')

# split to test and training
train["TicketRef"] = data["TicketRef"][:891]
test["TicketRef"] = data["TicketRef"][891:]

# convert AgeGroup categories to Columns
dummies=pd.get_dummies(train["TicketRef"].astype('category'), prefix='TicketRef') #Embarked
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test["TicketRef"].astype('category'), prefix='TicketRef') #Embarked
test = pd.concat([test, dummies], axis=1)
print("TicketBand categories created")

TicketBand categories created


### Passenger Class Feature

In [81]:
# convert AgeGroup categories to Columns
dummies=pd.get_dummies(train["Pclass"].astype('category'), prefix='Pclass') #Embarked
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test["Pclass"].astype('category'), prefix='Pclass') #Embarked
test = pd.concat([test, dummies], axis=1)
print("pclass categories created")

pclass categories created


### Free Passage

In [82]:
# create free feature based on fare = 0 
data["Free"] = np.where(data['Fare'] ==0, 1,0)
data["Free"] = data['Free'].astype(int)

train["Free"] = data["Free"][:891]
test["Free"] = data["Free"][891:]
print('Free Category created')

Free Category created


### FareBand

In [83]:
Pclass = [1,2,3]
for aclass in Pclass:
    fare_to_impute = data.groupby('Pclass')['Fare'].median()[aclass]
    data.loc[(data['Fare'].isnull()) & (data['Pclass'] == aclass), 'Fare'] = fare_to_impute
        
train["Fare"] = data["Fare"][:891]
test["Fare"] = data["Fare"][891:]        

#map Fare values into groups of numerical values
train["FareBand"] = pd.qcut(train['Fare'], 4, labels = [1, 2, 3, 4]).astype('category')
test["FareBand"] = pd.qcut(test['Fare'], 4, labels = [1, 2, 3, 4]).astype('category')

# convert FareBand categories to Columns
dummies=pd.get_dummies(train[["FareBand"]], prefix_sep='_') #Embarked
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test[["FareBand"]], prefix_sep='_') #Embarked
test = pd.concat([test, dummies], axis=1)
print("Fareband categories created")

Fareband categories created


### Embarked categories

In [84]:
# convert Embarked categories to Columns
dummies=pd.get_dummies(train["Embarked"].astype('category'), prefix='Embarked') #Embarked
train = pd.concat([train, dummies], axis=1) 
dummies=pd.get_dummies(test["Embarked"].astype('category'), prefix='Embarked') #Embarked
test = pd.concat([test, dummies], axis=1)
print("Embarked feature created")

Embarked feature created


## Data describe

In [85]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_encode,Title_Honor,Title_Master,Title_Millitary,Title_Miss,Title_Mr,Title_Mrs,TitleCat,AgeBand_Child,AgeBand_Young Adult,AgeBand_Adult,AgeBand_Older Adult,AgeBand_Senior,Alone,Mother,Family Size,Family_Survival,SibSp_has,HadCabin,Deck,Deck_0,Deck_1,Deck_2,Deck_3,Deck_4,Deck_5,TicketRef_A,TicketRef_C,TicketRef_F,TicketRef_L,TicketRef_P,TicketRef_S,TicketRef_W,TicketRef_X,Pclass_1,Pclass_2,Pclass_3,Free,FareBand_1,FareBand_2,FareBand_3,FareBand_4,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.30266,0.523008,0.381594,32.204208,0.352413,0.003367,0.044893,0.020202,0.20651,0.582492,0.142536,1.723906,0.08193,0.27385,0.527497,0.044893,0.071829,0.602694,0.062851,1.904602,0.519641,0.317621,0.228956,0.648709,0.791246,0.016835,0.05275,0.066218,0.037037,0.035915,0.032548,0.05275,0.007856,0.004489,0.072952,0.072952,0.01459,0.741863,0.242424,0.20651,0.551066,0.016835,0.250281,0.251403,0.249158,0.249158,0.188552,0.08642,0.722783
std,257.353842,0.486592,0.836071,13.240228,1.102743,0.806057,49.693429,0.47799,0.057961,0.207186,0.14077,0.405028,0.493425,0.349796,1.032114,0.274413,0.446183,0.499524,0.207186,0.25835,0.489615,0.242831,1.613459,0.323961,0.465813,0.420397,1.376792,0.406647,0.128725,0.223659,0.248802,0.188959,0.186182,0.177549,0.223659,0.088337,0.06689,0.260203,0.260203,0.119973,0.437855,0.42879,0.405028,0.497665,0.128725,0.433418,0.434063,0.432769,0.432769,0.391372,0.281141,0.447876
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,446.0,0.0,3.0,29.0,0.0,0.0,14.4542,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,0.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [86]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_encode,Title,Title_Master,Title_Millitary,Title_Miss,Title_Mr,Title_Mrs,TitleCat,AgeBand,AgeBand_Child,AgeBand_Young Adult,AgeBand_Adult,AgeBand_Older Adult,AgeBand_Senior,Alone,Mother,Family Size,Family_Survival,SibSp_has,HadCabin,Deck,Deck_0,Deck_1,Deck_2,Deck_3,Deck_4,Deck_5,TicketRef,TicketRef_A,TicketRef_C,TicketRef_F,TicketRef_L,TicketRef_P,TicketRef_S,TicketRef_W,TicketRef_X,Pclass_1,Pclass_2,Pclass_3,Free,FareBand,FareBand_1,FareBand_2,FareBand_3,FareBand_4,Embarked_C,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",male,34.5,0,0,X,7.8292,,Q,0,Mr,0,0,0,1,0,1,Adult,0,0,1,0,0,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,X,7.0000,,S,1,Mrs,0,0,0,0,1,3,Older Adult,0,0,0,1,0,0,0,2,0.5,1,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,X,9.6875,,Q,0,Mr,0,0,0,1,0,1,Senior,0,0,0,0,1,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,1,0,0,2,0,1,0,0,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,X,8.6625,,S,0,Mr,0,0,0,1,0,1,Adult,0,0,1,0,0,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,0,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,X,12.2875,,S,1,Mrs,0,0,0,0,1,3,Young Adult,0,1,0,0,0,0,1,3,1.0,1,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,0,0,0,1
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,X,9.2250,,S,0,Mr,0,0,0,1,0,1,Young Adult,0,1,0,0,0,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,0,0,0,1
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,X,7.6292,,Q,1,Miss,0,0,1,0,0,2,Adult,0,0,1,0,0,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,X,29.0000,,S,0,Mr,0,0,0,1,0,1,Adult,0,0,1,0,0,0,0,3,1.0,1,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,1,0,0,3,0,0,1,0,0,0,1
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,X,7.2292,,C,1,Mrs,0,0,0,0,1,3,Young Adult,0,1,0,0,0,1,0,1,0.5,0,0,0,1,0,0,0,0,0,X,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A4,24.1500,,S,0,Mr,0,0,0,1,0,1,Young Adult,0,1,0,0,0,0,0,3,0.0,1,0,0,1,0,0,0,0,0,A,1,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,0,0,0,1


In [87]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex_encode', 'Title',
       'Title_Master', 'Title_Millitary', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'TitleCat', 'AgeBand', 'AgeBand_Child',
       'AgeBand_Young Adult', 'AgeBand_Adult', 'AgeBand_Older Adult',
       'AgeBand_Senior', 'Alone', 'Mother', 'Family Size', 'Family_Survival',
       'SibSp_has', 'HadCabin', 'Deck', 'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3',
       'Deck_4', 'Deck_5', 'TicketRef', 'TicketRef_A', 'TicketRef_C',
       'TicketRef_F', 'TicketRef_L', 'TicketRef_P', 'TicketRef_S',
       'TicketRef_W', 'TicketRef_X', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Free', 'FareBand', 'FareBand_1', 'FareBand_2', 'FareBand_3',
       'FareBand_4', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

## Train

In [88]:
feature_names = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
       'Fare', 'Sex_encode',
       'Title_Master', 'Title_Millitary', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'TitleCat', 'AgeBand_Child',
       'AgeBand_Young Adult', 'AgeBand_Adult', 'AgeBand_Older Adult',
       'AgeBand_Senior', 'Alone', 'Mother', 'Family Size', 'Family_Survival',
       'SibSp_has', 'HadCabin', 'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3',
       'Deck_4', 'Deck_5', 'TicketRef_A', 'TicketRef_C',
       'TicketRef_F', 'TicketRef_L', 'TicketRef_P', 'TicketRef_S',
       'TicketRef_W', 'TicketRef_X', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Free', 'FareBand', 'FareBand_1', 'FareBand_2', 'FareBand_3',
       'FareBand_4', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
label_name = "Survived"
x_train = train[feature_names]
y_train = train[label_name]
x_test = test[feature_names]

### testing GB Random Features

In [89]:
# features = []
# score = []
# for i in range(1000):
#     feature_names = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
#        'Fare', 'Sex_encode',
#        'Title_Master', 'Title_Millitary', 'Title_Miss', 'Title_Mr',
#        'Title_Mrs', 'TitleCat', 'AgeBand_Child',
#        'AgeBand_Young Adult', 'AgeBand_Adult', 'AgeBand_Older Adult',
#        'AgeBand_Senior', 'Alone', 'Mother', 'Family Size', 'Family_Survival',
#        'SibSp_has', 'HadCabin', 'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3',
#        'Deck_4', 'Deck_5', 'TicketRef_A', 'TicketRef_C',
#        'TicketRef_F', 'TicketRef_L', 'TicketRef_P', 'TicketRef_S',
#        'TicketRef_W', 'TicketRef_X', 'Pclass_1', 'Pclass_2', 'Pclass_3',
#        'Free', 'FareBand', 'FareBand_1', 'FareBand_2', 'FareBand_3',
#        'FareBand_4', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
#     temp_features = []
#     for j in range(random.randint(1, len(feature_names)-1)):
#         temp_features.append(feature_names.pop(random.randint(0, len(feature_names)-1)))
#     data_to_train = x_train[temp_features]
#     X_train, X_val, y_train, y_val = train_test_split(data_to_train, y_train, test_size=0.3, random_state=21, stratify=y_train)
# #     print('Data split')

#     # Gradient Boosting Classifier
#     gbk = GradientBoostingClassifier()
#     gbk.fit(X_train, y_train)
#     y_pred = gbk.predict(X_val)
#     acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)
# #     print(acc_gbk)
    
#     features.append(temp_features)
#     score.append(acc_gbk)
    
# models = pd.DataFrame({
#     'Features': features,
#     'Score': score})
# models.sort_values(by='Score', ascending=False).head(10)

In [90]:
# feature_names = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
#        'Fare', 'Sex_encode',
#        'Title_Master', 'Title_Millitary', 'Title_Miss', 'Title_Mr',
#        'Title_Mrs', 'TitleCat', 'AgeBand_Child',
#        'AgeBand_Young Adult', 'AgeBand_Adult', 'AgeBand_Older Adult',
#        'AgeBand_Senior', 'Alone', 'Mother', 'Family Size', 'Family_Survival',
#        'SibSp_has', 'HadCabin', 'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3',
#        'Deck_4', 'Deck_5', 'TicketRef_A', 'TicketRef_C',
#        'TicketRef_F', 'TicketRef_L', 'TicketRef_P', 'TicketRef_S',
#        'TicketRef_W', 'TicketRef_X', 'Pclass_1', 'Pclass_2', 'Pclass_3',
#        'Free', 'FareBand', 'FareBand_1', 'FareBand_2', 'FareBand_3',
#        'FareBand_4', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
# temp1 = set(models.iloc[47]["Features"])
# temp2 = set(feature_names)
# print(temp2 - temp1)
# print(temp1)

In [91]:
# temp1 = {'Embarked_S', 'Pclass_3', 'Title_Miss', 'AgeBand_Older Adult', 'Title_Mr', 'Deck_5', 
#  'TicketRef_A', 'Free', 'TicketRef_C', 'Family Size', 'SibSp', 'PassengerId', 'Embarked_C', 'FareBand_2', 'Deck_2'}
# temp1 = list(temp1)

In [92]:
# for i in range(5, 5):
#     model = DecisionTreeClassifier(max_depth=i)
#     model.fit(x_train, y_train)
#     print(model.score(x_train, y_train))

# model = RandomForestClassifier(n_estimators=1000, random_state=2)
# model.fit(x_train, y_train)

## Predict

In [93]:
feature_names = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch',
       'Fare', 'Sex_encode',
       'Title_Master', 'Title_Millitary', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'TitleCat', 'AgeBand_Child',
       'AgeBand_Young Adult', 'AgeBand_Adult', 'AgeBand_Older Adult',
       'AgeBand_Senior', 'Alone', 'Mother', 'Family Size', 'Family_Survival',
       'SibSp_has', 'HadCabin', 'Deck_0', 'Deck_1', 'Deck_2', 'Deck_3',
       'Deck_4', 'Deck_5', 'TicketRef_A', 'TicketRef_C',
       'TicketRef_F', 'TicketRef_L', 'TicketRef_P', 'TicketRef_S',
       'TicketRef_W', 'TicketRef_X', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Free', 'FareBand', 'FareBand_1', 'FareBand_2', 'FareBand_3',
       'FareBand_4', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
label_name = "Survived"
x_train = train[feature_names]
y_train = train[label_name]
x_test = test[feature_names]

In [94]:
# x_test = test[feature_names]
# prediction = model.predict(x_test)
# len(prediction)

In [95]:
# from sklearn.tree import export_graphviz
# import graphviz

# export_graphviz(model,
#                 feature_names=feature_names,
#                 class_names=["Perish", "Survived"],
#                 out_file="decision-tree.dot")

# with open("decision-tree.dot") as f:
#     dot_graph = f.read()
    
# graphviz.Source(dot_graph)

In [106]:
# Gradient Boosting Classifier

# Setup the parameters and distributions to sample from: param_dist
param_dist = {'max_depth':np.arange(1, 7),
              'min_samples_leaf': np.arange(1, 10),
              'n_estimators':np.arange(10, 100, 10),
              'max_features':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}

# Instantiate Classifier
gbk = GradientBoostingClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
gbk_cv = RandomizedSearchCV(gbk, param_dist, cv=30)

gbk_cv.fit(x_train, y_train)
prediction = gbk_cv.predict(x_test)

print("Tuned Gradient Boost Parameters: {}".format(gbk_cv.best_params_))
print("Best score is {}".format(gbk_cv.best_score_))

Tuned Gradient Boost Parameters: {'n_estimators': 20, 'min_samples_leaf': 6, 'max_features': 0.9, 'max_depth': 3}
Best score is 0.8540965207631874


In [107]:
# gb_cv = GradientBoostingClassifier(n_estimators = 100,
#                                     max_depth = 3,
#                                     max_features = 0.9,
#                                     random_state = 27)
# # Fit and prediction
# gb_cv.fit(x_train[temp1], y_train)
# prediction = gb_cv.predict(x_test[temp1])

## Submit

In [108]:
submission = pd.read_csv("data/gender_submission.csv")
submission["Survived"] = prediction
submission.to_csv("submission/baseline_GB.csv", index=False)