In [1]:
import pandas as pd
import numpy as np
import random

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def substrings_in_string(full_string, substrings):
    for substring in substrings:
        if full_string.find(substring) != -1:
            return substring
    return ""

In [3]:
# Load Training Data
df_train = pd.read_csv('data/train.csv', index_col = 'PassengerId')
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Load Test Data
df_test = pd.read_csv('data/test.csv', index_col = 'PassengerId')
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Merge Data
df_all = pd.concat([df_train, df_test])
print('All data({0[0]},{0[1]})'.format(df_all.shape))
df_all.head()

All data(1309,11)


Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450


In [6]:
# Select Numeric Featues
df_numeric = df_all[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']]
df_numeric.head()

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,22.0,7.25,0,3,1
2,38.0,71.2833,0,1,1
3,26.0,7.925,0,3,0
4,35.0,53.1,0,1,1
5,35.0,8.05,0,3,0


In [7]:
# Select Categorical Features
df_category = df_all[['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket']]
df_category.head()

Unnamed: 0_level_0,Cabin,Embarked,Name,Sex,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,S,"Braund, Mr. Owen Harris",male,A/5 21171
2,C85,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599
3,,S,"Heikkinen, Miss. Laina",female,STON/O2. 3101282
4,C123,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803
5,,S,"Allen, Mr. William Henry",male,373450


In [8]:
# Numeric Feature Engineering

# Create a Family Size Column
df_numeric['FamilySize']= df_numeric['SibSp'] + df_numeric['Parch']

# Create a Fare/Person Column
df_numeric['FarePerPerson'] = df_numeric['Fare'] / (df_numeric['FamilySize'] +1)

df_numeric.head()

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp,FamilySize,FarePerPerson
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,22.0,7.25,0,3,1,1,3.625
2,38.0,71.2833,0,1,1,1,35.64165
3,26.0,7.925,0,3,0,0,7.925
4,35.0,53.1,0,1,1,1,26.55
5,35.0,8.05,0,3,0,0,8.05


In [9]:
# Category Feature Engineering

# Create a Deck Column
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df_category['Cabin'] = df_category['Cabin'].astype(str)
df_category['Deck'] = df_category['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

df_category.head()

Unnamed: 0_level_0,Cabin,Embarked,Name,Sex,Ticket,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,S,"Braund, Mr. Owen Harris",male,A/5 21171,
2,C85,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C
3,,S,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,
4,C123,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C
5,,S,"Allen, Mr. William Henry",male,373450,


In [10]:
# Impute Missing Numeric Values
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
numeric_data = imp.fit_transform(df_numeric)

# Normalize Numeric Features
numeric_column_names = df_numeric.columns
from sklearn.preprocessing import Normalizer
norm = Normalizer(copy=False)
numeric_data = norm.fit_transform(numeric_data)

df_numeric = pd.DataFrame(columns=numeric_column_names, data=numeric_data, index=df_numeric.index)

print('({0[0]},{0[1]})'.format(df_numeric.shape))
df_numeric.head()

(1309,7)


Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp,FamilySize,FarePerPerson
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.929087,0.306176,0.0,0.126694,0.042231,0.042231,0.153088
2,0.430303,0.807195,0.0,0.011324,0.011324,0.011324,0.403598
3,0.913202,0.278351,0.0,0.10537,0.0,0.0,0.278351
4,0.507699,0.770252,0.0,0.014506,0.014506,0.014506,0.385126
5,0.947815,0.217998,0.0,0.081241,0.0,0.0,0.217998


In [11]:
# Encode Categorical Features
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()

for column in df_category:
    dt = df_category[column].dtype
    colum_index = df_category.columns.get_loc(df_category[column].name)
    
    # Clean-up NaN's in the Categorical data
    if df_category[column].dtype == int or df_category[column].dtype == float:
        df_category[column].fillna(0, inplace=True)
    else:
        df_category[column].fillna("", inplace=True)

    df_category[column] = label_enc.fit_transform(df_category[column])

print('({0[0]},{0[1]})'.format(df_category.shape))
df_category.head()

(1309,6)


Unnamed: 0_level_0,Cabin,Embarked,Name,Sex,Ticket,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,186,3,155,1,720,0
2,106,1,286,0,816,3
3,186,3,523,0,914,0
4,70,3,422,0,65,3
5,186,3,22,1,649,0


In [12]:
# Merge Numeric and Category Feaures
df_clean = pd.concat([df_category, df_numeric], axis=1)

print('({0[0]},{0[1]})'.format(df_clean.shape))
df_clean.head()

(1309,13)


Unnamed: 0_level_0,Cabin,Embarked,Name,Sex,Ticket,Deck,Age,Fare,Parch,Pclass,SibSp,FamilySize,FarePerPerson
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,186,3,155,1,720,0,0.929087,0.306176,0.0,0.126694,0.042231,0.042231,0.153088
2,106,1,286,0,816,3,0.430303,0.807195,0.0,0.011324,0.011324,0.011324,0.403598
3,186,3,523,0,914,0,0.913202,0.278351,0.0,0.10537,0.0,0.0,0.278351
4,70,3,422,0,65,3,0.507699,0.770252,0.0,0.014506,0.014506,0.014506,0.385126
5,186,3,22,1,649,0,0.947815,0.217998,0.0,0.081241,0.0,0.0,0.217998


In [13]:
# Split back into Train and Test Datasets
df_clean_train = df_clean[:891]
df_clean_test = df_clean[891:]


# Add "Survived" Column back to the Train dataset
df_clean_train = pd.concat([df_clean_train, df_train['Survived']], axis=1)

print('Test data({0[0]},{0[1]})'.format(df_clean_test.shape))
print('Train data({0[0]},{0[1]})'.format(df_clean_train.shape))
df_clean_train.tail()

Test data(418,13)
Train data(891,14)


Unnamed: 0_level_0,Cabin,Embarked,Name,Sex,Ticket,Deck,Age,Fare,Parch,Pclass,SibSp,FamilySize,FarePerPerson,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
887,186,3,811,1,130,0,0.825029,0.397236,0.0,0.061113,0.0,0.0,0.397236,0
888,40,3,464,0,18,2,0.408626,0.645199,0.0,0.021507,0.0,0.0,0.645199,1
889,186,3,607,0,923,0,0.771488,0.605445,0.051637,0.077456,0.025819,0.077456,0.151361,0
890,77,1,113,1,11,3,0.522409,0.602779,0.0,0.020093,0.0,0.0,0.602779,1
891,186,2,338,1,642,0,0.942349,0.228225,0.0,0.088345,0.0,0.0,0.228225,0


In [14]:
# Split the Train DataSet into X and y
X = df_clean_train.drop('Survived', axis=1)
y = df_clean_train.Survived

# Shuffle and Split the data
# This is running a StratifiedShuffleSplit in sklearn
import sklearn.model_selection as skms
X_train, X_validation, y_train, y_validation = skms.train_test_split(X, y,
                                                                     test_size=0.2, train_size=0.8,
                                                                     random_state=42, stratify=y)

In [15]:
# Train XGBoost Model
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [16]:
# Predict on the Cross Validation Set
y_val_pred = model.predict(X_validation)

# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_validation, y_val_pred))

# Accuracy Score
from sklearn.metrics import accuracy_score
print(accuracy_score(y_validation, y_val_pred))

             precision    recall  f1-score   support

          0       0.78      0.76      0.77       110
          1       0.63      0.65      0.64        69

avg / total       0.72      0.72      0.72       179

0.720670391061


In [107]:
# Predict on the Test Dataset
X_test = df_clean_test
y_test = model.predict(X_test)

In [108]:
# Build the Submission Dataset
predictions = pd.DataFrame()
predictions['PassengerId'] = X_test.index
predictions['Survived'] = y_test.tolist()

print('Predictions({0[0]},{0[1]})'.format(predictions.shape))
print(predictions.head())

Predictions(418,2)
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0


In [109]:
# Save Output
predictions.to_csv('data/submission.csv', index=False)