<h2>Kaggle Titanic ML Challenge Using XGBoost</h2>

In [65]:
try:
    import xgboost as xgb
except:
    ! pip install xgboost
    import xgboost as xgb
import pandas as pd
import numpy as np
# Import your packages

In [124]:
try: 
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
except:
    ! pip install sklearn
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
# Grab the relevant sklearn modules

from sklearn.ensemble import RandomForestClassifier
# To see if XGBoost lives up to its reputation

In [146]:
# The data for this challenge is on the Kaggle website; both CSV files are
# read from the local "titanic/" folder.
test_data = pd.read_csv("titanic/test.csv")
training_data = pd.read_csv("titanic/train.csv")

In [147]:
# Preview the first rows of the raw training data
training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [148]:
# List the object-dtype columns so that each one can either be dropped or
# turned into dummy values
training_data.select_dtypes(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [149]:
# One-hot encode the object-dtype columns that may be impactful on the survival rate.
categorical_columns = ["Sex", "Embarked"]
training_data = pd.get_dummies(training_data, columns=categorical_columns)

In [150]:
training_data.head()
# See how pandas get_dummies works. For each encoded column it creates one new column per distinct value,
# assigning a binary value.

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1


<h3>Matching The Outcomes</h3>

Here I trained the model on just a portion of the training data to see how accurately it matched known results.

In [151]:
# Drop the columns that likely have no bearing on the outcome. Cabin is also
# NaN in many of the previewed rows.
unused_columns = ['Name', 'Ticket', 'Cabin']
reduced_df = training_data.drop(columns=unused_columns)

In [152]:
# Reorder the columns so that Survived comes last, just to make the next bit easier
column_order = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
                'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
                'Survived']
reduced_df = reduced_df.loc[:, column_order]

In [137]:
# Create your data and label splits.
# Columns are selected by name rather than by position (`iloc[:, :11]`), so the
# split does not silently change if columns are added or reordered.
# NOTE(review): X still contains PassengerId, which looks like a row identifier
# rather than a predictive feature - consider excluding it.
X = reduced_df.drop(columns=['Survived'])
y = reduced_df['Survived']

In [157]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [158]:
# XGB uses DMatrix as its data format, so wrap both splits together with their labels
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [159]:
# Create parameters and a number of epochs or periods to train on. These are not exhaustive.
param = dict(
    max_depth=4,
    eta=0.3,
    objective='multi:softmax',
    num_class=2,
)
epochs = 10

In [160]:
# Train the model using the params, the training DMatrix and the number of epochs
model = xgb.train(params=param, dtrain=train, num_boost_round=epochs)

In [161]:
predictions = model.predict(test)
# Create predictions against the test set of data (the held-out `test` DMatrix)

In [162]:
# Show the raw predictions returned by the model
print(predictions)

[0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]


In [57]:
# accuracy_score allows you to compare your predictions to the actual results and returns a value to denote accuracy
# ~85% without much effort.
accuracy_score(y_true=y_test, y_pred=predictions)

0.8491620111731844

In [163]:
# a bit of tweaking: larger max_depth, smaller eta and more epochs
param = dict(
    max_depth=20,
    eta=0.2,
    objective='multi:softmax',
    num_class=2,
)
epochs = 40

In [100]:
# retrain the model with the new params
model = xgb.train(params=param, dtrain=train, num_boost_round=epochs)

In [101]:
# New predictions and our accuracy score is slightly improved
predictions = model.predict(test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.8547486033519553

<h3>Comparing to another Model</h3>

I wanted to see how XGBoost fared against another model. I've previously used RandomForestClassifier in similar challenges, so I thought I'd try that.

In [139]:
# Fairly standard parameters for this model.
# random_state is fixed so that the forest, and therefore the accuracy reported
# below, is reproducible from run to run; 0 matches the seed passed to
# train_test_split in this notebook.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)

In [166]:
# XGB just handled the NaN values, which was nice. Random Forest will not, so
# only rows without any NaN are kept.
# Reassign rather than use inplace=True, so the frame is not mutated in place.
reduced_df = reduced_df.dropna(axis=0, how='any')

In [168]:
# Create your data and label splits.
# Columns are selected by name rather than by position (`iloc[:, :11]`), so the
# split does not silently change if columns are added or reordered.
X = reduced_df.drop(columns=['Survived'])
y = reduced_df['Survived']

In [169]:
# Same 80/20 split and seed as used for the XGBoost run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [170]:
# Fit the random forest on the NaN-free training split
random_forest.fit(X_train,y_train)

In [172]:
# To be fair, not too bad given we had to drop a bunch of rows due to the NaNs and had less data to train on.
y_pred = random_forest.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8111888111888111

<h3>Predicting the outcomes</h3>

Here we use the data as intended: the training data is used to train the model, and predictions are made against the test data. There's no
way to know how accurate the predictions are, as there is no Survived column in the test data. This is more reflective of real-world use. Back to XGB.

In [106]:
# training_data was already one-hot encoded earlier in this notebook, so "Sex"
# and "Embarked" no longer exist when the cells are run top to bottom; encoding
# them again would raise a KeyError. Only encode the columns still present,
# which also makes this cell safe to re-run.
columns_to_encode = [col for col in ("Sex", "Embarked") if col in training_data.columns]
if columns_to_encode:
    training_data = pd.get_dummies(training_data, columns=columns_to_encode)

In [117]:
# Give the test set the same treatment as the training set: one-hot encode
# Sex and Embarked, then drop the Name, Ticket and Cabin columns.
test_data = (
    pd.get_dummies(test_data, columns=["Sex", "Embarked"])
    .drop(columns=['Name', 'Ticket', 'Cabin'])
)

In [118]:
# Put the test columns in a fixed, explicit order (there is no Survived column here)
test_column_order = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
                     'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
test_data = test_data.loc[:, test_column_order]

In [108]:
# Drop the Name, Ticket and Cabin columns from the full training set
dropped_columns = ['Name', 'Ticket', 'Cabin']
training_data = training_data.drop(columns=dropped_columns)

In [110]:
# Put the training columns in a fixed, explicit order with Survived last
training_column_order = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
                         'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
                         'Survived']
training_data = training_data.loc[:, training_column_order]

In [119]:
# We can't use train_test_split as there are no y_test values.
# Otherwise the process is similar.
X_train = training_data.drop(columns=['Survived'])
y_train = training_data['Survived']
# Take exactly the training feature columns, in the same order. The previous
# `iloc[:, ::]` took every column of test_data, so re-running this cell after
# the 'Survived_Prediction' column had been added would feed that column to
# the model as an extra feature.
X_test = test_data.loc[:, X_train.columns]

In [115]:
# Wrap the full training set (features plus Survived labels) in a DMatrix
train = xgb.DMatrix(X_train, label=y_train)

In [116]:
# Train on the full training set, reusing whichever `param` and `epochs` are currently defined
model = xgb.train(params=param, dtrain=train, num_boost_round=epochs)

In [120]:
# No label is passed here: the test data has no Survived column
test = xgb.DMatrix(X_test)

In [121]:
# Predict the outcome for every row of the unlabeled test DMatrix
predictions = model.predict(test)

In [122]:
test_data['Survived_Prediction'] = predictions
# Add the predictions as a new column to the data frame to see how the model predicts each individual fared.

In [123]:
# Display the test data with the predicted outcome alongside each passenger
test_data

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived_Prediction
0,892,3,34.5,0,0,7.8292,0,1,0,1,0,0.0
1,893,3,47.0,1,0,7.0000,1,0,0,0,1,0.0
2,894,2,62.0,0,0,9.6875,0,1,0,1,0,0.0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1,0.0
4,896,3,22.0,1,1,12.2875,1,0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,,0,0,8.0500,0,1,0,0,1,0.0
414,1306,1,39.0,0,0,108.9000,1,0,1,0,0,1.0
415,1307,3,38.5,0,0,7.2500,0,1,0,0,1,0.0
416,1308,3,,0,0,8.0500,0,1,0,0,1,0.0
