In [None]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so prefer the new name and fall back to the old one on older installs.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv

# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

In [None]:
test = pd.read_csv('../input/titanic/test.csv')
train = pd.read_csv('../input/titanic/train.csv')
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')
print('Datasets Loaded')

In [None]:
train.head()

In [None]:
len(train)

In [None]:
test.head()

In [None]:
gender_submission.head()

In [None]:
train.describe()

It appears that a number of values in the Age and Cabin columns are missing, as well as a couple in Embarked. We can see the missing values in the data set using the missingno.matrix() function.

In [None]:
missingno.matrix(train, figsize = (30,10))

forming bins:

In [None]:
df_bin = pd.DataFrame() #for discretized continuous variables
df_con = pd.DataFrame() #for continous variables

In [None]:
train.dtypes

first lets see how many people survived:

In [None]:
fig = plt.figure(figsize=(20,1))
sns.countplot(y='Survived', data=train);
print(train.Survived.value_counts())

549 to 342, not great odds for our prospective survivors.

In [None]:
#we can add this to our subset dataframes
df_bin['Survived'] = train['Survived']
df_con['Survived'] = train['Survived']

let's look at another feature : Pclass

the ticket class of the passenger

In [None]:
#plotting the distribution
sns.distplot(train.Pclass)

if we recall from the missingno function, there were no missing values here. So we can go ahead and add them to our sub dataframes.

In [None]:
df_bin['Pclass'] = train['Pclass']
df_con['Pclass'] = train['Pclass']

Ok, the Name feature is next

In [None]:
train.Name.value_counts()

It seems that no two names are repeated, which is good. We could probably clean up the column by removing the title, e.g. 'Mr.', 'Mrs.', 'Master'.

Next we will look at the break down of the Sex feature:

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(y='Sex', data=train)

There are considerably more males than females. Since we know there are no missing values here, we can add it to our subsets.


In [None]:
df_bin['Sex'] = train['Sex']
#because it's a binary value, lets add it as either 1 or 0
df_bin['Sex'] = np.where(df_bin['Sex'] == 'female', 1,0)

df_con['Sex'] = train['Sex']

Because 'Sex' and 'Survival' are both binary, we can easily compare the two:

In [None]:
fig = plt.figure(figsize=(10,10))
sns.distplot(df_bin.loc[df_bin['Survived'] == 1]['Sex'], kde_kws={'label': 'Survived'});
sns.distplot(df_bin.loc[df_bin['Survived'] == 0]['Sex'], kde_kws={'label': 'Did not survive'});

this chart shows 0 for male and 1 for female. with the yellow bar showing deaths and the blue bar showing survivals. Clearly females had a significantly higher survival rate than males.

OK, we will move on to the Age feature.
We can recall there were quite a few missing values in Age. We have the option to either fill them with the average age or cut those rows out. I've chosen to fill the blanks with the average:

In [None]:
#first find the average
train['Age'].mean()

the mean age is roughly 29.70, we can fill this into the missing cells and add it to our subset

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on train['Age']:
# under pandas copy-on-write the in-place form operates on a temporary and leaves `train`
# unchanged. The fill value is the column mean rounded to 2 d.p. (the 29.70 quoted above),
# computed from the data instead of hard-coded. Re-running the cell is a no-op once no
# NaNs remain.
train['Age'] = train['Age'].fillna(round(train['Age'].mean(), 2))
train['Age']

In [None]:
df_bin['Age'] = pd.cut(train['Age'], 10) #this will bucket our bin into different age groups
df_con['Age'] = train['Age'] #non-bucketed

On to the SibSp feature, which is a measure of how many siblings/spouses the passenger has aboard the Titanic:

In [None]:
train.SibSp.value_counts()

No missing values so we can add it to our sub dataframe

In [None]:
df_bin['SibSp'] = train['SibSp']
df_con['SibSp'] = train['SibSp']

In [None]:

#setting up visualization function for ease of use.
def plot_count_dist(data, bin_df, label_column, target_column, figsize=(20,5), use_bin_df=False):
    """Draw a count plot next to per-label distributions of one feature.

    Left panel: count plot of ``target_column``.
    Right panel: overlaid distributions of ``data[target_column]`` for the rows
    where ``label_column`` equals 1 ("Survived") and 0 ("Did not survive").

    Parameters
    ----------
    data : DataFrame
        Frame containing ``label_column`` and ``target_column``. Always used
        for the right-hand distribution panel.
    bin_df : DataFrame
        Frame holding the binned/discretised version of ``target_column``.
    label_column : str
        Name of the 0/1 label column in ``data``.
    target_column : str
        Name of the feature to plot.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    use_bin_df : bool
        If True, the count plot is drawn from ``bin_df``; otherwise it is
        drawn from ``data``.
    """
    # Both branches used to be identical and always counted from bin_df, which
    # made use_bin_df a dead flag. The flag now selects the count-plot source.
    count_source = bin_df if use_bin_df else data

    plt.figure(figsize=figsize)
    plt.subplot(1,2,1)
    sns.countplot(y=target_column, data=count_source);
    plt.subplot(1,2,2)
    # One distribution per label value, drawn on the same axes for comparison.
    for label_value, legend in ((1, "Survived"), (0, "Did not survive")):
        sns.distplot(data.loc[data[label_column] == label_value][target_column],
                     kde_kws={"label": legend});

we can now easily visualize our Sibsp feature compared to survivability

In [None]:
plot_count_dist(train,
               bin_df=df_bin,
               label_column='Survived',
               target_column='SibSp',
               figsize=(20,10))

We can see on the right graph, if you have only 1 sibling or spouse, your survivability greatly exceeds the fatality rate.

Next we will look at the Parch feature, which is the number of parents/children the passenger has aboard the Titanic. This is similar to SibSp, so the analysis will be similar.

In [None]:
#add to subset
df_bin['Parch'] = train['Parch']
df_con['Parch'] = train['Parch']

In [None]:
plot_count_dist(train,
               bin_df=df_bin,
               label_column='Survived',
               target_column='Parch',
               figsize=(20,10))

We can see that having 1-2 children or parents greatly increases your survivability

Now we will look at the Ticket feature: the passenger's ticket number.

In [None]:
sns.countplot(y='Ticket', data=train);

not an ideal depiction, how else can we look at it.

In [None]:
#how many kinds of ticket were there?
train.Ticket.value_counts()

681 different types with a difficult pattern to determine. There may be a way to reduce this down, but for now it's not going to be usable for much.

Let's continue to the Fare feature: the price of the ticket.

In [None]:
#sns.countplot(y='Fare',data=train);
train.Fare.value_counts()

Fare has 248 different values but since they are quantitative we can still use it in by cutting it into bins.

In [None]:
#add to subset in bins
df_bin['Fare'] = pd.cut(train['Fare'], bins=5) #discretised 'cut into bins'
df_con['Fare'] = train['Fare'] 

In [None]:
df_bin.Fare.value_counts()

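The values have been categorized into 5 bins, each roughly $100 wide. The lower edge of the first bin is slightly negative (about -0.5); this is not an outlier fare but an artifact of pd.cut, which extends the range by 0.1% on each side so that the minimum value falls inside the first bin.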

In [None]:
plot_count_dist(data = train,
               bin_df = df_bin,
               label_column='Survived',
               target_column='Fare',
               figsize=(20,5),
               use_bin_df=True)

You can see having a cheaper ticket greatly reduced your chances of survival.

feature Cabin: the passengers Cabin number
This feature had a significant number of missing values. Since this is an initial EDA, we aren't going to try to use it now and will move on.

Feature: Embarked
    the port where the passenger boarded the titanic
    key: C= Cherbourg, Q = Queenstown, S = Southampton
    
  this feature had a couple missing values but is overall fine to use.

In [None]:
sns.countplot(y='Embarked', data=train)

Southampton was clearly the most common embarkation point for passengers.

As for the 2 missing values, we can probably assume they are from Southampton. However as 2 values will likely not skew our predictions much we will just remove them.

In [None]:
#add to subset
df_bin['Embarked'] = train['Embarked']
df_con['Embarked'] = train['Embarked']

In [None]:
#remove the NAN values based on the Embarked feature
print(len(df_con))
df_con = df_con.dropna(subset=['Embarked'])
df_bin = df_bin.dropna(subset=['Embarked'])
print(len(df_con))

this has removed the two rows. With that we have our two cleaned sub dataframes:

In [None]:
df_bin.head()

In [None]:
df_con.head()

Feature Encoding:
Now we have our two sub dataframes ready, we can encode the features so they're ready to be used with our machine learning models.
We will one-hot encode our binned dataframe (df_bin), and for our continuous dataframe (df_con) we will one-hot encode its categorical columns (Embarked, Sex, Pclass) with pandas' get_dummies.

In [None]:
#one-hot encode binned variables
one_hot_cols = df_bin.columns.tolist()
one_hot_cols.remove('Survived')
df_bin_enc = pd.get_dummies(df_bin,columns=one_hot_cols)

df_bin_enc.head()

"one hot encoding gives it a 0 for what it isnt and a 1 for what it is"

In [None]:
# One hot encode the categorical columns
df_embarked_one_hot = pd.get_dummies(df_con['Embarked'], 
                                     prefix='embarked')

df_sex_one_hot = pd.get_dummies(df_con['Sex'], 
                                prefix='sex')

df_plcass_one_hot = pd.get_dummies(df_con['Pclass'], 
                                   prefix='pclass')

In [None]:
# Combine the one hot encoded columns with df_con_enc
df_con_enc = pd.concat([df_con, 
                        df_embarked_one_hot, 
                        df_sex_one_hot, 
                        df_plcass_one_hot], axis=1)

# Drop the original categorical columns (because now they've been one hot encoded)
df_con_enc = df_con_enc.drop(['Pclass', 'Sex', 'Embarked'], axis=1)

In [None]:
df_con_enc.head(20)

Building Machine Learning Models:

now our data has been manipulated and converted into numbers, we can run a series of different machine learning algorithms over it to find which yield the best results.

In [None]:
#select the datafram we want to use first for predictions
selected_df = df_con_enc

In [None]:
#split the dataframe into data and labels
X_train = selected_df.drop('Survived', axis=1) #taking the selected_df, dropping survived and using the remaining variables
y_train = selected_df.Survived #taking the survived variable

In [None]:
#shape of the data (without labels)
X_train.shape

In [None]:
#shape of the data (with labels)
y_train.shape

Define a function to fit machine learning algorithms:
Since many of the algorithms we will use are from the sklearn library, they all take similar (practically the same) inputs and produce similar outputs.

To prevent writing code multiple times, we will functionise fitting the model and returning the accuracy scores.

In [None]:
# Function that runs the requested algorithm and returns the accuracy metrics
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit an estimator and report training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator exposing ``fit`` (returning the fitted model) and ``score``.
    X_train : training features.
    y_train : training labels.
    cv : cross-validation setting forwarded to ``cross_val_predict``.

    Returns
    -------
    tuple ``(train_pred, acc, acc_cv)``: the cross-validated predictions, the
    accuracy on the training data, and the cross-validated accuracy. Both
    accuracies are percentages rounded to two decimals.
    """
    # Single fit on the full training set, scored on that same data.
    fitted = algo.fit(X_train, y_train)
    training_score = fitted.score(X_train, y_train)
    acc = round(training_score * 100, 2)

    # Out-of-fold predictions, using every available core.
    train_pred = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )
    cv_score = metrics.accuracy_score(y_train, train_pred)
    acc_cv = round(cv_score * 100, 2)

    return train_pred, acc, acc_cv

Logistic Regression

In [None]:
# Logistic Regression
start_time = time.time()
train_pred_log, acc_log, acc_cv_log = fit_ml_algo(LogisticRegression(), 
                                                               X_train, 
                                                               y_train, 
                                                                    10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

K-Nearest Neighbors

In [None]:
# k-Nearest Neighbours
start_time = time.time()
train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(KNeighborsClassifier(), 
                                                  X_train, 
                                                  y_train, 
                                                  10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))

Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes
start_time = time.time()
train_pred_gaussian, acc_gaussian, acc_cv_gaussian = fit_ml_algo(GaussianNB(), 
                                                                      X_train, 
                                                                      y_train, 
                                                                           10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))


Linear Support Vector Machines (SVC)

In [None]:
# Linear SVC
start_time = time.time()
train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(LinearSVC(),
                                                                X_train, 
                                                                y_train, 
                                                                10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))

Stochastic Gradient Descent

In [None]:
# Stochastic Gradient Descent
start_time = time.time()
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(SGDClassifier(), 
                                                  X_train, 
                                                  y_train,
                                                  10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))

Decision Tree Classifier

In [None]:
# Decision Tree Classifier
start_time = time.time()
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(DecisionTreeClassifier(), 
                                                                X_train, 
                                                                y_train,
                                                                10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

Gradient Boost Trees

In [None]:
# Gradient Boosting Trees
start_time = time.time()
train_pred_gbt, acc_gbt, acc_cv_gbt = fit_ml_algo(GradientBoostingClassifier(), 
                                                                       X_train, 
                                                                       y_train,
                                                                       10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

CatBoost Algorithm

In [None]:
#define the categorical features for the CatBoost model
# Every column whose dtype is not float is treated as categorical. The builtin `float` is
# used because the `np.float` alias (which was that same builtin) was removed in NumPy 1.24.
cat_features = np.where(X_train.dtypes != float)[0]
cat_features

In [None]:
#use the CatBoost Pool() function to pool together the training data and categorical feature labels
train_pool = Pool(X_train,
                 y_train,
                 cat_features)

In [None]:
#CatBoost model definition
catboost_model = CatBoostClassifier(iterations=1000,
                                   custom_loss=['Accuracy'],
                                   loss_function='Logloss')

#Fit CatBoost model
catboost_model.fit(train_pool,
                  plot=True)

#CatBoost accuracy
acc_catboost = round(catboost_model.score(X_train, y_train) * 100, 2)

In [None]:
# How long will this take?
start_time = time.time()

# Set params for cross-validation as same as initial model
cv_params = catboost_model.get_params()

# Run the cross-validation for 10-folds (same as the other models)
cv_data = cv(train_pool,
             cv_params,
             fold_count=10,
             plot=True)

# How long did it take?
catboost_time = (time.time() - start_time)

# CatBoost CV results save into a dataframe (cv_data), let's withdraw the maximum accuracy score
acc_cv_catboost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)

In [None]:
# Print out the CatBoost model metrics
print("---CatBoost Metrics---")
print("Accuracy: {}".format(acc_catboost))
print("Accuracy cross-validation 10-Fold: {}".format(acc_cv_catboost))
print("Running Time: {}".format(datetime.timedelta(seconds=catboost_time)))

Results:
which model had the best cross-validation accuracy?

In [None]:
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees',
              'CatBoost'],
    'Score': [
        acc_knn, 
        acc_log,  
        acc_gaussian, 
        acc_sgd, 
        acc_linear_svc, 
        acc_dt,
        acc_gbt,
        acc_catboost
    ]})
print("---Reuglar Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

In [None]:
cv_models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'Gradient Boosting Trees',
              'CatBoost'],
    'Score': [
        acc_cv_knn, 
        acc_cv_log,      
        acc_cv_gaussian, 
        acc_cv_sgd, 
        acc_cv_linear_svc, 
        acc_cv_dt,
        acc_cv_gbt,
        acc_cv_catboost
    ]})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='Score', ascending=False)

We can see that Gradient Boosting trees slightly edges out CatBoost in accuracy
We'll pay more attention to the cross-validation figure.

Cross-validation is more robust than just the .fit() models as it does multiple passes over the data instead of one.

Because the Gradient Boosting Trees model got the best results, we'll use it for the next steps.

Feature Importance:
which features of the best model were most important for making predictions?

In [None]:
def feature_importance(model, data):
    """Rank, plot and return the top feature importances of a fitted model.

    Pairs ``model.feature_importances_`` with ``data.columns``, sorts by
    importance ascending (ties broken by column name, descending), keeps the
    last 30 rows, and draws them as a horizontal bar chart.

    Returns the sorted DataFrame with columns ``imp`` and ``col``.
    """
    ranked = (
        pd.DataFrame({'imp': model.feature_importances_, 'col': data.columns})
        .sort_values(by=['imp', 'col'], ascending=[True, False])
        .tail(30)  # ascending order, so the tail holds the largest importances
    )
    ranked.plot(kind='barh', x='col', y='imp', figsize=(20,10))
    return ranked

In [None]:
#using catboost, not sure how to use gt
feature_importance(catboost_model, X_train)

We can see here that the greatest factors deciding survivability would be sex as well as having a child/parent as well as age.

Submission!!:

In [None]:
X_train.head()

In [None]:
test.head()

In [None]:
# One hot encode the columns in the test data frame (like X_train)

test_embarked_one_hot = pd.get_dummies(test['Embarked'], 
                                       prefix='embarked')

test_sex_one_hot = pd.get_dummies(test['Sex'], 
                                prefix='sex')

test_plcass_one_hot = pd.get_dummies(test['Pclass'], 
                                   prefix='pclass')

In [None]:
# Combine the test one hot encoded columns with test
test = pd.concat([test, 
                  test_embarked_one_hot, 
                  test_sex_one_hot, 
                  test_plcass_one_hot], axis=1)

In [None]:
# Create a list of columns to be used for the predictions
wanted_test_columns = X_train.columns
wanted_test_columns

In [None]:
# Make a prediction using the CatBoost model on the wanted columns
predictions = catboost_model.predict(test[wanted_test_columns])
predictions[:20]

In [None]:
# create a new dataframe we can submit
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = predictions.astype(int)
submission.head()

In [None]:
# our submission must have 418 rows
submission.describe()

In [None]:
# Convert submission dataframe to a csv for submission to kaggle
submission.to_csv('./df.csv', index=False)