In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks", context="talk")

import re
from mlxtend.plotting import plot_confusion_matrix

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score, confusion_matrix, classification_report, cohen_kappa_score, recall_score, precision_score
from sklearn import ensemble
from sklearn.inspection import permutation_importance


import warnings
warnings.filterwarnings('ignore')

# Data input

In [1]:
# Load the Kaggle Titanic files. `testing` stays untouched (its PassengerId
# column is used when the submission file is written); `test` is a working copy.
testing = pd.read_csv('/kaggle/input/titanic/test.csv')
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = testing.copy()

target = 'Survived'

# Stack train and test so cleaning and feature engineering are applied to both.
# Test rows have no 'Survived' value, so they show up as NaN in that column.
# ignore_index=True gives the combined frame a unique row index; both files are
# read with a default 0-based index, so without it the row labels would overlap.
data = pd.concat([train, test], axis=0, ignore_index=True)
data.info()
data.head(5)

In [1]:
# Pie chart of the target distribution (only labelled rows are counted,
# because value_counts() skips NaN).
survived_counts = data['Survived'].value_counts()

fig, ax = plt.subplots(figsize=(10, 4))
ax.pie(
    survived_counts,
    labels=survived_counts.index,
    autopct='%1.1f%%',
    shadow=True,
    explode=(0.05, 0.0),
)
ax.set_title('Survived')
plt.show()

# Data cleansing

In [1]:
# Fill missing ports of embarkation with the most frequent value (the mode).
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [1]:
# Show the rows whose Fare is missing.
data[data['Fare'].isna()]

In [1]:
# Rank the absolute pairwise correlations and show the ones involving Fare.
# Only numeric columns are correlated: the frame still holds text columns
# (Name, Sex, Ticket, ...) and DataFrame.corr() raises on those in
# pandas >= 2.0, whereas older versions silently skipped them.
data_corr = (
    data.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
)
data_corr.loc[data_corr['level_0'] == 'Fare']

In [1]:
# Fill missing fares with the median fare of the passenger's class (Pclass).
data['Fare'] = data['Fare'].fillna(data.groupby(['Pclass'])['Fare'].transform('median'))

In [1]:
# Count the remaining missing values in each column.
data.isna().sum()

# Feature engineering

In [1]:
# Family size = the passenger (1) + siblings/spouses + parents/children aboard.
data['Family_size'] = 1 + data['SibSp'] + data['Parch']

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='Family_size', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Flag "small" families: 2-4 members -> 1; passengers travelling alone or in
# families of more than 4 -> 0.
is_small_family = data['Family_size'].gt(1) & data['Family_size'].le(4)
is_alone_or_large = data['Family_size'].gt(4) | data['Family_size'].eq(1)
data.loc[is_small_family, 'small_family_size'] = 1
data.loc[is_alone_or_large, 'small_family_size'] = 0

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='small_family_size', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Flag passengers travelling alone (family size of exactly 1).
travels_alone = data['Family_size'].eq(1)
has_company = data['Family_size'].gt(1)
data.loc[travels_alone, 'Alone'] = 1
data.loc[has_company, 'Alone'] = 0

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x='Alone', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Extract the honorific from each name: the first capitalised word that is
# preceded by a space and followed by a period (e.g. "Mr" in "Braund, Mr. Owen").
# The pattern is a raw string so that `\.` is a regex escape instead of an
# invalid Python string escape, and str.extract yields NaN for a name without
# a match rather than failing on `None.group(1)`.
data['Title'] = data['Name'].str.extract(r' ([A-Z][a-z]+)\.', expand=False)
data['Title'].unique()

In [1]:
# Normalise title spellings and lump the rare honorifics into one 'Special' group.
title_aliases = {'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'}
rare_titles = ['Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col',
               'Capt', 'Countess', 'Jonkheer']
title_map = {**title_aliases, **dict.fromkeys(rare_titles, 'Special')}
data['Title'] = data['Title'].replace(title_map)

fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Title', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Relative frequency of each title, rounded to three decimals.
data['Title'].value_counts(normalize = True).round(3)

In [1]:
# Number of passengers with no recorded age.
data['Age'].isna().sum()

In [1]:
# Show passengers whose recorded age is below one year.
data[data['Age'] < 1]

In [1]:
# Treat ages below one year as missing so they are imputed later, then
# re-count the missing ages.
# NOTE(review): this discards the recorded fractional ages of infants — confirm
# that is intended rather than keeping them as genuine values.
data.loc[data['Age'] < 1, 'Age'] = None
data['Age'].isna().sum()

In [1]:
# Rank the absolute pairwise correlations and show the ones involving Age.
# Restricted to numeric columns: DataFrame.corr() raises on the remaining text
# columns in pandas >= 2.0, whereas older versions silently skipped them.
data_corr = (
    data.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
)
data_corr.loc[data_corr['level_0'] == 'Age']

In [1]:
# Impute missing ages with the median age of passengers sharing the same
# class and title, then re-count what is still missing.
median_age_by_class_title = data.groupby(['Pclass', 'Title'])['Age'].transform('median')
data['Age'] = data['Age'].fillna(median_age_by_class_title)

data['Age'].isna().sum()

In [1]:
# Median age per title.
data.groupby('Title')['Age'].median()

In [1]:
# Overlay the age distributions of survivors and non-survivors.
survivor_ages = data.loc[data['Survived'] == 1, 'Age'].dropna()
victim_ages = data.loc[data['Survived'] == 0, 'Age'].dropna()

plt.subplots(figsize=(10, 5))
sns.distplot(survivor_ages, kde=True, label='Survived = 1', color='orange', bins=15)
sns.distplot(victim_ages, kde=True, label='Survived = 0', bins=15)
plt.legend(prop={'size': 12})
plt.title('Surival by age groups')
plt.show()

In [1]:
# Column dtypes and non-null counts after the Age imputation.
data.info()

In [1]:
# Re-rank the absolute correlations involving Age now that it has been imputed.
# Restricted to numeric columns: DataFrame.corr() raises on the remaining text
# columns in pandas >= 2.0, whereas older versions silently skipped them.
data_corr = (
    data.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
)
data_corr.loc[data_corr['level_0'] == 'Age']

In [1]:
def age_grouping(age):
    """Map a numeric age to a life-stage label.

    Bands are half-open on the right: below 4 'Infants', 4-5 'Preschool',
    6-12 'Children', 13-18 'Adolescents', 19-44 'Adults', 45-59 'Middle age',
    and everything else 'Seniors'. A NaN age compares false against every
    bound and therefore also ends up as 'Seniors'.
    """
    # (exclusive upper bound, label), checked in ascending order.
    bands = (
        (4, 'Infants'),
        (6, 'Preschool'),
        (13, 'Children'),
        (19, 'Adolescents'),
        (45, 'Adults'),
        (60, 'Middle age'),
    )
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return 'Seniors'
    
# Label every passenger with a life-stage band derived from Age.
to_age_group = np.vectorize(age_grouping)
data['age_group'] = to_age_group(data['Age'])

fig, ax = plt.subplots(figsize=(18, 5))
sns.barplot(x='age_group', y='Survived', data=data, ax=ax)
plt.show()

# EDA

In [1]:
# Mean survival by number of parents/children aboard.
fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Parch', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Mean survival by number of siblings/spouses aboard.
fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='SibSp', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Mean survival by ticket class.
fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Pclass', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Mean survival by port of embarkation.
fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Embarked', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Mean survival for every (class, port) combination, rounded to two decimals.
data.groupby(['Pclass', 'Embarked'])[['Survived']].mean().round(2)

In [1]:
# List the distinct cabin values.
data['Cabin'].unique()

In [1]:
# Cabin group = first character of the cabin string; passengers without a
# recorded cabin get the placeholder label.
cabin_first_char = data['Cabin'].str[:1]
data['Cabin_group'] = cabin_first_char.where(data['Cabin'].notna(), 'unkown')

fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Cabin_group', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Print the class of the passenger(s) in cabin group 'T', then relabel the
# passenger(s) whose cabin is exactly 'T' as group 'A'.
in_group_t = data['Cabin_group'] == 'T'
print('Pclass: ', data.loc[in_group_t, 'Pclass'].values)

cabin_is_t = data['Cabin'] == 'T'
data.loc[cabin_is_t, 'Cabin_group'] = 'A'

In [1]:
# Mean survival for every (cabin group, class) combination, rounded to two decimals.
data.groupby(['Cabin_group', 'Pclass'])[['Survived']].mean().round(2)

In [1]:
# Merge the single-letter cabin groups into three coarser groups.
merged_groups = (
    ('ABC', ['A', 'B', 'C']),
    ('DE', ['D', 'E']),
    ('FG', ['F', 'G']),
)
for merged_label, letters in merged_groups:
    data.loc[data['Cabin_group'].isin(letters), 'Cabin_group'] = merged_label

fig, ax = plt.subplots(figsize=(13, 5))
sns.barplot(x='Cabin_group', y='Survived', data=data, ax=ax)
plt.show()

In [1]:
# Overlay the fare distributions of survivors and non-survivors.
survivor_fares = data.loc[data['Survived'] == 1, 'Fare'].dropna()
victim_fares = data.loc[data['Survived'] == 0, 'Fare'].dropna()

plt.subplots(figsize=(10, 7))
sns.distplot(survivor_fares, kde=True, label='Survived = 1', color='orange', bins=8)
sns.distplot(victim_fares, kde=True, label='Survived = 0', bins=8)
plt.legend(prop={'size': 12})
# The plot shows Fare, so the title says so (it previously read "age groups").
plt.title('Survival by fare')
plt.show()

In [1]:
# Preview the first rows with the engineered columns.
data.head()

In [1]:
# Distinct cabin groups after merging.
data['Cabin_group'].unique()

In [1]:
# Integer-encode the categorical columns into temporary '*_int' columns.
# Series.map builds a numeric column directly; Series.replace returns an
# object column and only became numeric through silent downcasting, which
# pandas has deprecated. A value missing from a mapping becomes NaN instead of
# staying behind as text.
data['Sex_int'] = data['Sex'].map({'male': 1, 'female': 0})
data['Embarked_int'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
data['Title_int'] = data['Title'].map(
    {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Special': 4}
)
data['age_group_int'] = data['age_group'].map(
    {'Adults': 0, 'Middle age': 1, 'Infants': 2, 'Adolescents': 3,
     'Preschool': 4, 'Children': 5, 'Seniors': 6}
)
data['Cabin_group_int'] = data['Cabin_group'].map(
    {'unkown': 0, 'ABC': 1, 'DE': 2, 'FG': 3}
)

In [1]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# The frame still contains text columns, on which DataFrame.corr() raises in
# pandas >= 2.0, so they are excluded explicitly (older versions skipped them).
corrMatrix = data.select_dtypes(include='number').corr()

plt.subplots(figsize=(22, 10))
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [1]:
# Bar chart of each column's correlation with Survived, sorted ascending.
survival_corr = corrMatrix['Survived'].drop(['Survived']).sort_values()

fig, ax = plt.subplots(figsize=(12, 4))
survival_corr.plot(kind='bar', ax=ax)
plt.show()

In [1]:
# Remove the temporary integer-encoded columns (names ending in '_int').
int_encoded_cols = [col for col in data.columns if col.endswith('_int')]
data = data.drop(columns=int_encoded_cols)

# Prep before model

In [1]:
# Check dtypes/non-null counts and preview the frame before encoding.
data.info()
data.head()

In [1]:
# Preparing features for analysis.

# Columns to one-hot encode.
dummy_features = ['Sex', 'Pclass', 'Embarked', 'Cabin_group', 'Title', 'age_group']

# Cast to object so get_dummies encodes every listed column, including the
# integer-valued Pclass.
for col in dummy_features:
    data[col] = data[col].astype(object)

# Columns removed outright (Family_size is intentionally kept).
drop_features = ['PassengerId', 'Ticket', 'Name', 'Cabin',
                 'small_family_size', 'Alone', 'SibSp', 'Parch', 'Age']

# Append the dummy columns, then drop the encoded originals and the unused
# columns by label. Plain label lists are passed to drop(); the earlier form
# passed DataFrame slices and only worked because iterating a DataFrame yields
# its column names.
dummies = pd.get_dummies(data[dummy_features], drop_first=True)
data = pd.concat([data, dummies], axis=1, sort=False)
data = data.drop(columns=dummy_features + drop_features)

data.tail()

In [1]:
# Split the combined frame back apart: rows without a Survived value are the
# competition test set, the rest is labelled training data.
is_unlabelled = data['Survived'].isnull()
test = data[is_unlabelled].drop(['Survived'], axis=1)
# Explicit copy: `train` is modified in place by a later cell, which should
# operate on an independent frame rather than on a slice of `data`.
train = data[~is_unlabelled].copy()

train.info()
print('-'*70)
test.info()

In [1]:
# Drop rows that still contain NaN, just to be safe.
# NOTE(review): if any test row were dropped here, the predictions would no
# longer line up with testing['PassengerId'] when the submission is built.
train, test = train.dropna(), test.dropna()

In [1]:
# Separate the target column from the features.
x = train.drop(columns=target)
y = train['Survived']

# Hold out 30% of the labelled rows, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Modelling

In [1]:
# Random forest tuned with a 5-fold grid search.
RF = ensemble.RandomForestClassifier()
params = {
    'n_estimators': list(range(100, 250, 50)),   # default 100
    'max_depth': list(range(3, 8)),              # default None
    'min_samples_leaf': list(range(3, 6)),       # default 1
    'max_features': [None],                      # default 'sqrt'
    'random_state': [42],
}

RF_model = GridSearchCV(RF, param_grid=params, cv=5, n_jobs=-1)
RF_model.fit(x_train, y_train)
print("Best Hyper Parameters:",RF_model.best_params_)

# Area under the ROC curve, from the predicted probability of the positive class.
RF_probs = RF_model.predict_proba(x_test)[:, 1]
RF_auc = roc_auc_score(y_test, RF_probs)
print('AUC: %.3f' % RF_auc)

# Hard-label metrics on the hold-out split.
RF_predictions = RF_model.predict(x_test).astype(int)
RF_accuracy = accuracy_score(y_test, RF_predictions)
print("RF accuracy: %.3f" % RF_accuracy)
# Recall: ability of the classifier to find all the positive samples.
print("RF Recall: " + '%.3f' % recall_score(y_test, RF_predictions))
# Precision: ability of the classifier not to label a negative sample as positive.
print("RF Precission: " + '%.3f' % precision_score(y_test, RF_predictions))
# Cohen's kappa: scores above .8 are generally considered good agreement.
print("RF cohen_kappa_score: %.3f" % cohen_kappa_score(y_test, RF_predictions))

# ROC curve against the diagonal reference line.
RF_fpr, RF_tpr, RF_thresholds = roc_curve(y_test, RF_probs)
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(RF_fpr, RF_tpr, color='tab:green')
plt.show()

In [1]:
# Confusion matrix of the hold-out predictions, drawn with mlxtend.
cm = confusion_matrix(y_test, RF_predictions)
plot_confusion_matrix(cm)
plt.title('RF')
plt.show()

# Feature Importance

In [1]:
# Ten largest impurity-based feature importances of the best forest.
rf_importances = pd.Series(
    RF_model.best_estimator_.feature_importances_,
    index=x.columns,
)

plt.figure(figsize=[6, 6])
rf_importances.nlargest(10).plot(kind='barh')
plt.show()

In [1]:
# Permutation importance on the hold-out split. The shuffles are random, so a
# fixed random_state keeps the ranking reproducible between runs.
perm_importance = permutation_importance(RF_model, x_test, y_test, random_state=42)
sorted_idx = perm_importance.importances_mean.argsort()

plt.figure(figsize=[8, 8])
plt.barh(x.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.show()

In [1]:
# from sklearn.feature_selection import SelectFromModel
# selector = SelectFromModel(RF_model.best_estimator_, threshold = 0.05, prefit = True)
# feature_idx = selector.get_support()
# feature_name = x.columns[feature_idx]
# feature_name

 # Model investigation

In [1]:
# Store the best forest's predicted probability of the positive class
# (Survived = 1) for every row of the combined frame.
# NOTE(review): the column name 'churn_proba' looks like a leftover from another
# project — the value is a survival probability; confirm before renaming.
data['churn_proba'] = RF_model.best_estimator_.predict_proba(data[x.columns])[:,1]

In [1]:
import shap
shap.initjs()

# SHAP values of the best forest for every row of the combined frame.
explainer = shap.TreeExplainer(RF_model.best_estimator_)
shap_values = explainer.shap_values(data[x.columns])

# Depending on the shap release, a classifier yields either a list with one
# (n_samples, n_features) array per class, or a single
# (n_samples, n_features, n_classes) array. Convert the latter to the
# list-per-class form so that `shap_values[1]` always means "class 1".
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

shap.summary_plot(shap_values[1], data[x.columns], plot_type="bar")

In [1]:
# SHAP summary plot for the class-1 SHAP values.
shap.summary_plot(shap_values[1], data[x.columns])

In [1]:
# SHAP dependence plot for the Fare feature (class-1 SHAP values).
shap.dependence_plot("Fare", shap_values[1], data[x.columns])

In [1]:
# Positional index of the single row explained by the waterfall plot below.
row = 1

# Interactive force plot for the first 1000 rows (class-1 SHAP values).
shap.force_plot(explainer.expected_value[1], shap_values[1][:1000], data[x.columns].iloc[:1000], matplotlib = False)

class ShapObject:
    """Plain attribute container passed to ``shap.waterfall_plot``.

    Stores the four constructor arguments unchanged as attributes of the
    same name.
    """

    def __init__(self, base_values, data, values, feature_names):
        # Column names
        self.feature_names = feature_names
        # SHAP values for one row of data
        self.values = values
        # Raw feature values for the same row
        self.data = data
        # Single base (expected) value
        self.base_values = base_values
        
# Explain the single row selected by `row` with a waterfall plot.
# A real shap.Explanation is built instead of the ShapObject stand-in: recent
# shap releases reject anything that is not an Explanation here, while releases
# that accepted the stand-in read the same four attributes from an Explanation.
shap_object = shap.Explanation(
    values=np.asarray(shap_values[1][row, :]),          # SHAP values of the row
    base_values=explainer.expected_value[1],            # expected value, class 1
    data=data[x.columns].iloc[row, :].values,           # raw feature values
    feature_names=list(x.columns),                      # column names
)

shap.waterfall_plot(shap_object)

In [1]:
# Interactive force plot for the first 500 rows (class-1 SHAP values).
shap.force_plot(explainer.expected_value[1], shap_values[1][:500], data[x.columns].iloc[:500], matplotlib = False)

# Export data

In [1]:
# Predict the competition test set and pair each prediction with the
# PassengerId taken from the untouched `testing` frame.
predict_RF = RF_model.predict(test).astype(int)
submit_RF = pd.DataFrame(
    {
        'PassengerId': testing['PassengerId'],
        'Survived': predict_RF,
    }
)

# Write the submission file (without the index column).
filename_RF = 'Titanic Prediction RF.csv'
submit_RF.to_csv(filename_RF, index=False)
print('Saved file: ' + filename_RF)