# Import the relevant libraries

In [None]:
import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import string
import warnings
warnings.filterwarnings('ignore')

seed = 4092022

# Exploratory Data Analysis

## Concatenation

In [None]:
def concatenated_df(train_data, test_data):
    """Stack the training frame on top of the test frame.

    ``sort=True`` is forwarded to ``pd.concat`` so that columns which differ
    between the two frames come out in sorted order, and the rows are
    renumbered 0..n-1 (training rows first).
    """
    stacked = pd.concat([train_data, test_data], sort=True, ignore_index=True)
    return stacked

def divide_df(all_data, n_train=891):
    """Split a concatenated frame back into its training and test parts.

    Parameters
    ----------
    all_data : DataFrame
        Frame whose index labels run 0..len-1 (training rows first) and
        which contains a 'Survived' column.
    n_train : int, optional
        Number of leading rows that belong to the training set. Defaults to
        891, the value that used to be hard-coded here.

    Returns
    -------
    (train, test) : tuple of DataFrame
        ``train`` keeps every column; ``test`` has 'Survived' dropped.
        Both are independent copies, so adding columns to them later does
        not touch ``all_data``.
    """
    # .loc slicing is label-based and inclusive, hence ``n_train - 1``.
    train = all_data.loc[:n_train - 1].copy()
    test = all_data.loc[n_train:].drop(columns=['Survived'])
    return train, test

# Load the two competition CSVs and build the combined frame.
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
df_all = concatenated_df(df_train, df_test)

# Attach a display label to each frame; it is read back through `.name`
# when the missing-value report is printed.
for frame, label in ((df_train, 'Training Set'), (df_test, 'Test Set'), (df_all, 'All Sets')):
    frame.name = label

dfs = [df_train, df_test]

n_train_rows = df_train.shape[0]
n_test_rows = df_test.shape[0]
print('Number of Training Examples = {}'.format(n_train_rows))
print('Number of Test Examples = {}\n'.format(n_test_rows))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['Survived'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(n_test_rows))
for frame in dfs:
    print(frame.columns)

In [None]:
print(df_train.info())
df_train.sample(3)
# The Training Data has 891 rows and 12 columns. The extra column/feature is 'Survived.'
# 'Age', 'Cabin', and 'Embarked' features have missing values.

In [None]:
print(df_test.info())
df_test.sample(3)
# The Test Data has 418 rows and 11 columns.
# 'Age', 'Fare', and 'Cabin' have missing values.

In [None]:
def display_missing(df):
    """Print, for every column of ``df``, how many of its entries are null."""
    null_counts = df.isnull().sum()
    for column, n_missing in null_counts.items():
        print('{} column missing values: {}'.format(column, n_missing))
# This prints column missing values and sums up the total number of missing values per category.
for df in dfs:
    print('{}'.format(df.name))
    display_missing(df)
# To display the total number of missing values per column/feature in the Training and Test datasets.

## Dealing with Missing Values

### 'Age'

In [None]:
# Absolute pairwise correlations, flattened to one row per ordered feature pair.
# The numeric columns are selected explicitly: DataFrame.corr() raises on text
# columns (Name, Sex, Ticket, ...) in pandas >= 2.0 instead of skipping them.
df_all_corr = df_all.select_dtypes(include='number').corr().abs().unstack().sort_values(kind="quicksort", ascending=True).reset_index()
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_all_corr[df_all_corr['Feature 1'] == 'Age']
# To see the correlation between the 'Age' feature and the other numeric features.
# 'Age' is not correlated with 'PassengerId', so 'PassengerId' is of no help when filling in missing ages.
# 'Survived' is the feature I'm trying to predict.

## 'Passenger Class'

In [None]:
# Absolute pairwise correlations, flattened to one row per ordered feature pair.
# The numeric columns are selected explicitly: DataFrame.corr() raises on text
# columns (Name, Sex, Ticket, ...) in pandas >= 2.0 instead of skipping them.
df_all_corr = df_all.select_dtypes(include='number').corr().abs().unstack().sort_values(kind="quicksort", ascending=True).reset_index()
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_all_corr[df_all_corr['Feature 1'] == 'Pclass']
# It is noteworthy that the 'Pclass' and 'Survived' features are correlated.

## Using the Medians of 'Passenger Class' and 'Sex' features to fill in the NaN in 'Age'

In [None]:
# Median age per (Sex, Pclass) group. 'Age' is selected before aggregating so
# the median is never attempted on the text columns (an error in pandas >= 2.0).
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex.loc[(sex, pclass)]))
print('Median age of all passengers: {}'.format(df_all['Age'].median()))
# Median age of 'Pclass' groups is the best choice because of its high correlation with Age (0.408) and Survived (0.338).
# It is also more logical to group ages by passenger classes instead of other features.
# The higher the passenger class (1 being the highest), the higher the median age for both males and females.
# However, females tend to have a slightly lower median age than males.

# transform() returns a Series aligned to df_all's own index, so the assignment
# lines up row by row. groupby.apply() can return a result indexed by the group
# keys instead, which does not align with df_all.
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
# Missing ages are filled with the median of the passenger's own Sex/Pclass group
# rather than with the overall median (28 according to the original note).

### 'Embarked'

In [None]:
df_all[df_all['Embarked'].isnull()]
# To find out the 2 missing values of 'Embarked' feature.
# After researching, I found out that Mrs. Stone embarked from Southampton 'S' with her maid Miss Icard.

In [None]:
df_all['Embarked'] = df_all['Embarked'].fillna('S')
# To fill the missing values in 'Embarked' with S.

### 'Fare'

In [None]:
df_all[df_all['Fare'].isnull()]

In [None]:
# Logically, 'Fare' is related to family size ('Parch' and 'SibSp') and 'Pclass' features.
fare_medians = df_all.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].median()
# The fill value is the median fare of the group Pclass 3, Parch 0, SibSp 0,
# i.e. a third class ticket and no family aboard.
median_fare = fare_medians.loc[(3, 0, 0)]
df_all['Fare'] = df_all['Fare'].fillna(median_fare)

### 'Cabin'

In [None]:
# The 'Cabin' feature has many missing values, but it cannot simply be ignored
# because some cabins might have higher survival rates.
# 'Deck' is the first letter of the 'Cabin' value; 'M' stands for Missing Value.
# After looking at the datasets and doing some research, it turns out that the
# first letter of a 'Cabin' value is in fact the deck on which the cabin is located.
df_all['Deck'] = df_all['Cabin'].map(lambda cabin: 'M' if pd.isnull(cabin) else cabin[0])

# One-row table (row label 'Count') whose columns are (Deck, Pclass) pairs:
# the number of passengers in each combination, counted via the 'Name' column.
deck_pclass_counts = df_all.groupby(['Deck', 'Pclass'])[['Name']].count()
df_all_decks = deck_pclass_counts.rename(columns={'Name': 'Count'}).transpose()


def get_pclass_dist(df):
    """Count passengers per class on every deck and convert to percentages.

    Parameters
    ----------
    df : DataFrame
        Single-row table whose columns form a (Deck, Pclass) MultiIndex,
        as built by grouping on ['Deck', 'Pclass'] and transposing.

    Returns
    -------
    (deck_counts, deck_percentages)
        ``deck_counts`` maps deck -> {pclass: count} for pclass 1..3 (an
        empty dict for a default deck letter absent from ``df``).
        ``deck_percentages`` maps deck -> list of the three class shares in
        percent, ordered Pclass 1, 2, 3.
    """
    # Start from the known deck letters so their order is fixed; any other
    # deck found in ``df`` is appended instead of raising a KeyError.
    deck_counts = {deck: {} for deck in 'ABCDEFGMT'}

    for deck in df.columns.levels[0]:
        per_class = deck_counts.setdefault(deck, {})
        for pclass in range(1, 4):
            try:
                # .iloc[0] reads the single 'Count' row by position. A plain
                # [0] relies on the deprecated positional fallback, and the
                # KeyError it will raise once removed would be swallowed below.
                per_class[pclass] = df[deck][pclass].iloc[0]
            except KeyError:
                # This deck has no passengers of this class.
                per_class[pclass] = 0

    df_decks = pd.DataFrame(deck_counts)
    # Share of each passenger class within every deck, in percent.
    deck_percentages = {
        col: [(count / df_decks[col].sum()) * 100 for count in df_decks[col]]
        for col in df_decks.columns
    }

    return deck_counts, deck_percentages

def display_pclass_dist(percentages):
    """Draw a stacked bar chart of the passenger-class share on every deck.

    Parameters
    ----------
    percentages : dict
        Maps a deck label to three percentages ordered Pclass 1, 2, 3
        (the second value returned by ``get_pclass_dist``).
    """
    df_percentages = pd.DataFrame(percentages).transpose()
    # Tick labels come from the data itself, so every bar is labelled with
    # the deck it really belongs to, whatever set of decks is passed in.
    deck_names = tuple(df_percentages.index)
    bar_count = np.arange(len(deck_names))
    bar_width = 0.9

    pclass1 = df_percentages[0]
    pclass2 = df_percentages[1]
    pclass3 = df_percentages[2]

    plt.figure(figsize=(30, 15))
    # Each class is stacked on top of the previous ones via ``bottom``.
    plt.bar(bar_count, pclass1, color='#87fbb9', edgecolor='black', width=bar_width, label='Passenger Class 1')
    plt.bar(bar_count, pclass2, bottom=pclass1, color='#87c9fb', edgecolor='black', width=bar_width, label='Passenger Class 2')
    plt.bar(bar_count, pclass3, bottom=pclass1 + pclass2, color='#f37200', edgecolor='black', width=bar_width, label='Passenger Class 3')

    plt.xlabel('Deck', size=20, labelpad=30)
    plt.ylabel('Passenger Class Percentage', size=20, labelpad=30)
    plt.xticks(bar_count, deck_names)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)

    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 20})
    plt.title('Passenger Class Distribution in Decks', size=24, y=1)

    plt.show()

all_deck_count, all_deck_per = get_pclass_dist(df_all_decks)
display_pclass_dist(all_deck_per)
# Decks A, B, C were only for Pclass 1. 100% of A, B and C decks are 1st class passengers
# Deck D was for Pclasses 1 and 2. Deck D has 87% 1st class and 13% 2nd class passengers
# Deck E was for all Pclasses. Deck E has 83% 1st class, 10% 2nd class and 7% 3rd class passengers
# Deck F was only for Pclasses 2 and 3. Deck F has 62% 2nd class and 38% 3rd class passengers
# Deck G was only for Pclass 3. 100% of G deck are 3rd class passengers
# M contains all the passengers with missing values.
# Deck T contains only one Pclass 1 passenger.

In [None]:
# Deck T has the closest resemblance to Deck A, so its single passenger is
# regrouped with Deck A.
is_deck_t = df_all['Deck'] == 'T'
dect_regroup = df_all.index[is_deck_t]
df_all.loc[dect_regroup, 'Deck'] = 'A'

In [None]:
# One-row table (row label 'Count') whose columns are (Deck, Survived) pairs:
# the number of passengers in each combination, counted via the 'Name' column.
deck_survival_counts = df_all.groupby(['Deck', 'Survived'])[['Name']].count()
df_all_decks_survived = deck_survival_counts.rename(columns={'Name': 'Count'}).transpose()

def get_survived_dist(df):
    """Count victims and survivors on every deck and convert to percentages.

    Parameters
    ----------
    df : DataFrame
        Single-row table whose columns form a (Deck, Survived) MultiIndex,
        as built by grouping on ['Deck', 'Survived'] and transposing.

    Returns
    -------
    (surv_counts, surv_percentages)
        ``surv_counts`` maps deck -> {0: not survived, 1: survived} (an
        empty dict for a default deck letter absent from ``df``).
        ``surv_percentages`` maps deck -> [not-survived %, survived %].
    """
    # Start from the known deck letters so their order is fixed; any other
    # deck found in ``df`` is appended instead of raising a KeyError.
    surv_counts = {deck: {} for deck in 'ABCDEFGM'}

    for deck in df.columns.levels[0]:
        per_outcome = surv_counts.setdefault(deck, {})
        for survive in range(0, 2):
            try:
                # .iloc[0] reads the single 'Count' row by position instead
                # of relying on the deprecated positional fallback of [0].
                per_outcome[survive] = df[deck][survive].iloc[0]
            except KeyError:
                # Nobody on this deck had this outcome.
                per_outcome[survive] = 0

    df_surv = pd.DataFrame(surv_counts)
    # Share of each outcome within every deck, in percent.
    surv_percentages = {
        col: [(count / df_surv[col].sum()) * 100 for count in df_surv[col]]
        for col in df_surv.columns
    }

    return surv_counts, surv_percentages
    
def display_surv_dist(percentages):
    """Draw a stacked bar chart of the survival share on every deck.

    Parameters
    ----------
    percentages : dict
        Maps a deck label to [not-survived %, survived %] (the second value
        returned by ``get_survived_dist``).
    """
    df_survived_percentages = pd.DataFrame(percentages).transpose()
    # Tick labels come from the data itself, so every bar is labelled with
    # the deck it really belongs to, whatever set of decks is passed in.
    deck_names = tuple(df_survived_percentages.index)
    bar_count = np.arange(len(deck_names))
    bar_width = 0.9

    not_survived = df_survived_percentages[0]
    survived = df_survived_percentages[1]

    plt.figure(figsize=(30, 15))
    # Survivors are stacked on top of the victims via ``bottom``.
    plt.bar(bar_count, not_survived, color='#aa99aa', edgecolor='black', width=bar_width, label="Not Survived")
    plt.bar(bar_count, survived, bottom=not_survived, color='#33cc33', edgecolor='black', width=bar_width, label="Survived")

    plt.xlabel('Deck', size=20, labelpad=30)
    plt.ylabel('Survival Percentage', size=20, labelpad=30)
    plt.xticks(bar_count, deck_names)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)

    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), prop={'size': 15})
    plt.title('Survival Percentage in Decks', size=24, y=1.05)

    plt.show()

all_surv_count, all_surv_per = get_survived_dist(df_all_decks_survived)
display_surv_dist(all_surv_per)
# To count the survival rate per deck.
# Every deck has different survival rates.
# Decks B, C, D, E, and F have the highest survival rates. These decks were mostly occupied by Pclass 1.
# M has the lowest survival rate, which was mostly occupied by 2nd and 3rd class passengers.
# Though it might be that the cabin data of the victims in M is unretrievable, that's why it has the lowest survival rate.
# Cabins used by 1st class passengers have higher survival rates than cabins used by 2nd and 3rd class passengers.

In [None]:
# Collapse the individual deck letters into three coarser groups; 'M' is left as is.
deck_groups = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df_all['Deck'] = df_all['Deck'].replace(deck_groups)

df_all['Deck'].value_counts()
# Decks 'A', 'B', and 'C' are regrouped as 'ABC' because all of them contain only 1st class passengers.
# Decks 'D' and 'E' are regrouped as 'DE' because both of them have a similar passenger class distribution and the same survival rate.
# Decks 'F' and 'G' are regrouped as 'FG' for the same reason.
# Deck 'M' is not regrouped with other decks because it contains missing values that cannot possibly be filled in accurately with the available data and tools.
# Deck 'M' has the lowest survival rate.

In [None]:
df_all.drop(['Cabin'], inplace=True, axis=1)

In [None]:
df_train, df_test = divide_df(df_all)
dfs = [df_train, df_test]

for df in dfs:
    display_missing(df)
# Dropped 'Cabin' because 'Deck' exists already and to prevent multicollinearity.

## Survival Distribution in the Training Dataset

In [None]:
# Count both outcomes once and derive their share of the training set.
survival_counts = df_train['Survived'].value_counts()
survived = survival_counts[1]
not_survived = survival_counts[0]
survived_per = survived / df_train.shape[0] * 100
not_survived_per = not_survived / df_train.shape[0] * 100

print('{} of {} passengers survived and it is the {:.2f}% of the training set.'.format(survived, df_train.shape[0], survived_per))
print('{} of {} passengers did not survive and it is the {:.2f}% of the training set.'.format(not_survived, df_train.shape[0], not_survived_per))

plt.figure(figsize=(12, 10))
# The series is passed as ``x=`` explicitly: the first positional parameter of
# countplot is ``data`` in seaborn >= 0.12, so a bare positional series is not
# interpreted as the variable to count.
sns.countplot(x=df_train['Survived'])

plt.xlabel('Survival', size=20, labelpad=15)
plt.ylabel('Passenger Count', size=20, labelpad=15)
plt.xticks((0, 1), ['Not Survived ({0:.2f}%)'.format(not_survived_per), 'Survived ({0:.2f}%)'.format(survived_per)])
plt.tick_params(axis='x', labelsize=16)
plt.tick_params(axis='y', labelsize=16)

plt.title('Training Set Survival Distribution', size=24, y=1.10)

plt.show()

## Correlation

In [None]:
# Training Data
# Numeric columns are selected explicitly because DataFrame.corr() raises on
# text columns (Name, Sex, Deck, ...) in pandas >= 2.0 instead of skipping them.
train_numeric = df_train.drop(['PassengerId'], axis=1).select_dtypes(include='number')
df_train_corr = train_numeric.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_train_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
# Every pair appears twice, as (a, b) and (b, a); dropping every second row of
# the sorted table is how the original removes the mirrored duplicates.
df_train_corr.drop(df_train_corr.iloc[1::2].index, inplace=True)
# "_nd" = no diagonal: drop the self-correlations (coefficient exactly 1.0).
df_train_corr_nd = df_train_corr.drop(df_train_corr[df_train_corr['Correlation Coefficient'] == 1.0].index)

# Test Data (same steps; 'PassengerId' is kept here, as in the original)
test_numeric = df_test.select_dtypes(include='number')
df_test_corr = test_numeric.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
df_test_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)
df_test_corr.drop(df_test_corr.iloc[1::2].index, inplace=True)
df_test_corr_nd = df_test_corr.drop(df_test_corr[df_test_corr['Correlation Coefficient'] == 1.0].index)

In [None]:
corr = df_train_corr_nd['Correlation Coefficient'] > 0.1
df_train_corr_nd[corr]
# The highest correlation among features in the training set is 0.549500 - 'Fare' and 'Pclass'.
# There are 9 correlations in the training set that are higher than 0.1

In [None]:
corr = df_test_corr_nd['Correlation Coefficient'] > 0.1
df_test_corr_nd[corr]
# The highest correlation between features ('Pclass' and 'Fare') is 0.577 in the test set.
# There are 6 correlations in the test set that are higher than 0.1.

## Heatmap

In [None]:
# Heatmaps of the feature correlations in the training and test sets.
fig, axs = plt.subplots(nrows=2, figsize=(30, 30))

# Numeric columns are selected explicitly because DataFrame.corr() raises on
# text columns in pandas >= 2.0 instead of skipping them.
train_corr = df_train.drop(['PassengerId'], axis=1).select_dtypes(include='number').corr()
test_corr = df_test.drop(['PassengerId'], axis=1).select_dtypes(include='number').corr()

sns.heatmap(train_corr, ax=axs[0], annot=True, square=True, cmap='Blues', annot_kws={'size': 16})
sns.heatmap(test_corr, ax=axs[1], annot=True, square=True, cmap='Blues', annot_kws={'size': 16})

for i in range(2):
    axs[i].tick_params(axis='x', labelsize=16)
    axs[i].tick_params(axis='y', labelsize=16)

axs[0].set_title('Training Set Correlations', size=24)
axs[1].set_title('Test Set Correlations', size=24)

plt.show()

## Continuous Features

In [None]:
# Plot the two continuous features on a 2x2 grid: the top row compares
# survivors with non-survivors, the bottom row compares the training set with
# the test set. One column per feature.
cont_features = ['Age', 'Fare']
# Boolean mask selecting the training rows of passengers who survived.
surv = df_train['Survived'] == 1

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(25, 25))
plt.subplots_adjust(right=1.75)

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; swapping
# it for histplot/kdeplot would change how the figure looks, so it is left as is.
for i, feature in enumerate(cont_features):    
    # Distribution of Survival in 'Age' and 'Fare' features
    sns.distplot(df_train[~surv][feature], label='Not Survived', hist=True, color='#aa99aa', ax=axs[0][i])
    sns.distplot(df_train[surv][feature], label='Survived', hist=True, color='#33cc33', ax=axs[0][i])
    
    # Distribution of 'Age' and 'Fare' features in the training and test sets
    sns.distplot(df_train[feature], label='Training Set', hist=False, color='#3fbddf', ax=axs[1][i])
    sns.distplot(df_test[feature], label='Test Set', hist=False, color='#ffcc00', ax=axs[1][i])
    
    axs[0][i].set_xlabel('')
    axs[1][i].set_xlabel('')
    
    # Here i is used as a row index: with two features on a 2x2 grid this
    # visits row i on each pass, so all four axes get the larger tick labels.
    for j in range(2):        
        axs[i][j].tick_params(axis='x', labelsize=20)
        axs[i][j].tick_params(axis='y', labelsize=20)
    
    axs[0][i].legend(loc='upper right', prop={'size': 20})
    axs[1][i].legend(loc='upper right', prop={'size': 20})
    axs[0][i].set_title('Distribution of Survival in {}'.format(feature), size=20, y=1.05)

# Titles of the bottom row (training set vs. test set).
axs[1][0].set_title('Distribution of {} Feature'.format('Age'), size=20, y=1.05)
axs[1][1].set_title('Distribution of {} Feature'.format('Fare'), size=20, y=1.05)
        
plt.show()
# The distribution of the Age feature clearly shows that children younger than 15 have a higher survival rate than any of the other age groups.
# In the distribution of the Fare feature, the survival rate is higher on the distribution tails. The distribution also has a positive skew because of the extremely large outliers.

## Categorical Features

In [None]:
categorical_features = ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Deck']

# Six features -> one 2x3 grid of axes. The grid is created once and every
# plot is drawn on its own axes. Previously a 3x2 grid was created and
# plt.subplot(2, 3, i) then laid a second, differently shaped grid over it.
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(20, 20))
plt.subplots_adjust(right=1.5, top=1.25)

for ax, feature in zip(axs.flat, categorical_features):
    # Passenger counts per category, split by survival outcome.
    sns.countplot(x=feature, hue='Survived', data=df_train, ax=ax)

    ax.set_xlabel('{}'.format(feature), size=20, labelpad=15)
    ax.set_ylabel('Passenger Count', size=20, labelpad=15)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)

    ax.legend(['Not Survived', 'Survived'], loc='upper center', prop={'size': 18})
    ax.set_title('Count of Survival in {} Feature'.format(feature), size=20, y=1.05)

plt.show()
# Every categorical feature has at least one class with a high mortality rate.
# Those classes are very helpful for predicting whether a passenger is a survivor or a victim.
# The categorical features with the most homogeneous distributions are 'Pclass' and 'Sex'.
# Passengers who boarded at Southampton have the lowest survival rate. More than half of the passengers who boarded at Cherbourg survived. Why? This observation could be related to the 'Pclass' feature.
# The 'Parch' and 'SibSp' features show that passengers with only one family member have the highest survival rate.

## Checkpoint

In [None]:
df_all = concatenated_df(df_train, df_test)
df_all_i = df_all.copy()
dfs_i = dfs.copy()
# Most of the features are correlated with each other. This relationship can be used to create new features with feature transformation/engineering and feature interaction. 
# Created a new feature called 'Deck' and dropped 'Cabin' feature at the Exploratory Data Analysis part

## Advanced Feature Engineering

In [None]:
df_all_i['Fare'] = pd.qcut(df_all_i['Fare'], 13)

In [None]:
# Survival counts per fare bin, drawn on the single axes created here.
fig, axs = plt.subplots(figsize=(24, 11))
sns.countplot(x='Fare', hue='Survived', data=df_all_i, ax=axs)

axs.set_xlabel('Fare', size=18, labelpad=20)
axs.set_ylabel('Passenger Count', size=18, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

axs.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
axs.set_title('Survival Count in {} Feature'.format('Fare'), size=20, y=1.05)

plt.show()
# Fare feature is positively skewed and survival rate is extremely high on the right end. 
# 13 quantile-based bins are used for Fare feature. Even though the bins are too much, they provide decent amount of information gain.
# There is an unusual group (15.742, 23.25] in the middle with high survival rate (survived/not survived) that is captured in this process.
# The groups at the left side of the graph have the lowest survival rate and the groups at the right side of the graph have the highest survival rate.

In [None]:
df_all_i['Age'] = pd.qcut(df_all_i['Age'], 10)

In [None]:
# Survival counts per age bin, drawn on the single axes created here.
fig, axs = plt.subplots(figsize=(24, 11))
sns.countplot(x='Age', hue='Survived', data=df_all_i, ax=axs)

axs.set_xlabel('Age', size=18, labelpad=20)
axs.set_ylabel('Passenger Count', size=18, labelpad=20)
axs.tick_params(axis='x', labelsize=10)
axs.tick_params(axis='y', labelsize=15)

axs.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
axs.set_title('Survival Count in {} Feature'.format('Age'), size=20, y=1.05)

plt.show()
# Age feature has a normal distribution with some spikes and bumps. 
# 10 quantile-based bins are used for Age. 
# The 1st bin has the highest survival rate while the 4th has the lowest survival rate.
# There is also an unusual group (34.0, 40.0) with high survival rate that is captured in this process

## Frequency Encoding

In [None]:
# Family_Size = siblings/spouses + parents/children + 1 (the passenger themself).
df_all_i['Family_Size'] = df_all_i['SibSp'] + df_all_i['Parch'] + 1

fig, axs = plt.subplots(figsize=(24, 24), ncols=2, nrows=2)
plt.subplots_adjust(right=1.5)

# Top row: raw family sizes.
size_counts = df_all_i['Family_Size'].value_counts()
sns.barplot(x=size_counts.index, y=size_counts.values, ax=axs[0][0])
axs[0][0].set_title('Family Size Feature Value Counts', size=20, y=1.05)
sns.countplot(x='Family_Size', hue='Survived', data=df_all_i, ax=axs[0][1])
axs[0][1].set_title('Survival Counts in Family Size ', size=20, y=1.05)

# Size 1 -> 'Alone'; 2, 3, 4 -> 'Small'; 5, 6 -> 'Medium'; 7, 8, 11 -> 'Large'.
family_map = {
    1: 'Alone',
    2: 'Small', 3: 'Small', 4: 'Small',
    5: 'Medium', 6: 'Medium',
    7: 'Large', 8: 'Large', 11: 'Large',
}
df_all_i['Family_Size_Grouped'] = df_all_i['Family_Size'].map(family_map)

# Bottom row: the same two plots after grouping.
grouped_counts = df_all_i['Family_Size_Grouped'].value_counts()
sns.barplot(x=grouped_counts.index, y=grouped_counts.values, ax=axs[1][0])
axs[1][0].set_title('Family Size Feature Value Counts After Grouping', size=20, y=1.05)
sns.countplot(x='Family_Size_Grouped', hue='Survived', data=df_all_i, ax=axs[1][1])
axs[1][1].set_title('Survival Counts in Family Size After Grouping', size=20, y=1.05)

# The index-based nested loop is kept deliberately: it leaves the names i and
# j defined at notebook level exactly as before.
for i in range(2):
    axs[i][1].legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 20})
    for j in range(2):
        panel = axs[i][j]
        panel.tick_params(axis='x', labelsize=20)
        panel.tick_params(axis='y', labelsize=20)
        panel.set_xlabel('')
        panel.set_ylabel('')

plt.show()
# Graphs show that family size is a predictor of survival because different values have different survival rates.

In [None]:
df_all_i['Ticket_Frequency'] = df_all_i.groupby('Ticket')['Ticket'].transform('count')
# There are too many unique Ticket values to analyze, so grouping them up by their frequencies makes things easier.
# 'Ticket_Frequency' is different from 'Family_Size' because many passengers travelled along with groups consisting of friends, nannies, maids, etc. - all of whom weren't counted as family but used the same ticket.

In [None]:
# Survival counts per ticket frequency, drawn on the single axes created here.
fig, axs = plt.subplots(figsize=(14, 11))
sns.countplot(x='Ticket_Frequency', hue='Survived', data=df_all_i, ax=axs)

axs.set_xlabel('Ticket Frequency', size=20, labelpad=20)
axs.set_ylabel('Passenger Count', size=20, labelpad=20)
axs.tick_params(axis='x', labelsize=15)
axs.tick_params(axis='y', labelsize=15)

axs.legend(['Not Survived', 'Survived'], loc='upper right', prop={'size': 15})
axs.set_title('Count of Survival in {} Feature'.format('Ticket Frequency'), size=24, y=1.05)

plt.show()
# According to the graph below, groups with 2, 3, and 4 members had a higher survival rate, while those who traveled alone had the lowest survival rate. 
# After 4 group members, survival rate decreases drastically. 
# This pattern is very similar to Family_Size feature but, there are minor differences as mentioned in the previous block. 
# Ticket_Frequency values are not grouped like Family_Size because that would basically create the same feature with perfect correlation.

In [None]:
# 'Title' is the honorific inside 'Name': the text between the first ', ' and
# the following '.'.
df_all_i['Title'] = df_all_i['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# 'Is_Married' is a binary feature based on the Mrs title. This title has the highest survival rate among the female titles
# and needs its own feature because all female titles are grouped together afterwards.
# It is built in a single vectorised step: the chained form
# df['Is_Married'].loc[mask] = 1 may write to a temporary copy and leave the column unchanged.
df_all_i['Is_Married'] = (df_all_i['Title'] == 'Mrs').astype(int)

In [None]:
fig, axs = plt.subplots(nrows=2, figsize=(20, 20))

# Top panel: how often each raw title occurs.
raw_title_counts = df_all_i['Title'].value_counts()
sns.barplot(x=raw_title_counts.index, y=raw_title_counts.values, ax=axs[0])
axs[0].set_title('Title Feature Value Counts', size=24, y=1.05)
axs[0].tick_params(axis='x', labelsize=10)

# Merge the female titles into one class and the rare professional, military,
# noble and clergy titles into another.
female_titles = ['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona']
rare_titles = ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev']
df_all_i['Title'] = df_all_i['Title'].replace(female_titles, 'Miss/Mrs/Ms')
df_all_i['Title'] = df_all_i['Title'].replace(rare_titles, 'Dr/Military/Noble/Clergy')

# Bottom panel: title frequencies after grouping.
grouped_title_counts = df_all_i['Title'].value_counts()
sns.barplot(x=grouped_title_counts.index, y=grouped_title_counts.values, ax=axs[1])
axs[1].set_title('Title Feature Value Counts After Grouping', size=24, y=1.05)
axs[1].tick_params(axis='x', labelsize=15)

for panel in axs:
    panel.tick_params(axis='y', labelsize=15)

plt.show()
# According to the first graph below, many titles occur only a few times. Some of those titles don't seem correct and they need to be replaced.
# The Miss, Mrs, Ms, Mlle, Lady, Mme, the Countess and Dona titles are replaced with Miss/Mrs/Ms because all of them are female.
# Values like Mlle, Mme and Dona are actually the name of the passengers, but they were previously classified as titles because the 'Name' feature is split by comma.
# The Dr, Col, Major, Jonkheer, Capt, Sir, Don and Rev titles are replaced with Dr/Military/Noble/Clergy because those passengers have similar characteristics.
# Master is a unique title. It is given to male passengers below age 26. They have the highest survival rate among all males.

## Target Encoding

In [None]:
def extract_surname(data):
    """Return the surname of every passenger name in ``data``.

    The surname is the text before the first comma, taken after any
    parenthesised part of the name has been cut off, with all punctuation
    characters removed and surrounding whitespace stripped. A name without
    a comma is returned whole (cleaned the same way).

    Parameters
    ----------
    data : iterable of str
        Passenger names, e.g. the 'Name' column.

    Returns
    -------
    list of str
        One surname per input name, in input order.
    """
    # A single translation table removes every punctuation character in one pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    families = []
    for name in data:
        # Everything from the first '(' onwards is discarded before the
        # surname is located; split() returns the whole name if there is no '('.
        name_no_bracket = name.split('(')[0]
        family = name_no_bracket.split(',')[0]
        families.append(family.translate(strip_punctuation).strip())

    return families
# extract_surname function is used to extract surnames of passengers from the 'Name' feature. 

# The 'Family' feature holds each passenger's surname; it is needed to group
# passengers who belong to the same family.
df_all_i['Family'] = extract_surname(df_all_i['Name'])

# Rows 0..890 are the training set, the rest is the test set. Explicit copies
# are taken because later cells add and overwrite columns on both frames;
# doing that on plain slices of df_all_i is a chained assignment.
df_train = df_all_i.loc[:890].copy()
df_test = df_all_i.loc[891:].copy()

dfs_i = [df_train, df_test]

In [None]:
# Families and tickets that occur in both the training and the test set.
# The test values are put in a set first so each membership test is O(1).
test_families = set(df_test['Family'])
test_tickets = set(df_test['Ticket'])
non_unique_families = [x for x in df_train['Family'].unique() if x in test_families]
non_unique_tickets = [x for x in df_train['Ticket'].unique() if x in test_tickets]

# Per family / ticket: median of 'Survived' and of the group size.
# The columns are selected with a list (tuple-style selection after groupby is
# no longer accepted by pandas) and the text key column is left out, since a
# median of strings is meaningless.
df_family_survival_rate = df_train.groupby('Family')[['Survived', 'Family_Size']].median()
df_ticket_survival_rate = df_train.groupby('Ticket')[['Survived', 'Ticket_Frequency']].median()

shared_families = set(non_unique_families)
shared_tickets = set(non_unique_tickets)

# Keep a rate only for a family that exists in both sets and has more than 1 member.
family_rates = {
    family: rate
    for family, rate, size in zip(
        df_family_survival_rate.index,
        df_family_survival_rate['Survived'],
        df_family_survival_rate['Family_Size'],
    )
    if family in shared_families and size > 1
}

# Keep a rate only for a ticket that exists in both sets and is shared by more than 1 passenger.
ticket_rates = {
    ticket: rate
    for ticket, rate, frequency in zip(
        df_ticket_survival_rate.index,
        df_ticket_survival_rate['Survived'],
        df_ticket_survival_rate['Ticket_Frequency'],
    )
    if ticket in shared_tickets and frequency > 1
}

In [None]:
mean_survival_rate = np.mean(df_train['Survived'])


def _encode_survival_rate(keys, rates, fallback):
    """Look up every key of ``keys`` in the ``rates`` dictionary.

    Returns two parallel lists: the rate of each key (``fallback`` when the
    key has no entry) and a flag that is 1 when the key was found, else 0.
    """
    values = [rates.get(key, fallback) for key in keys]
    found = [1 if key in rates else 0 for key in keys]
    return values, found


# Family_Survival_Rate is calculated from families in the training set, since there is no Survived feature in the test set.
# Passengers whose family has no entry in family_rates get the mean survival rate of the training set.
# Family_Survival_Rate_NA is the accompanying binary flag: 1 when a family rate was found, 0 when the fallback was used.
# The same helper handles both frames. The earlier hand-written test-set loop
# tested `.iloc[j]` with a leftover `j` instead of the loop index, so every
# test row was flagged according to one unrelated passenger.
train_family_survival_rate, train_family_survival_rate_NA = _encode_survival_rate(
    df_train['Family'], family_rates, mean_survival_rate)
test_family_survival_rate, test_family_survival_rate_NA = _encode_survival_rate(
    df_test['Family'], family_rates, mean_survival_rate)

df_train['Family_Survival_Rate'] = train_family_survival_rate
df_train['Family_Survival_Rate_NA'] = train_family_survival_rate_NA
df_test['Family_Survival_Rate'] = test_family_survival_rate
df_test['Family_Survival_Rate_NA'] = test_family_survival_rate_NA

# Ticket_Survival_Rate and Ticket_Survival_Rate_NA features are created with the same method.
train_ticket_survival_rate, train_ticket_survival_rate_NA = _encode_survival_rate(
    df_train['Ticket'], ticket_rates, mean_survival_rate)
test_ticket_survival_rate, test_ticket_survival_rate_NA = _encode_survival_rate(
    df_test['Ticket'], ticket_rates, mean_survival_rate)

df_train['Ticket_Survival_Rate'] = train_ticket_survival_rate
df_train['Ticket_Survival_Rate_NA'] = train_ticket_survival_rate_NA
df_test['Ticket_Survival_Rate'] = test_ticket_survival_rate
df_test['Ticket_Survival_Rate_NA'] = test_ticket_survival_rate_NA

In [None]:
for df in [df_train, df_test]:
    df['Survival_Rate'] = (df['Ticket_Survival_Rate'] + df['Family_Survival_Rate']) / 2
    df['Survival_Rate_NA'] = (df['Ticket_Survival_Rate_NA'] + df['Family_Survival_Rate_NA']) / 2
# Ticket_Survival_Rate and Family_Survival_Rate are averaged and become Survival_Rate.
# Ticket_Survival_Rate_NA and Family_Survival_Rate_NA are also averaged and become Survival_Rate_NA.

## Feature Transformation

In [None]:
non_numeric_features = ['Embarked', 'Sex', 'Deck', 'Title', 'Family_Size_Grouped', 'Age', 'Fare']

for feature in non_numeric_features:
    # One encoder per feature, fitted on the training and test values together,
    # so a given class gets the same integer code in both frames. Fitting a
    # separate encoder per frame gives different codes as soon as one frame
    # lacks a class that the other one has.
    encoder = LabelEncoder().fit(pd.concat([df[feature] for df in dfs_i]))
    for df in dfs_i:
        df[feature] = encoder.transform(df[feature])
# Embarked, Sex, Deck, Title and Family_Size_Grouped are object type.
# Age and Fare features are category type.
# LabelEncoder labels the classes from 0 to n - 1, converting the aforementioned types into a numerical type.
# This process is necessary for the model to learn from these features.

## One-hot Encoding

In [None]:
categorical_features = ['Pclass', 'Sex', 'Deck', 'Embarked', 'Title', 'Family_Size_Grouped']
encoded_features = []

# The categorical features (Pclass, Sex, Deck, Embarked, Title, Family_Size_Grouped)
# are expanded into one-hot columns named '<feature>_1' .. '<feature>_k'.
# 'Age' and 'Fare' are left label encoded because they are ordinal.
#
# Each feature gets a single encoder fitted on the values of every frame in
# dfs_i, so all frames end up with the same set of one-hot columns.
encoders = {
    feature: OneHotEncoder().fit(
        pd.concat([frame[feature] for frame in dfs_i]).values.reshape(-1, 1)
    )
    for feature in categorical_features
}

for df in dfs_i:
    for feature in categorical_features:
        encoded_feat = encoders[feature].transform(df[feature].values.reshape(-1, 1)).toarray()
        # Name the columns after the encoder's actual output width.
        cols = ['{}_{}'.format(feature, k) for k in range(1, encoded_feat.shape[1] + 1)]
        encoded_df = pd.DataFrame(encoded_feat, columns=cols, index=df.index)
        encoded_features.append(encoded_df)

# encoded_features holds one block per (frame, feature) in loop order.
# NOTE(review): the split below assumes dfs_i is [training frame, test frame] -- confirm.
n_encoded = len(categorical_features)
df_train = pd.concat([df_train, *encoded_features[:n_encoded]], axis=1)
df_test = pd.concat([df_test, *encoded_features[n_encoded:]], axis=1)

In [None]:
# Columns excluded from the model input: raw/intermediate features and the target.
# Provenance of the engineered features (from the notebook's earlier sections):
#   * 'Family_Size' = 'Parch' + 'SibSp' + 1.
#   * 'Ticket_Frequency' counts the occurrences of each Ticket value.
#   * 'Title' and 'Is_Married' are derived from the title prefix in 'Name'.
#   * 'Family_Survival_Rate' / 'Family_Survival_Rate_NA' target encode the surname.
#   * 'Ticket_Survival_Rate' target encodes the Ticket feature.
#   * 'Survival_Rate' averages 'Family_Survival_Rate' and 'Ticket_Survival_Rate'.
drop_cols = [
    'Deck', 'Embarked', 'Family', 'Family_Size', 'Family_Size_Grouped', 'Survived',
    'Name', 'Parch', 'PassengerId', 'Pclass', 'Sex', 'SibSp', 'Ticket', 'Title',
    'Ticket_Survival_Rate', 'Family_Survival_Rate', 'Ticket_Survival_Rate_NA', 'Family_Survival_Rate_NA',
]

# Combined train + test frame restricted to the columns that are kept.
df_all_i = concatenated_df(df_train, df_test).drop(columns=drop_cols)
df_all_i.head()

# Model

In [None]:
# Fit the scaler on the training features only and reuse the same mean/variance
# for the test features, so both sets live in the same scaled space.
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(df_train.drop(columns=drop_cols))
y_train = df_train['Survived'].values
X_test = feature_scaler.transform(df_test.drop(columns=drop_cols))

print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test: {}'.format(X_test.shape))

In [None]:
# Random forest used for every cross-validation fold below.
single_best_model = RandomForestClassifier(criterion='gini',
                                           n_estimators=1100,
                                           max_depth=5,
                                           min_samples_split=4,
                                           min_samples_leaf=5,
                                           # 'sqrt' is what 'auto' meant for classifiers; 'auto'
                                           # is deprecated/removed in newer scikit-learn releases.
                                           max_features='sqrt',
                                           # Needed for the oob_score_ read after each fit.
                                           oob_score=True,
                                           random_state=seed,
                                           n_jobs=-1,
                                           verbose=1)

In [None]:
N = 5  # number of stratified folds
oob = 0  # running mean of the per-fold out-of-bag scores
# Two columns per fold: probability of class 0 and of class 1 for every test row.
probs = pd.DataFrame(np.zeros((len(X_test), N * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, N + 1) for j in range(2)])
# Label importances with the exact columns X_train was built from, so the row
# labels are guaranteed to line up with feature_importances_.
feature_names = df_train.drop(columns=drop_cols).columns
importances = pd.DataFrame(np.zeros((X_train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=feature_names)
# scores: training AUC per fold; fprs/tprs: validation ROC points per fold
# (the inputs expected by plot_roc_curve).
fprs, tprs, scores = [], [], []

skf = StratifiedKFold(n_splits=N, random_state=N, shuffle=True)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    print('Fold {}\n'.format(fold))

    single_best_model.fit(X_train[trn_idx], y_train[trn_idx])

    # Computing Train AUC score
    trn_fpr, trn_tpr, trn_thresholds = roc_curve(y_train[trn_idx], single_best_model.predict_proba(X_train[trn_idx])[:, 1])
    trn_auc_score = auc(trn_fpr, trn_tpr)

    scores.append(trn_auc_score)

    # Validation ROC on the held-out part of this fold
    val_fpr, val_tpr, val_thresholds = roc_curve(y_train[val_idx], single_best_model.predict_proba(X_train[val_idx])[:, 1])
    fprs.append(val_fpr)
    tprs.append(val_tpr)

    # X_test probabilities (predicted once, then split into both class columns)
    test_probs = single_best_model.predict_proba(X_test)
    probs.loc[:, 'Fold_{}_Prob_0'.format(fold)] = test_probs[:, 0]
    probs.loc[:, 'Fold_{}_Prob_1'.format(fold)] = test_probs[:, 1]
    importances.iloc[:, fold - 1] = single_best_model.feature_importances_

    oob += single_best_model.oob_score_ / N
    print('Fold {} OOB Score: {}\n'.format(fold, single_best_model.oob_score_))

print('Average OOB Score: {}'.format(oob))

In [None]:
# Average the per-fold importances and order the features from most to least important.
importances = importances.assign(Mean_Importance=importances.mean(axis=1)).sort_values(
    by='Mean_Importance', ascending=False
)

# Horizontal bar chart: one bar per feature, length = mean importance.
plt.figure(figsize=(15, 20))
sns.barplot(data=importances, x='Mean_Importance', y=importances.index)

plt.xlabel('')
plt.tick_params(axis='both', labelsize=15)
plt.title('Random Forest Classifier Mean Feature Importance Between Folds', size=15)

plt.show()

In [None]:
def plot_roc_curve(fprs, tprs):
    """Plot the ROC curve of every fold together with their mean curve.

    Args:
        fprs: sequence with one array of false positive rates per fold.
        tprs: sequence with one array of true positive rates per fold,
            paired element-wise with ``fprs``.

    Raises:
        ValueError: if no fold data is supplied.

    The figure shows each fold's curve and AUC, the random-guessing diagonal,
    the mean curve interpolated on a common 100-point FPR grid, and a band of
    one standard deviation around the mean.
    """
    # Without any fold there is nothing to average; fail before drawing.
    if len(fprs) == 0 or len(tprs) == 0:
        raise ValueError('plot_roc_curve needs at least one fold of fpr/tpr values')

    tprs_interp = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    f, ax = plt.subplots(figsize=(15, 15))

    # Plotting ROC for each fold and computing AUC scores
    for i, (fpr, tpr) in enumerate(zip(fprs, tprs), 1):
        # Resample onto the shared FPR grid so folds can be averaged point-wise.
        tprs_interp.append(np.interp(mean_fpr, fpr, tpr))
        tprs_interp[-1][0] = 0.0  # force the curve to start at the origin
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        ax.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC Fold {} (AUC = {:.3f})'.format(i, roc_auc))

    # Plotting ROC for random guessing
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=0.8, label='Random Guessing')

    mean_tpr = np.mean(tprs_interp, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)

    # Plotting the mean ROC (raw string keeps the mathtext backslash intact)
    ax.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = {:.3f} $\pm$ {:.3f})'.format(mean_auc, std_auc), lw=2, alpha=0.8)

    # Plotting the standard deviation around the mean ROC Curve, clipped to [0, 1]
    std_tpr = np.std(tprs_interp, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=r'$\pm$ 1 std. dev.')

    ax.set_xlabel('False Positive Rate', size=15, labelpad=20)
    ax.set_ylabel('True Positive Rate', size=15, labelpad=20)
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])

    ax.set_title('ROC Curves of Folds', size=20, y=1.02)
    ax.legend(loc='lower right', prop={'size': 13})

    plt.show()

In [None]:
# Average the per-fold test probabilities for each class.
class_survived = [col for col in probs.columns if col.endswith('Prob_1')]
class_died = [col for col in probs.columns if col.endswith('Prob_0')]
probs['1'] = probs[class_survived].sum(axis=1) / N
# Select the Prob_0 columns explicitly: dropping the Prob_1 columns instead
# would also sum the freshly added '1' column into the class-0 average.
probs['0'] = probs[class_died].sum(axis=1) / N
# Predict survival when the averaged class-1 probability reaches 0.5.
probs['pred'] = (probs['1'] >= 0.5).astype(int)

y_pred = probs['pred'].astype(int)

submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = df_test['PassengerId']
# .values: probs and df_test carry different index labels, so assign by position.
submission_df['Survived'] = y_pred.values
submission_df.to_csv('submissions.csv', header=True, index=False)
submission_df.head(10)