In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
import matplotlib.ticker as tick
from matplotlib.colors import LinearSegmentedColormap
import matplotlib as mpl

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import xgboost as xgb


import os
# Print the path of every file under the Kaggle input directory so the dataset locations can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Intro

Ah, the Titanic problem — a classic!

The Titanic problem is one of the legendary problems here on Kaggle. We are given a dataset about the passengers who boarded the Titanic, which sank on April 15, 1912, and we try to predict whether a passenger with certain features survived or not.

So in this notebook I'm only trying to explore the data, find some insights, and plot some visualizations to understand more about the data. Hope you like it!


### Reference
Oh, and my visualizations are highly inspired by Subin An's visualizations. I highly recommend his notebooks if you are trying to learn about data visualization like I am 😃! Here is the link:

reference: https://www.kaggle.com/subinium/kaggle-2020-visualization-analysis

# Colors

In [None]:
# Shared palettes used by every figure; each list runs from its darkest to its lightest shade.
colors_dark = [
    "#1F1F1F", "#313131", "#636363", "#AEAEAE", "#DADADA",
]
colors_blue = [
    "#21484A", "#2D696C", "#46A3A9", "#5CD8E0", "#B6F4F8",
]
colors_red = [
    "#461818", "#6F2525", "#A03535", "#E14B4B", "#FF5151",
]

In [None]:
# code for creating custom cmap
# ref: https://stackoverflow.com/a/50230769/12033506

def NonLinCdict(steps, hexcol_array):
    """Build a segment dictionary for ``LinearSegmentedColormap`` from hex colours.

    Parameters
    ----------
    steps : iterable of float
        Position of each colour stop along the colormap.
    hexcol_array : iterable of str
        Hex colour strings, paired one-to-one with ``steps``.

    Returns
    -------
    dict
        ``{'red': ..., 'green': ..., 'blue': ...}`` where each value is a tuple of
        ``(step, level, level)`` entries, one per colour stop.
    """
    channels = ('red', 'green', 'blue')
    stops = {channel: [] for channel in channels}
    for position, hex_code in zip(steps, hexcol_array):
        rgb = mpl.colors.hex2color(hex_code)
        # The same level is used on both sides of the stop, so the map has no jumps.
        for channel, level in zip(channels, rgb):
            stops[channel].append((position, level, level))
    return {channel: tuple(entries) for channel, entries in stops.items()}

# Positions along the colormap at which each colour stop is placed.
th = [0, 0.4, 0.5, 0.8, 1]
# Colour stops: the lightest grey first, then the first four blues in reverse (light -> dark).
cdict = NonLinCdict(th, (colors_blue[:4] + colors_dark[-1:])[::-1])
cm = LinearSegmentedColormap('test', cdict)
# Bare expression so the notebook displays the colormap as the cell output.
cm

In [None]:
def reformat_large_tick_values(tick_val, pos):
    """
    Abbreviate a tick value with a K / M / B suffix, for use with ``FuncFormatter``.

    Turns large tick values (in the billions, millions and thousands) such as 4500
    into 4.5K and also turns 4000 into 4K (no zero after the decimal). Values whose
    magnitude is below 1000 are only rounded to one decimal place.

    Parameters
    ----------
    tick_val : int or float
        The tick value supplied by matplotlib.
    pos : int or None
        The tick position supplied by matplotlib; unused.

    Returns
    -------
    str
        The formatted tick label.
    """
    # Compare on magnitude so negative values are abbreviated too (-4500 -> "-4.5K").
    magnitude = abs(tick_val)
    scaled = tick_val
    suffix = ""
    for threshold, unit in ((1000000000, "B"), (1000000, "M"), (1000, "K")):
        if magnitude >= threshold:
            scaled = tick_val / threshold
            suffix = unit
            break

    number = str(round(scaled, 1))

    # Keep 4.5 as is but turn 4.0 into 4. Only a *trailing* ".0" is removed, so
    # exponent notation such as "1.05e+16" is left untouched.
    if number.endswith(".0"):
        number = number[:-2]

    return number + suffix

In [None]:
# Load the training split for the exploratory analysis; the PassengerId column is dropped.
df_train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2021/train.csv"
).drop(columns="PassengerId")

In [None]:
# Percentage of missing values per column (largest first) and percentage of present values (smallest first).
train_null = (df_train.isnull().sum() / len(df_train) * 100).sort_values(ascending=False)
train_notnull = (df_train.notnull().sum() / len(df_train) * 100).sort_values()

fig, ax = plt.subplots(figsize=(14, 8))

# Stacked bars: the missing share at the bottom, the present share (faded) stacked on top of it.
# NOTE(review): `bottom=train_null.values` is paired with train_notnull by position; the two series are
# sorted independently, so this relies on both sorts yielding the same column order — confirm.
bars1 = ax.bar(x=train_null.index, height=train_null.values, color=colors_blue[0])
bars2 = ax.bar(x=train_notnull.index, height=train_notnull.values, bottom=train_null.values, alpha=0.3, color=colors_dark[-1])

# Write the missing percentage just above each bottom bar (x + 0.4 is the centre of a default 0.8-wide bar).
for bar in bars1: 
    height = bar.get_height()
    x = bar.get_x()
    
    ax.text(
        x=x+0.4, y=height + 2.5,
        ha='center',
        s="{:.2f}%".format(height),
        fontsize=12,
        color=colors_dark[0],
    )

# Only one label is passed, so only the first artist (the missing-value bars) gets a legend entry.
ax.legend(["Missing values (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_xlabel("Feature", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

# suptitle is the headline; the axes title is used as a subtitle carrying the takeaway.
plt.suptitle("Missing Value Percentage", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("The only feature with most significance loss is Cabin", loc='center', x=0.48, y=1.03, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

# Analysis on Gender

In [None]:
def to_pct(row):
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    scaled = row * 100
    return scaled / row.sum()

# Rows: Sex, columns: Survived (0 / 1); counts are converted to row-wise percentages.
df = df_train.groupby(['Sex', 'Survived']).count()['Name'].unstack()
df = df.apply(to_pct, axis=1).reset_index()

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: raw counts per sex, survivors (hue 1) drawn before non-survivors (hue 0).
bars0 = sns.countplot(data=df_train, x='Sex', hue='Survived', palette=list([colors_blue[1], colors_dark[-1]]), ax=ax[0], hue_order=[1, 0], order=['female', 'male'])

# Right panel: 100% stacked bars, survived share at the bottom and the non-survived share (faded) on top.
bars1 = ax[1].bar(x=df.Sex, height=df[1], label='Survived', color=colors_blue[1])
bars2 = ax[1].bar(x=df.Sex, height=df[0], bottom=df[1], label="Dead", color=colors_dark[-1], alpha=0.3)

# Print the survival percentage inside each survived bar, at (floored) half of its height.
for bar in bars1 : 
    height = bar.get_height() 
    x = bar.get_x() 
    
    ax[1].text(
        x=x+0.4, y=height // 2,
        va='center', ha='center', 
        s=f"{round(height)}%",
        color='white',
        fontweight='bold',
        fontsize=24
    )
    
# AX settings
    
ax[0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[0].grid(axis='y', alpha=0.2)
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax[0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_xlabel(None)

ax[0].spines['right'].set_visible(False)
ax[0].spines['left'].set_visible(False)
ax[0].spines['top'].set_visible(False)
    
ax[1].legend(["Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[1].grid(axis='y', alpha=0.2)
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
ax[1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax[1].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['top'].set_visible(False)

# plt.title targets the current axes; the negative x offset shifts the subtitle left of that axes.
plt.title("It seems that female has a higher chance of survival comapared to male", loc='center', x=-0.1, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("Survival Rate Male and Female", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

In [None]:
def to_pct(row) : 
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    return row*100 / row.sum()

# One row of panels per passenger class: counts on the left, survival percentages on the right.
fig, ax = plt.subplots(3, 2, figsize=(16, 16))
for i in range(3) : 

    # Sex x Survived percentages restricted to the passengers of class i+1.
    df = df_train[df_train.Pclass == i+1]
    df = df.groupby(['Sex', 'Survived']).count()['Name'].unstack()
    df = df.apply(to_pct, axis=1).reset_index()


    bars0 = sns.countplot(data=df_train[df_train.Pclass == i+1], x='Sex', hue='Survived', palette=list([colors_blue[1], colors_dark[-1]]), ax=ax[i, 0], hue_order=[1, 0], order=['female', 'male'])

    # 100% stacked bars: survived share at the bottom, non-survived share (faded) on top.
    bars1 = ax[i, 1].bar(x=df.Sex, height=df[1], label='Survived', color=colors_blue[1])
    bars2 = ax[i, 1].bar(x=df.Sex, height=df[0], bottom=df[1], label="Dead", color=colors_dark[-1], alpha=0.3)

    # Print the survival percentage inside each survived bar.
    for bar in bars1 : 
        height = bar.get_height() 
        x = bar.get_x() 

        ax[i, 1].text(
            x=x+0.4, y=height // 2,
            va='center', ha='center', 
            s=f"{round(height)}%",
            color='white',
            fontweight='bold',
            fontsize=24
        )

    # AX settings
    ax[i, 0].set_title(f"Pclass = {i+1}", loc='left', fontweight='bold', fontsize=14, color=colors_dark[1])

    ax[i, 0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
    ax[i, 0].grid(axis='y', alpha=0.2)
    ax[i, 0].set_axisbelow(True)
    ax[i, 0].tick_params(labelsize=12)
    ax[i, 0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
    ax[i, 0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
    ax[i, 0].set_xlabel(None)

    ax[i, 0].spines['right'].set_visible(False)
    ax[i, 0].spines['left'].set_visible(False)
    ax[i, 0].spines['top'].set_visible(False)

    ax[i, 1].legend(["Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
    ax[i, 1].grid(axis='y', alpha=0.2)
    ax[i, 1].set_axisbelow(True)
    ax[i, 1].tick_params(labelsize=12)
    ax[i, 1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

    ax[i, 1].spines['right'].set_visible(False)
    ax[i, 1].spines['left'].set_visible(False)
    ax[i, 1].spines['top'].set_visible(False)

# The trailing newlines in the suptitle leave room for the three-line commentary placed with fig.text.
fig.suptitle("Survival Rate Male and Female On Each Class\n\n\n", fontsize=18, fontweight='bold', color=colors_dark[0])
fig.text(ha='center', va='center', x=0.5, y=0.94, s="It is interesting to see the Pclass=3\nmost of the passenger on Pclass=3 is male\nand both female and male on Pclass=3, has a low survival chance", fontsize=14, color=colors_dark[2])
# NOTE(review): tight_layout() runs right after this call and recomputes the subplot parameters,
# so the top=2 adjustment presumably has no visible effect — confirm before relying on it.
plt.subplots_adjust(top=2)
plt.tight_layout()
plt.show()

# Analysis on Age

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
age_mean = df_train.Age.mean()
age_median = df_train.Age.median()


# Filled density curve of Age. `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(data=df_train, x='Age', fill=True, color=colors_blue[0], ax=ax, alpha=1)

# Dashed vertical markers for the mean and the median age.
ax.axvline(x=age_mean, linestyle='--', color=colors_blue[2])
ax.axvline(x=age_median, linestyle='--', color=colors_blue[3])

# Callouts for both markers; the y coordinates are hand-picked for this figure.
ax.annotate(
    text="Average\n {:.2f}".format(age_mean),
    fontsize=12,
    xy=(age_mean, 0.021),
    xytext=(age_mean-8, 0.022),
    color=colors_dark[2],
    arrowprops=dict(
        arrowstyle= '-',
        color=colors_dark[2],
        ls='--'
    )
)

ax.annotate(
    text="Median: {:.2f}".format(age_median),
    fontsize=12,
    xy=(age_median, 0.02),
    xytext=(age_mean+3, 0.021),
    color=colors_dark[2],
    arrowprops=dict(
        arrowstyle= '-',
        color=colors_dark[2],
        ls='--'
    )
)


ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_xlabel("Age", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Density", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

# suptitle is the headline; the axes title is used as a subtitle carrying the takeaway.
plt.suptitle("How is The Age Distributed?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("We can see that there are three hills: young, middle-aged, old. My guess is that most of old and young will travel in family\nthe mean and median is pretty much the same you might want to check with other feature if you want to impute missing value for age", loc='center', x=0.48, y=1.05, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))


# Age density split by survival outcome. `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(data=df_train, x='Age', hue='Survived', ax=ax, fill=True, hue_order=[1, 0], palette=list([colors_blue[0], colors_dark[-1]]), alpha=1)

ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_xlabel("Age", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Density", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
# NOTE(review): the labels are matched to the artists in the order they sit on the axes; the listed
# order presumably reflects seaborn's drawing order for the hue levels — verify against the rendered plot.
ax.legend(["Not Survived", 'Survived'], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=3, frameon=False, fontsize=12)


ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.suptitle("How is The Age Distributed?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("Not so much interesting, but we can see that majority who didnt survived is between 20 to 30-ish\n", loc='center', x=0.48, y=1.02, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(14, 14))

# Commentary placed in figure coordinates, below the suptitle.
fig.text(ha='center', va='center', x=0.5, y=0.935, s="We can see that most who didnt survive is on Pclass=3, many people on Pclass one survive\nwe can also see that most people on Pclass=1 is old people", fontsize=14, color=colors_dark[2])


# One panel per passenger class.
for i in range(3) :
    

    df = df_train[df_train.Pclass == i+1]
    age_median =  df.Age.median()
    
    # The median line is drawn first so that it is the first legend entry.
    ax[i].axvline(x=age_median, linestyle='--', color=colors_red[2], label='Median')

    # `fill=True` is the supported spelling of the deprecated `shade=True`.
    sns.kdeplot(data=df, x='Age', hue='Survived', ax=ax[i], fill=True, alpha=0.4, hue_order=[1, 0], palette=list([colors_blue[1], colors_dark[-2]]))
    ax[i].set_title(f"Pclass = {i+1}", loc='left', fontweight='bold', fontsize=14, color=colors_dark[1])
    ax[i].grid(axis='y', alpha=0.2)
    ax[i].set_axisbelow(True)
    ax[i].tick_params(labelsize=12)
    ax[i].set_xlabel("Age", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
    ax[i].set_ylabel("Density", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
    ax[i].legend(["Median", 'Not Survived', "Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=3, frameon=False, fontsize=12)


    ax[i].spines['right'].set_visible(False)
    ax[i].spines['left'].set_visible(False)
    ax[i].spines['top'].set_visible(False)

# The trailing newlines in the suptitle leave room for the commentary placed with fig.text.
plt.suptitle("Age Distribution for Every Pclass?\n\n\n", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# [start, end] of each shaded age band, in years.
baby = [0, 5]
children = [6, 16]
young_adult = [17,29]
middle_aged = [30,49]
senior = [50, 90]

# One entry per band: (age range, band colour, caption, caption colour).
_age_bands = [
    (baby, colors_blue[0], 'Baby', 'white'),
    (children, colors_blue[0], 'Children', 'white'),
    (young_adult, colors_blue[2], 'Young\nAdult', 'white'),
    (middle_aged, colors_blue[3], 'Middle\nAged', colors_dark[1]),
    (senior, colors_blue[4], 'Senior', colors_dark[1]),
]

# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(data=df_train, x='Age', fill=True, color=colors_blue[0], ax=ax)

# Shade every band first, then caption it; all spans are added before any text.
for _bounds, _band_color, _caption, _caption_color in _age_bands:
    ax.axvspan(_bounds[0], _bounds[1], alpha=0.8, color=_band_color)

for _bounds, _band_color, _caption, _caption_color in _age_bands:
    ax.text(
        # Integer midpoint of the band, nudged slightly to the right.
        x=(sum(_bounds) // 2) + 0.12, y=0.015,
        s=_caption,
        ha='center',
        va='center',
        fontsize=16,
        color=_caption_color,
        fontweight='bold'
    )


ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_xlabel("Age", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Density", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

# NOTE(review): the subtitle says "four groups" while five bands are drawn — confirm the intended wording.
plt.suptitle("Age Categorization", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("Lets try to divide the age into four groups, i expect that children and senior have a higher chance of survival", loc='center', x=0.48, y=1.05, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
# Age band edges and the label of each band.
bins = [0, 5, 17, 30, 50, 100]
labels = ["Baby", "Children", "Young Adult", "Middle Aged", "Senior"]
# Bin the ages of the full training frame. `df` must not be used here: it is a scratch name
# reused by the plotting cells and at this point holds only the rows of a single Pclass,
# which would leave 'Age Group' missing for every other passenger after index alignment.
df_train['Age Group'] = pd.cut(df_train['Age'], bins=bins, labels=labels) 

In [None]:
def to_pct(row) : 
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    return row*100 / row.sum()

# Rows: Age Group, columns: Survived (0 / 1); counts are converted to row-wise percentages.
df = df_train.groupby(['Age Group', 'Survived']).count()['Name'].unstack()
df = df.apply(to_pct, axis=1).reset_index()

# Unweighted mean of the per-group survival percentages. Only the numeric column 1 is averaged:
# calling mean() on the whole frame would also include the non-numeric 'Age Group' column,
# which newer pandas versions reject.
mean_survive = df[1].mean()

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: raw counts per age group, survivors (hue 1) drawn before non-survivors (hue 0).
bars0 = sns.countplot(data=df_train, x='Age Group', hue='Survived', palette=list([colors_blue[1], colors_dark[-1]]), ax=ax[0], hue_order=[1, 0])

# Right panel: 100% stacked bars, survived share at the bottom and the non-survived share (faded) on top.
bars1 = ax[1].bar(x=df['Age Group'], height=df[1], label='Survived', color=colors_blue[1])
bars2 = ax[1].bar(x=df['Age Group'], height=df[0], bottom=df[1], label="Dead", color=colors_dark[-1], alpha=0.3)

line1 = ax[1].axhline(y=mean_survive, linestyle="--", color=colors_red[-2])

# Print the survival percentage inside each survived bar.
for bar in bars1 : 
    height = bar.get_height() 
    x = bar.get_x() 
    
    ax[1].text(
        x=x+0.4, y=height // 2,
        va='center', ha='center', 
        s=f"{round(height)}%",
        color='white',
        fontweight='bold',
        fontsize=24
    )
    
# Badge for the average line; x is a hand-picked position near the right edge of the bars.
ax[1].text(
    x=3.8, 
    y=mean_survive,
    s="Average rate {:.2f}%".format(mean_survive),
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=12
)
    
# AX settings
    
ax[0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[0].grid(axis='y', alpha=0.2)
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax[0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_xlabel(None)

ax[0].spines['right'].set_visible(False)
ax[0].spines['left'].set_visible(False)
ax[0].spines['top'].set_visible(False)
    
ax[1].legend(["Average rate of survival", "Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[1].grid(axis='y', alpha=0.2)
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
ax[1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax[1].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['top'].set_visible(False)

plt.title("As expected children and senior have a higher Rate of Survival", loc='center', x=0.038, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("Survival Rate For Every Age Group", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

# Analysis on Embarked

In [None]:
# Port code -> port name; missing values are mapped to their own "Not Embarked" category.
embark_mapping = {
    "S" : "Southampton",
    'C' : "Cherbourg", 
    "Q" : "Queenstown",
    np.nan : "Not Embarked"
}

# Apply the mapping to the Embarked column only; replace() returns a new frame.
df_train = df_train.replace({"Embarked": embark_mapping})

In [None]:
def to_pct(row) : 
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    return row*100 / row.sum()

# Rows: Embarked, columns: Survived (0 / 1); counts are converted to row-wise percentages.
df = df_train.groupby(['Embarked', 'Survived']).count()['Name'].unstack()
df = df.apply(to_pct, axis=1).reset_index()

# Unweighted mean of the per-port survival percentages. Only the numeric column 1 is averaged:
# calling mean() on the whole frame would also include the text 'Embarked' column,
# which newer pandas versions reject.
mean_survive = df[1].mean()

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: raw counts per port, in the same port order as the percentage table.
bars0 = sns.countplot(data=df_train, x='Embarked', hue='Survived', palette=list([colors_blue[1], colors_dark[-1]]), ax=ax[0], hue_order=[1, 0], order=df.Embarked)

# Right panel: 100% stacked bars, survived share at the bottom and the non-survived share (faded) on top.
bars1 = ax[1].bar(x=df['Embarked'], height=df[1], label='Survived', color=colors_blue[1])
bars2 = ax[1].bar(x=df['Embarked'], height=df[0], bottom=df[1], label="Dead", color=colors_dark[-1], alpha=0.3)

line1 = ax[1].axhline(y=mean_survive, linestyle="--", color=colors_red[-2])

# Print the survival percentage inside each survived bar.
for bar in bars1 : 
    height = bar.get_height() 
    x = bar.get_x() 
    
    ax[1].text(
        x=x+0.4, y=height // 2,
        va='center', ha='center', 
        s=f"{round(height)}%",
        color='white',
        fontweight='bold',
        fontsize=24
    )
    
# Badge for the average line; x is a hand-picked position.
ax[1].text(
    x=3.8, 
    y=mean_survive,
    s="Average rate {:.2f}%".format(mean_survive),
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=12
)
    
# AX settings
    
ax[0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[0].grid(axis='y', alpha=0.2)
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax[0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_xlabel(None)

ax[0].spines['right'].set_visible(False)
ax[0].spines['left'].set_visible(False)
ax[0].spines['top'].set_visible(False)
    
ax[1].legend(["Average rate of survival", "Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[1].grid(axis='y', alpha=0.2)
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
ax[1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax[1].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['top'].set_visible(False)

plt.title("Most people embarked from Southampton, people from Cherbough have a high rate of survival, why is that?", loc='center', x=0.038, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("How is the Survival Rate for Every Embarked Spot?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))


# Rows: Pclass, columns: Embarked; each cell is the passenger count for that combination.
df = df_train.groupby(['Pclass', 'Embarked']).count()['Name'].unstack()

# One stacked bar per class, one colour per Embarked value (four colours are supplied).
df.plot(kind='bar', stacked=True, ax=ax, color=list([colors_blue[0],colors_dark[2], colors_blue[2], colors_dark[4]]), rot=0)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=3, frameon=False, fontsize=12)
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_ylabel("Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_xlabel("Pclass", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
# Abbreviate large counts on the y axis (e.g. 4000 -> 4K).
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))

ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.title("It turns out that many people boarding from Cherbourg board in Pclass=1 while majority boarding from Southampton are on Pclass=3", loc='center', x=0.515, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("What Pclass Are People in for Every Embark City?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

# Analysis on Family

In [None]:
# Family size counts the passenger too: siblings/spouses + parents/children + 1.
df_train['Famsize'] = df_train['SibSp'] + df_train['Parch'] + 1

def hasfam(x) : 
    """Return "With family" when the row's Famsize is not 1, otherwise "Alone"."""
    return "With family" if x.Famsize != 1 else "Alone"

# Vectorised form of applying hasfam to every row: same labels, without a Python-level loop over the rows.
df_train['Hasfam'] = np.where(df_train['Famsize'] != 1, "With family", "Alone")

In [None]:
def to_pct(row) : 
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    return row*100 / row.sum()

# Rows: Hasfam, columns: Survived (0 / 1); counts are converted to row-wise percentages.
df = df_train.groupby(['Hasfam', 'Survived']).count()['Name'].unstack()
df = df.apply(to_pct, axis=1).reset_index()

# Unweighted mean of the two survival percentages. Only the numeric column 1 is averaged:
# calling mean() on the whole frame would also include the text 'Hasfam' column,
# which newer pandas versions reject.
mean_survive = df[1].mean()

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: raw counts, in the same category order as the percentage table.
bars0 = sns.countplot(data=df_train, x='Hasfam', hue='Survived', palette=list([colors_blue[1], colors_dark[-1]]), ax=ax[0], hue_order=[1, 0], order=df.Hasfam)

# Right panel: 100% stacked bars, survived share at the bottom and the non-survived share (faded) on top.
bars1 = ax[1].bar(x=df['Hasfam'], height=df[1], color=colors_blue[1])
bars2 = ax[1].bar(x=df['Hasfam'], height=df[0], bottom=df[1], color=colors_dark[-1], alpha=0.3)

line1 = ax[1].axhline(y=mean_survive, linestyle="--", color=colors_red[-2])

# Print the survival percentage inside each survived bar.
for bar in bars1 : 
    height = bar.get_height() 
    x = bar.get_x() 
    
    ax[1].text(
        x=x+0.4, y=height // 2,
        va='center', ha='center', 
        s=f"{round(height)}%",
        color='white',
        fontweight='bold',
        fontsize=24
    )
    
# Badge for the average line; x is a hand-picked position to the right of the two bars.
ax[1].text(
    x=1.6, 
    y=mean_survive,
    s="Average rate {:.2f}%".format(mean_survive),
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=12
)
    
# AX settings
    
ax[0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[0].grid(axis='y', alpha=0.2)
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax[0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_xlabel(None)

ax[0].spines['right'].set_visible(False)
ax[0].spines['left'].set_visible(False)
ax[0].spines['top'].set_visible(False)
    
ax[1].legend(["Average rate of survival", "Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[1].grid(axis='y', alpha=0.2)
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
ax[1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax[1].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['top'].set_visible(False)

plt.title("People boarding the titanic with family has a 7% higher rate of survival, most of titanic passenger travels alone", loc='center', x=0.038, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("How is the Survival Rate for Those Who Has a Family?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

In [None]:
colors = ['#21484A', '#388085', '#54C2C9', '#4382BC', '#04A8B0', '#AEAEAE']
# Passenger count per family size, restricted to passengers travelling with family.
df = df_train[df_train['Hasfam'] == "With family"]
df = df.groupby('Famsize')['Name'].count()

# Keep the five smallest family sizes as their own wedges and pool everything larger into one.
# Positional slicing is spelled with .iloc, and pd.concat replaces Series.append, which newer
# pandas versions no longer provide.
df_cutted = df.iloc[5:].sum()
df = pd.concat([df.iloc[:5], pd.Series(df_cutted)])
# NOTE(review): the "With family" rows have Famsize != 1, so the first wedge presumably is Famsize 2;
# these labels look like the number of accompanying relatives (Famsize - 1) — confirm the intent.
labels = ["1", "2", "3", "4", "5", "> 5"]
explode = (0.03,0.03,0.03,0.03, 0.03, 0.03)


fig, ax = plt.subplots(figsize=(14, 8))


# width=0.5 turns the pie into a donut; autopct prints the share inside each wedge.
wedges, texts, pcttext = ax.pie(
    df.values, 
    wedgeprops=dict(width=0.5), 
    startangle=-40, 
    colors=colors, 
    explode=explode,
    labels=labels,
    textprops={
        'fontsize': 12, 
        'color': colors_dark[1],
        'fontweight': 'bold'
    }, 
    autopct='%1.0f%%',
    pctdistance=0.75, 
    labeldistance=1.2
)

# Percentages are white; the first three wedges get a larger font.
for t in pcttext : 
    t.set_color("white")

for t in pcttext[:3] : 
    t.set_fontsize(18)


# Caption for the legend row placed below the chart.
ax.text(
    x=0, y=-1.5,
    s="Family Size:",
    ha='center',
    color=colors_dark[1],
    fontweight='bold',
    fontsize=12
)

ax.legend(labels, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=6, borderpad=1, frameon=False, fontsize=12)

# plt.title("tes", loc='center', x=0.6, y=1.08, fontsize=14, color=colors_dark[2])
plt.suptitle("Sizes of Family", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("It seems that most people who travels with family, travels with a family of three or less", loc='center', ha='center', x=0.53, y=1, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

# Analysis on Fare

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
fare_mean = df_train.Fare.mean()
fare_median = df_train.Fare.median()


# Filled density curve of Fare. `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(data=df_train, x='Fare', fill=True, color=colors_blue[0], ax=ax, alpha=1)

# Dashed vertical markers for the mean and the median FARE. These must use the fare statistics
# computed above, not the age statistics left over from the age plots.
ax.axvline(x=fare_mean, linestyle='--', color=colors_blue[2])
ax.axvline(x=fare_median, linestyle='--', color=colors_blue[3])

# Callouts for both markers; the y coordinates are hand-picked for this figure.
ax.annotate(
    text="Average\n {:.2f}".format(fare_mean),
    fontsize=12,
    xy=(fare_mean, 0.021),
    xytext=(fare_mean+20, 0.022),
    color=colors_dark[2],
    arrowprops=dict(
        arrowstyle= '-',
        color=colors_dark[2],
        ls='--'
    )
)

ax.annotate(
    text="Median: {:.2f}".format(fare_median),
    fontsize=12,
    xy=(fare_median, 0.02),
    xytext=(fare_median-80, 0.025),
    color=colors_dark[2],
    arrowprops=dict(
        arrowstyle= '-',
        color=colors_dark[2],
        ls='--'
    )
)


ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)
ax.tick_params(labelsize=12)
ax.set_xlabel("Fare", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Density", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)

plt.suptitle("How is The Fare Distributed?", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("It looks like that the ticket price is mostly gather on 10-24, we can also see a little hill on between 70-100", loc='center', x=0.48, y=1.05, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
# Fare band edges (the last band is open-ended) and the label of each band.
bins = [0, 25, 90, 150, np.inf]
labels = ["Ticket 1", "Ticket 2", "Ticket 3", "Ticket 4"]

# Assign every passenger to a fare band.
df_train['Fare Group'] = pd.cut(df_train['Fare'], bins=bins, labels=labels) 

# Analysis on Cabin

In [None]:
# Missing cabins become "No Cabin"; every value is then reduced to its first character,
# so "No Cabin" ends up as "N".
df_train["Cabin"] = df_train["Cabin"].fillna("No Cabin").str[0]

In [None]:
# Readable label for the "N" code in the Cabin column.
cabin_mapping = {
    "N" : "Don't have cabin",
}

# Apply the mapping to the Cabin column only; replace() returns a new frame.
df_train = df_train.replace({"Cabin": cabin_mapping})

In [None]:
def to_pct(row) : 
    """Return ``row`` rescaled so that each entry is its percentage of the row total."""
    return row*100 / row.sum()

# Rows: Cabin, columns: Survived (0 / 1); counts are converted to row-wise percentages.
df = df_train.groupby(['Cabin', 'Survived']).count()['Name'].unstack()
df = df.apply(to_pct, axis=1).reset_index()

# Unweighted mean of the per-cabin survival percentages. Only the numeric column 1 is averaged:
# calling mean() on the whole frame would also include the text 'Cabin' column,
# which newer pandas versions reject.
mean_survive = df[1].mean()

fig, ax = plt.subplots(2, 1, figsize=(12, 12))


# Top panel: raw counts per cabin value, in the same order as the percentage table.
bars0 = sns.countplot(
    data=df_train, 
    x='Cabin', 
    hue='Survived', 
    palette=list([colors_blue[1], colors_dark[-1]]),
    ax=ax[0],
    hue_order=[1, 0],
    order = df.Cabin
)

# Bottom panel: 100% stacked bars, survived share at the bottom and the non-survived share (faded) on top.
bars1 = ax[1].bar(x=df['Cabin'], height=df[1], label='Survived', color=colors_blue[1])
bars2 = ax[1].bar(x=df['Cabin'], height=df[0], bottom=df[1], label="Dead", color=colors_dark[-1], alpha=0.3)

line1 = ax[1].axhline(y=mean_survive, linestyle="--", color=colors_red[-2])

# Print the survival percentage inside each survived bar.
for bar in bars1 : 
    height = bar.get_height() 
    x = bar.get_x() 
    
    ax[1].text(
        x=x+0.4, y=height // 2,
        va='center', ha='center', 
        s=f"{round(height)}%",
        color='white',
        fontweight='bold',
        fontsize=24
    )
    
# Badge for the average line; x is a hand-picked position.
ax[1].text(
    x=8, 
    y=mean_survive,
    s="Average rate {:.2f}%".format(mean_survive),
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=12
)
    
# AX settings
    
ax[0].legend(["Survived", "Not Survived"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[0].grid(axis='y', alpha=0.2)
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_ylabel("Survival Count", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax[0].yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_xlabel(None)

ax[0].spines['right'].set_visible(False)
ax[0].spines['left'].set_visible(False)
ax[0].spines['top'].set_visible(False)
    
ax[1].legend(["Average rate of survival", "Survival Rate (%)"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False, fontsize=12)
ax[1].grid(axis='y', alpha=0.2)
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
ax[1].set_ylabel("Percentage %", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

ax[1].spines['right'].set_visible(False)
ax[1].spines['left'].set_visible(False)
ax[1].spines['top'].set_visible(False)

# The trailing newlines in the suptitle leave room for the commentary placed with fig.text.
fig.text(ha='center', va='center', x=0.5, y=0.935, s="Assuming that the missing value on Cabin is the people who is not on any cabin, most people dont belong in any cabin", fontsize=14, color=colors_dark[2])
plt.suptitle("How is the Survival Rate for Every Cabin?\n\n\n", fontsize=18, fontweight='bold', color=colors_dark[0])

plt.tight_layout()
plt.show()

In [None]:
# Fresh copies of both splits for the modelling pipeline, each without the PassengerId column.
data_train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2021/train.csv"
).drop(columns='PassengerId')
data_test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2021/test.csv"
).drop(columns='PassengerId')

# Imputation

In [None]:
# Missing ages are filled with the median age of the passenger's class; missing fares with the mean fare.
data_train['Age'] = data_train['Age'].fillna(data_train.groupby('Pclass')['Age'].transform('median'))
data_train['Fare'] = data_train['Fare'].fillna(data_train['Fare'].mean())

# NOTE(review): the test split is imputed from its own statistics rather than from the training
# split's — confirm this is intended.
data_test['Age'] = data_test['Age'].fillna(data_test.groupby('Pclass')['Age'].transform('median'))
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].mean())

# Feature Engineering

In [None]:
# Age band edges and the label of each band.
bins = [0, 5, 17, 30, 50, 100]
labels = ["Baby", "Children", "Young Adult", "Middle Aged", "Senior"]

# Same binning applied to both splits.
data_train['Age Group'] = pd.cut(data_train['Age'], bins=bins, labels=labels) 
data_test['Age Group'] = pd.cut(data_test['Age'], bins=bins, labels=labels) 

In [None]:
# Fare band edges (the last band is open-ended) and the label of each band.
bins = [0, 25, 90, 150, np.inf]
labels = ["Ticket 1", "Ticket 2", "Ticket 3", "Ticket 4"]

# Same binning applied to both splits.
data_train['Fare Group'] = pd.cut(data_train['Fare'], bins=bins, labels=labels) 
data_test['Fare Group'] = pd.cut(data_test['Fare'], bins=bins, labels=labels) 

In [None]:
def hasfam(x):
    """Label a row as travelling alone or with family.

    Parameters
    ----------
    x : object exposing a ``Famsize`` attribute (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        "Alone" when ``Famsize`` equals 1, otherwise "With family".
    """
    if x.Famsize == 1:
        return "Alone"
    return "With family"

for frame in (data_train, data_test):
    # Famsize = SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
    frame["Famsize"] = frame["SibSp"] + frame["Parch"] + 1

    # Vectorised equivalent of applying `hasfam` to every row: same labels, but
    # computed column-wise instead of through a slow Python-level row loop.
    frame["Hasfam"] = np.where(frame["Famsize"] != 1, "With family", "Alone")

In [None]:
def onClass3(x):
    """Flag whether a row belongs to passenger class 3.

    Parameters
    ----------
    x : object exposing a ``Pclass`` attribute (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        "Yes" when ``Pclass`` equals 3, otherwise "No".
    """
    if x.Pclass == 3:
        return "Yes"
    return "No"


# Vectorised equivalent of applying `onClass3` to every row: "Yes" for Pclass == 3,
# "No" otherwise, computed column-wise instead of through a Python-level row loop.
for frame in (data_train, data_test):
    frame['On Pclass 3'] = np.where(frame['Pclass'] == 3, "Yes", "No")

In [None]:
# Expand the single-letter Embarked codes into readable names.
# Missing values (NaN) get their own explicit label instead of staying empty.
embark_mapping = {
    "S": "Southampton",
    'C': "Cherbourg",
    "Q": "Queenstown",
    np.nan: "Not Embarked",
}

# The nested dict restricts the replacement to the "Embarked" column only.
data_train, data_test = [
    frame.replace({"Embarked": embark_mapping})
    for frame in (data_train, data_test)
]

In [None]:
# Reduce Cabin to its first character. Missing cabins are first filled with
# "No Cabin", so after slicing they end up as the letter "N".
for frame in (data_train, data_test):
    frame["Cabin"] = frame["Cabin"].fillna("No Cabin").str[0]

In [None]:
# Columns kept for modelling. "Survived" (the target) is deliberately last so the
# test frame, which is selected without it, can reuse the same list.
col_to_use = [
    "Age", "Fare", "Fare Group", "Age Group",
    "Sex", "Pclass", "SibSp", "Parch",
    "Embarked", "Famsize", "On Pclass 3", "Cabin",
    "Survived",
]

# Categorical columns that are one-hot encoded later on.
col_cat = ["Sex", "Embarked", "On Pclass 3", "Cabin"]

data_train = data_train[col_to_use]
data_test = data_test[[name for name in col_to_use if name != "Survived"]]

In [None]:
# One-hot encode the categorical columns of the training frame.
# Prefixes map positionally onto col_cat: s=Sex, e=Embarked, o=On Pclass 3, c=Cabin.
train_dummies = pd.get_dummies(data_train[col_cat], prefix=['s', 'e', 'o', 'c'])

# Swap the raw categorical columns for their indicator columns.
data_train = pd.concat(
    [data_train.drop(columns=["Sex", "Embarked", "On Pclass 3", "Cabin"]), train_dummies],
    axis=1,
)

In [None]:
# One-hot encode the categorical columns of the test frame.
# Prefixes map positionally onto col_cat: s=Sex, e=Embarked, o=On Pclass 3, c=Cabin.
dum = pd.get_dummies(data_test[col_cat], prefix=['s', 'e', 'o', 'c'])

data_test = data_test.drop(["Sex", "Embarked", "On Pclass 3", "Cabin"], axis=1)
data_test = pd.concat([data_test, dum], axis=1)

# get_dummies only creates columns for categories present in the frame it is given,
# so the test frame can end up with fewer/more/differently ordered columns than the
# training frame. Force the exact training feature layout: indicator columns missing
# from test are added as all-zero, and test-only columns (which the model has never
# seen) are dropped.
train_feature_cols = data_train.columns.drop("Survived")
assert not set(col_cat) & set(train_feature_cols), \
    "data_train must be one-hot encoded before aligning data_test to it"
data_test = data_test.reindex(columns=train_feature_cols, fill_value=0)

In [None]:
# Encode "Age Group" as an ordered integer-valued feature.
#
# Two fixes compared with a plain `OrdinalEncoder().fit_transform` on each frame:
#  * The category order is pinned to the order of the pd.cut labels. Left on
#    automatic discovery, the encoder sorts the labels alphabetically, which would
#    put "Young Adult" after "Senior".
#  * The encoder is fitted on the training data only and merely applied to the test
#    data, so a given label always maps to the same code in both frames.
age_group_order = list(data_train["Age Group"].cat.categories)
enc = OrdinalEncoder(categories=[age_group_order])

# ravel(): the encoder returns an (n, 1) array; flatten it into a single column.
data_train["Age Group"] = enc.fit_transform(data_train[["Age Group"]]).ravel()
data_test["Age Group"] = enc.transform(data_test[["Age Group"]]).ravel()

In [None]:
# Encode "Fare Group" as an ordered integer-valued feature.
# The category order is pinned to the order of the pd.cut labels, and the encoder is
# fitted on the training data only; the test data is transformed with that same
# fitted mapping so a given label always gets the same code in both frames.
fare_group_order = list(data_train["Fare Group"].cat.categories)
enc = OrdinalEncoder(categories=[fare_group_order])

# ravel(): the encoder returns an (n, 1) array; flatten it into a single column.
data_train["Fare Group"] = enc.fit_transform(data_train[["Fare Group"]]).ravel()
data_test["Fare Group"] = enc.transform(data_test[["Fare Group"]]).ravel()

In [None]:
# Separate the target column from the feature matrix.
y_data_train = data_train["Survived"]
X_data_train = data_train.drop(columns="Survived")

In [None]:
# Show the engineered feature matrix as the cell output to check the final columns before splitting.
X_data_train

In [None]:
# Hold out 33% of the labelled data for evaluation.
# random_state is fixed so the split, and therefore every metric, confusion matrix
# and importance plot below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_data_train,
    y_data_train,
    test_size=0.33,
    random_state=4211,
)

# Modelling and Feature Importance

In [None]:
# Random forest baseline, evaluated on the hold-out split.
# NOTE(review): a tree limited to 4 leaves can never be more than 3 levels deep, so
# max_leaf_nodes=4 is the binding constraint here and max_depth=7 has no effect.
rfc_params = dict(
    n_estimators=1100,
    criterion='gini',
    max_depth=7,
    max_leaf_nodes=4,
    random_state=4211,
)
clf_rfc = RandomForestClassifier(**rfc_params)

# For a submission run, fit on the full X_data_train / y_data_train instead and
# predict on data_test rather than X_test.
clf_rfc.fit(X_train, y_train)
y_pred = clf_rfc.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the random forest predictions on the hold-out split.
# confusion_matrix puts the true classes on the rows and the predicted classes on the
# columns; the heatmap draws rows along the y axis, so the axes are labelled to match.
conf = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
# fmt='d': the cells are integer counts, print them as plain integers.
g = sns.heatmap(conf, annot=True, fmt='d', cmap=cm, ax=ax, annot_kws={"size": 14})

ax.set_xlabel("Predicted label", fontsize=12, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("True label", fontsize=12, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_title("Random Forest Confusion Matrix", fontsize=14, fontweight='bold', color=colors_dark[0])

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the random forest's feature importances, highest score first.
ranked = sorted(
    zip(X_train.columns, clf_rfc.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
x = tuple(pair[0] for pair in ranked)           # feature names, in ranked order
importance = tuple(pair[1] for pair in ranked)  # matching scores

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(x, importance)

# Same two-tone rule as the coefficient plot: darker for >= 0, lighter for negative bars.
for bar in bars:
    bar.set_color(colors_blue[0] if bar.get_height() >= 0 else colors_blue[2])

ax.set_xticks(x)
ax.set_xticklabels(x, rotation=-45, ha='left')
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)  # keep the grid behind the bars
ax.tick_params(labelsize=12)
ax.set_xlabel("Features", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Score", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

# Only the bottom spine stays visible.
for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

plt.suptitle("Random Forest Feature Importance Score", fontsize=18, fontweight='bold', color=colors_dark[0])
# Subtitle slot intentionally left empty for this chart.
plt.title("", loc='center', x=0.48, y=1.05, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
# Logistic regression baseline, evaluated on the same hold-out split.
lr_params = dict(
    max_iter=1000,
    C=2,
    verbose=1,
    solver='liblinear',
)
clf_lr = LogisticRegression(**lr_params)

# For a submission run, fit on the full X_data_train / y_data_train instead.
clf_lr.fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic regression predictions on the hold-out split.
# confusion_matrix puts the true classes on the rows and the predicted classes on the
# columns; the heatmap draws rows along the y axis, so the axes are labelled to match.
conf = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
# fmt='d': the cells are integer counts, print them as plain integers.
g = sns.heatmap(conf, annot=True, fmt='d', cmap=cm, ax=ax, annot_kws={"size": 14})

ax.set_xlabel("Predicted label", fontsize=12, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("True label", fontsize=12, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_title("Logistic Regression Confusion Matrix", fontsize=14, fontweight='bold', color=colors_dark[0])

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the logistic regression coefficients, one bar per feature, in the
# original column order.
# NOTE(review): the features are not standardised before fitting, so coefficient
# magnitudes of columns on different scales (e.g. Fare vs. a 0/1 indicator) are not
# directly comparable; read the signs with more confidence than the sizes.
feature_names = X_train.columns
importance = clf_lr.coef_[0]  # first (only) row of coefficients

fig, ax = plt.subplots(figsize=(14, 6))
bars = ax.bar(feature_names, importance)

# Darker shade for coefficients >= 0, lighter shade for negative ones.
for bar in bars:
    bar.set_color(colors_blue[0] if bar.get_height() >= 0 else colors_blue[2])

ax.set_xticks(feature_names)
ax.set_xticklabels(feature_names, rotation=-45, ha='left')
ax.grid(axis='y', alpha=0.2)
ax.set_axisbelow(True)  # keep the grid behind the bars
ax.tick_params(labelsize=12)
ax.set_xlabel("Features", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("Score", fontsize=14, labelpad=10, fontweight='bold', color=colors_dark[0])

# Only the bottom spine stays visible.
for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

plt.suptitle("Logistic Regression Feature Importance Score", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("Positive score indicate a feature that predict class 1, whilst negative score indicate a feature that predict class 0", loc='center', x=0.48, y=1.05, fontsize=14, color=colors_dark[2])
plt.tight_layout()
plt.show()

In [None]:
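# Submission template (disabled).
# NOTE: at this point y_pred holds predictions for the hold-out split X_test, not for
# data_test, so it will generally not line up with the rows of test.csv. Re-fit on the
# full training data and predict on data_test before enabling the lines below.
# sub_df = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2021/test.csv")
# sub_df = sub_df[["PassengerId"]]
# sub_df["Survived"] = y_pred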

In [None]:
# sub_df.to_csv("sub.csv", index=False)

# Conclusion So Far

### Dataset
* There are a lot of missing values in the Cabin feature; we might want to engineer it or drop it later

### Gender
* Most people boarding the Titanic are male
* Females have a much higher rate of survival than males
* Regardless of gender, people boarding in Pclass=3 have a low chance of survival, whilst people in Pclass=1 have the highest chance of survival

### Age
* Most people boarding the Titanic are young adults or seniors
* Most people who did not survive are young adults or middle aged
* Children and seniors have a higher rate of survival
* Regardless of age, people boarding in Pclass=3 have a low chance of survival, whilst people in Pclass=1 have the highest chance of survival

### Embarked
* Most people on the Titanic embarked from Southampton
* People who embarked from Cherbourg have a higher rate of survival, whilst people from Southampton have the lowest rate of survival
* People from Cherbourg mostly boarded in Pclass=1, whilst people from Southampton mostly boarded in Pclass=3

### Family 
* Most people on the Titanic travel alone
* People who travel alone have a lower rate of survival
* Most people who travel with family travel in a family of 3 or less


# Note
Hi, thank you for reading my notebook. I hope you enjoyed it and gained a lot of insight 😃.
If you have any criticism or feedback, please feel free to comment!