In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as pex
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import & load data

In [None]:
# Read the competition's train and test CSV files from the read-only input directory.
df_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
# Last expression of the cell: show the first rows of the training frame.
df_train.head()

In [None]:
from cycler import cycler

# Global matplotlib look: higher DPI and no top/right spines on any axes.
mpl.rcParams.update({
    'figure.dpi': 120,
    'axes.spines.top': False,
    'axes.spines.right': False,
})
# mpl.rcParams['font.family'] = 'serif'

# Base colours as 0-255 RGB triples; the position in the list is what the
# rest of the notebook indexes into (e.g. light_palette[2] is green).
raw_light_palette = [
    (0, 122, 255),   # Blue
    (255, 149, 0),   # Orange
    (52, 199, 89),   # Green
    (255, 59, 48),   # Red
    (175, 82, 222),  # Purple
    (255, 45, 85),   # Pink
    (88, 86, 214),   # Indigo
    (90, 200, 250),  # Teal
    (255, 204, 0),   # Yellow
]

# Scale to the 0-1 range matplotlib expects and use it as the default colour cycle.
light_palette = np.array(raw_light_palette)/255
mpl.rcParams['axes.prop_cycle'] = cycler('color', light_palette)

# Two-colour palettes reused by the plots below.
survived_palette = ['#dddddd', light_palette[2]]   # grey, green
sex_palette = [light_palette[0], light_palette[3]]  # blue, red

In [None]:
from datetime import datetime


# Major dates, drawn as large ringed markers at the x positions in tl_x.
tl_dates = [
    "WED April 10",
    "SUN April 14",
    "MON April 15",
    "THU April 18"
]

tl_x = [1, 2, 6, 9]

# Individual events: x position, time label and description are parallel lists.
tl_sub_x = [1.5, 2.4, 2.9, 3.4, 3.8, 4.5, 5.0, 6.5, 7, 7.6, 8]
tl_sub_times = [
    "1:30 PM",
    "9:00 AM",
    "1:42 PM",
    "7:15 PM",
    "10:00 PM",
    "11:30 PM",
    "11:40 PM",
    "12:20 AM",
    "12:45 AM",
    "2:00 AM",
    "2:20 AM",
]

# Spelling of the displayed labels corrected (Receive, Lightoller, relieved).
tl_text = [
    "Titanic sets sail.",
    "Receive Message.",
    "Baltic Warns Titanic\nof icebergs.", 
    "Smith requests the\n return of the message.",
    "Second Officer\n Lightoller is\n relieved from duty.",
    "Warning bells, iceberg\n sighting.",
    "Titanic hits an iceberg.",
    "Life boats are being\n lowered.",
    "Passengers slowly arrive\n on deck.",
    "Rear of boat begins to\n raise.",
    "Titanic sinks."
]

# Set figure & Axes
fig, ax = plt.subplots(figsize=(15, 5), constrained_layout=True)
ax.set_ylim(-2, 2)
ax.set_xlim(0, 10)


# Timeline : line
ax.axhline(0, xmin=0.1, xmax=0.95, c='#4a4a4a', zorder=1)
# Timeline : Date Points (dark disc with a smaller light disc on top gives a ring)
ax.scatter(tl_x, np.zeros(len(tl_x)), s=120, c='#4a4a4a', zorder=2)
ax.scatter(tl_x, np.zeros(len(tl_x)), s=30, c='#fafafa', zorder=3)
# Timeline : Time Points
ax.scatter(tl_sub_x, np.zeros(len(tl_sub_x)), s=50, c='#4a4a4a',zorder=4)

# Date Text
for x, date in zip(tl_x, tl_dates):
    ax.text(x, -0.2, date, ha='center', 
            fontfamily='serif', fontweight='bold',
            color='#4a4a4a')
    

# Stemplot : vertical line, alternating above / below the axis so labels do not collide
levels = np.zeros(len(tl_sub_x))    
levels[::2] = 0.3
levels[1::2] = -0.3
# `use_line_collection=True` is no longer passed: it has been the default since
# Matplotlib 3.3 and the keyword was removed in 3.8, where passing it raises.
markerline, stemline, baseline = ax.stem(tl_sub_x, levels)
plt.setp(baseline, zorder=0)
plt.setp(markerline, marker=',', color='#4a4a4a')
plt.setp(stemline, color='#4a4a4a')

# Text: idx is 1-based so odd events sit above the line and even ones below;
# the final event ("Titanic sinks.") is highlighted in red.
last_idx = len(tl_sub_x)
for idx, x, time, txt in zip(range(1, last_idx+1), tl_sub_x, tl_sub_times, tl_text):
    text_color = '#4a4a4a' if idx != last_idx else '#e3120b'
    ax.text(x, 1.3*(idx%2)-0.5, time, ha='center', 
            fontfamily='serif', fontweight='bold',
            color=text_color, fontsize=11)
    
    ax.text(x, 1.3*(idx%2)-0.6, txt, va='top', ha='center', 
        fontfamily='serif',color=text_color)

# Spine
for spine in ["left", "top", "right", "bottom"]:
    ax.spines[spine].set_visible(False)

# Ticks    
ax.set_xticks([]) 
ax.set_yticks([]) 

# Title
ax.set_title("Titanic Timeline", fontweight="bold", fontfamily='serif', fontsize=16, color='#4a4a4a')

plt.show()

# EDA

In [None]:
# First check the training set for missing values: any NaN has to be filled in
# (or the row dropped) before modelling.
for col in df_train.columns:
    nan_pct = 100 * (df_train[col].isnull().sum() / df_train[col].shape[0])
    # The separator is a real tab; the original literal contained the mis-encoded
    # sequence "₩t" (won sign + t), which printed verbatim instead of a tab.
    msg = 'column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, nan_pct)
    print(msg)

In [None]:
import missingno as msno  # visualises missing (null) data in a DataFrame

# .iloc[:, :] selects every row and every column, so the whole training frame is plotted.
# Blank gaps inside a column of the matrix are the null entries.
msno.matrix(df=df_train.iloc[:, :],figsize=(8,8),color=(0.8,0.5,0.2))

In [None]:
# Another way to look for null data: a bar chart per column over the same full training frame.
msno.bar(df=df_train.iloc[:, :],figsize=(8,8),color=(0.8,0.5,0.2))

In [None]:
f, ax = plt.subplots(1,2,figsize=(8,6))  

# explode  = offset of each wedge from the centre (second wedge pulled out by 0.1)
# autopct  = format used to print the percentage on each wedge
# ax[0], ax[1] = which of the two side-by-side axes each plot is drawn on

df_train['Survived'].value_counts().plot.pie(explode=[0,0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Pie plot - Survived')
ax[0].set_ylabel('') # ylabel = blank
# The column is passed as x=...: seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x='Survived', data=df_train, ax=ax[1])  # count of each Survived value in df_train
ax[1].set_title('Count plot - Survived')
plt.show()

# The result shows that the target is fairly balanced.

In [None]:
fig = plt.figure(figsize=(10, 6))

gs = fig.add_gridspec(3, 4)

# Work on a copy: later EDA cells add helper columns (e.g. the string 'Age band')
# to `train`. With a plain alias those columns would also appear in df_train and
# leak into every modelling cell that builds its features from df_train.
train = df_train.copy()
survived_palette = ['#dddddd', light_palette[2]]
sex_palette = [light_palette[0], light_palette[3]]

# Top-left: passengers per sex, split by survival.
ax_sex_survived = fig.add_subplot(gs[:2,:2])
sns.countplot(x='Sex',hue='Survived', data=train, ax=ax_sex_survived, 
              palette=survived_palette)

# Top-right: passengers per survival outcome, split by sex (shares the y axis).
ax_survived_sex = fig.add_subplot(gs[:2,2:4], sharey=ax_sex_survived)
sns.countplot(x='Survived',hue='Sex', data=train, ax=ax_survived_sex,
              palette=sex_palette
             )
# ax_survived_sex.set_yticks([])
ax_survived_sex.set_ylabel('')

# Bottom row: one pie per condition.
ax_pie_male = fig.add_subplot(gs[2, 0])
ax_pie_female = fig.add_subplot(gs[2, 1])
ax_pie_notsurvived = fig.add_subplot(gs[2, 2])
ax_pie_survived = fig.add_subplot(gs[2, 3])

# Sex: survival split within each sex (sort_index keeps 0 before 1 so colours line up).
male = train[train['Sex']=='male']['Survived'].value_counts().sort_index()
ax_pie_male.pie(male, labels=male.index, autopct='%1.1f%%',explode = (0, 0.1), startangle=90,
               colors=survived_palette
               )

female = train[train['Sex']=='female']['Survived'].value_counts().sort_index()
ax_pie_female.pie(female, labels=female.index, autopct='%1.1f%%',explode = (0, 0.1), startangle=90,
                colors=survived_palette
                 )

# Survived: sex split within each outcome (explicit order male, female to match sex_palette).
notsurvived = train[train['Survived']==0]['Sex'].value_counts()[['male', 'female']]
ax_pie_notsurvived.pie(notsurvived, labels=notsurvived.index, autopct='%1.1f%%',startangle=90,
                      colors=sex_palette, textprops={'color':"w"}
                      )

survived = train[train['Survived']==1]['Sex'].value_counts()[['male', 'female']]
ax_pie_survived.pie(survived, labels=survived.index, autopct='%1.1f%%', startangle=90,
                    colors=sex_palette, textprops={'color':"w"}
                   )

fig.suptitle('[Sex & Survived] Conditional Distribution', fontweight='bold', fontsize=20)
fig.text(s='''Gender and survival are the most important features of the existing Titanic problem.\nLook at each conditional probability and think of the minimum score''', 
         x=0.5, y= 0.94, ha='center', va='top')

plt.show()


In [None]:
# Number of passengers (non-null Survived values) in each ticket class.
df_train.groupby('Pclass')[['Survived']].count()


In [None]:
# Pclass x Survived contingency table with row/column totals, shaded by value.
pclass_survival_table = pd.crosstab(df_train['Pclass'], df_train['Survived'], margins=True)
pclass_survival_table.style.background_gradient(cmap='cool')

In [None]:
f, ax = plt.subplots(1,2,figsize=(10,5))
# Left: mean of Survived (i.e. survival rate) per sex.
df_train[['Sex','Survived']].groupby(['Sex'],as_index=True).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Right: raw counts per sex split by outcome. The column is passed as x=...
# because seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Sex', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('Sex: Survived vs Dead')
plt.show()


In [None]:
# Sex x Survived contingency table with row/column totals, shaded by value.
sex_survival_table = pd.crosstab(df_train['Sex'], df_train['Survived'], margins=True)
sex_survival_table.style.background_gradient(cmap='summer_r')

In [None]:
# Kernel density estimate of Age, one curve per survival outcome on the same axes.
fig, ax = plt.subplots(1,1,figsize=(9,5))
for outcome in (1, 0):
    ages_for_outcome = df_train[df_train['Survived']==outcome]['Age']
    sns.kdeplot(ages_for_outcome,ax=ax)
# Legend entries follow the drawing order above: survivors first, then non-survivors.
plt.legend(['Survived == 1', 'Survived == 0'])
plt.show()

In [None]:
# Age values of the passengers who did not / did survive.
Survived0 = train[train['Survived'] == 0]['Age']
Survived1 = train[train['Survived'] == 1]['Age']

# One horizontal half-violin per outcome.
fig = go.Figure()
violin_specs = [
    (Survived0, 'salmon', 'Survived0'),
    (Survived1, 'gold', 'Survived1'),
]
for ages, line_colour, trace_name in violin_specs:
    fig.add_trace(go.Violin(x=ages, line_color=line_colour, name=trace_name))

fig.update_traces(orientation='h', side='positive', width=3, points=False, meanline_visible=True)

# Axis options and the dark theme are applied in a single layout update.
fig.update_layout(
    xaxis_showgrid=True,
    xaxis_zeroline=False,
    title='Survival-Age distn.',
    xaxis_title='Age',
    width=750,
    template="plotly_dark",
    showlegend=False,
    paper_bgcolor="black",
    font=dict(color='white'),
)
fig.show()


In [None]:
def age_band(num):
    """Return the ten-year band label for ``num``, e.g. ``'20 ~ 30'`` for 25.

    The label uses the first multiple of ten in 10..990 that is strictly
    greater than ``num``, so anything below 10 (including negative numbers)
    maps to ``'0 ~ 10'``. ``None`` is returned when no bound qualifies,
    which is the case for NaN and for values of 990 or more.
    """
    upper = next((10 * i for i in range(1, 100) if num < 10 * i), None)
    if upper is None:
        return None
    return f'{upper - 10} ~ {upper}'

# Label every passenger with a ten-year age band (None when Age is missing).
train['Age band'] = train['Age'].apply(age_band)
# Survived counts per band -> one column per outcome (0 / 1), missing combinations as 0.
titanic_age = train[['Age band', 'Survived']].groupby('Age band')['Survived'].value_counts().sort_index().unstack().fillna(0)
titanic_age['Survival rate'] = titanic_age[1] / (titanic_age[0] + titanic_age[1]) * 100
# Passengers per band. Stored under its own name: the original reused `age_band`,
# which replaced the function with a Series and made the `.apply(age_band)` line
# above fail whenever it was executed again without re-running the `def`.
age_band_counts = train['Age band'].value_counts().sort_index()


from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

fig = plt.figure(figsize=(10, 7))
gs = fig.add_gridspec(3, 4)
ax = fig.add_subplot(gs[:-1,:])

# Grey bars everywhere except two highlighted bands (third in red, ninth in green).
color_map = ['#d4dddd' for _ in range(9)]
color_map[2] = light_palette[3]
color_map[8] = light_palette[2]


bars = ax.bar(titanic_age['Survival rate'].index, titanic_age['Survival rate'], 
       color=color_map, width=0.55, 
       edgecolor='black', 
       linewidth=0.7)

ax.spines[["top","right","left"]].set_visible(False)
ax.bar_label(bars, fmt='%.2f%%')


# mean line + annotation (`mean` is also read by later cells)
mean = train['Survived'].mean() *100
ax.axhline(mean ,color='black', linewidth=0.4, linestyle='dashdot')
ax.annotate(f"mean : {mean :.4}%", 
            xy=('20 ~ 30', mean + 4),
            va = 'center', ha='center',
            color='#4a4a4a',
            bbox=dict(boxstyle='round', pad=0.4, facecolor='#efe8d1', linewidth=0))
    


ax.set_yticks(np.arange(0, 81, 20))
ax.grid(axis='y', linestyle='-', alpha=0.4)
ax.set_ylim(0, 85)


# Bottom panel: how many passengers fall in each band.
ax_bottom = fig.add_subplot(gs[-1,:])
bars = ax_bottom.bar(age_band_counts.index, age_band_counts, width=0.55, 
       edgecolor='black', 
       linewidth=0.7)

ax_bottom.spines[["top","right","left"]].set_visible(False)
ax_bottom.bar_label(bars, fmt='%d', label_type='center', color='white')
ax_bottom.grid(axis='y', linestyle='-', alpha=0.4)

# Title & Subtitle    
fig.text(0.1, 1, 'Age Band & Survival Rate', fontsize=15, fontweight='bold', fontfamily='serif', ha='left')
fig.text(0.1, 0.96, 'Unlike before, the survival rate of infants and toddlers is very low.', fontsize=12, fontweight='light', fontfamily='serif', ha='left')

plt.show()


In [None]:
# Survival rate (in percent) for each SibSp and each Parch value.
sibsp = train.groupby('SibSp')['Survived'].mean().sort_index()*100
parch = train.groupby('Parch')['Survived'].mean().sort_index()*100

fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)

# Per panel: a grey full-width (100%) background bar, then the green rate bar with its label.
rate_bars = []
for panel, rates in zip(axes, (sibsp, parch)):
    panel.barh(width=100, y=rates.index, color='#dedede')
    drawn = panel.barh(width=rates, y=rates.index, color=light_palette[2])
    panel.bar_label(drawn, fmt='%.02f%%', padding=2)
    rate_bars.append(drawn)
hbar1, hbar2 = rate_bars

# The y axis is shared, so ticks and orientation are set once on the first panel.
axes[0].set_yticks(range(0, max(parch.index)+1))
axes[0].invert_yaxis()

# `mean` is the overall survival percentage computed in the age-band cell above.
for ax in axes:
    ax.set_xticks([])
    ax.spines[['bottom', 'left']].set_visible(False)
    ax.axvline(mean ,color='black', linewidth=0.4, linestyle='dashdot')
    ax.annotate(f"mean : {mean :.4}%", 
            xy=(mean + 4, 7),
            va = 'center', ha='left',
            color='#4a4a4a',
            bbox=dict(boxstyle='round', pad=0.4, facecolor='#efe8d1', linewidth=0))

for panel, heading in zip(axes, ('SibSp Survived Ratio', 'Parch Survived Ratio')):
    panel.set_title(heading, zorder=0)

plt.show()

In [None]:
# Age distribution within each ticket class, as overlaid histograms on one figure.
plt.figure(figsize=(8,4))
for pclass in (1, 2, 3):
    df_train['Age'][df_train['Pclass']==pclass].plot(kind='hist')

plt.xlabel('Age')
plt.title('Age Distribution within classes')
# Legend entries follow the drawing order of the loop above.
plt.legend(['1st Class', '2nd Class', '3rd Class'])

In [None]:
fig = plt.figure(figsize=(8, 5))
gs = fig.add_gridspec(3,1)
# Negative spacing makes the three rows overlap, giving the ridge-plot effect.
gs.update(hspace= -0.55)

axes = []
colors = light_palette[-3:]
pclass_values = sorted(train['Pclass'].unique())

# One density curve per class; only the three palette colours bound the loop.
for idx, (cls, c) in enumerate(zip(pclass_values, colors)):
    ridge_ax = fig.add_subplot(gs[idx, 0])
    axes.append(ridge_ax)

    # you can also draw density plot with matplotlib + scipy.
    sns.kdeplot(x='Age', data=train[train['Pclass']==cls],
                fill=True, ax=ridge_ax, cut=0, bw_method=0.20,
                lw=1.4, edgecolor='lightgray',color=c, alpha=1)

    ridge_ax.set_ylim(0, 0.035)
    ridge_ax.set_xlim(0, 85)

    # Strip ticks and labels; only the bottom row keeps its x ticks.
    ridge_ax.set_yticks([])
    if idx != 2:
        ridge_ax.set_xticks([])
    ridge_ax.set_ylabel('')
    ridge_ax.set_xlabel('')

    ridge_ax.spines[["top","right","left","bottom"]].set_visible(False)

    # Transparent background so the overlapping rows stay visible.
    ridge_ax.patch.set_alpha(0)
    ridge_ax.text(-0.2,0,f'Pclass {cls}',fontweight="light", fontfamily='serif', fontsize=11,ha="right")

fig.text(0.13,0.81,"Age distribution by Pclass in Titanic", fontweight="bold", fontfamily='serif', fontsize=16)
plt.show()


In [None]:
cummulate_survival_ratio = []
# Survival rate of everyone younger than x, for x = 1..79, to show the trend with age.
age_limits = range(1, 80)

for i in age_limits:
    survived_below = df_train[df_train['Age'] < i]['Survived']
    # .mean() equals sum/len for the 0/1 target and yields NaN without a warning
    # when nobody is younger than i.
    cummulate_survival_ratio.append(survived_below.mean())

plt.figure(figsize=(7, 7))
# Plot against the actual age thresholds. The original plotted the bare list, so
# the x axis showed list indices 0..78 and was shifted by one from "Age < x".
plt.plot(age_limits, cummulate_survival_ratio)
plt.title('Survival rate change depending on range of Age', y=1.02)
plt.ylabel('Survival rate')
plt.xlabel('Range of Age(0~x)')
plt.show()

In [None]:
f, ax = plt.subplots(1,2,figsize=(18,8))
# Split violins: Age distribution per group, one half per survival outcome.
# x / y are passed by keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=df_train, scale='count', split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0,110,10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=df_train, scale='count',split=True, ax=ax[1])
ax[1].set_title('sex and Age vs Survived')
ax[1].set_yticks(range(0,110,10))
plt.show()

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(8 , 5), sharex=True)

# One heatmap per feature: survival rate (%) for each feature value x age band,
# with the colour scale centred on the overall mean computed in an earlier cell.
for idx, feature in enumerate(['Pclass', 'Embarked']):
    survival_pct = train.groupby([feature, 'Age band'])['Survived'].aggregate('mean').unstack()*100
    diverging_cmap = sns.diverging_palette(240, 10, as_cmap=True)
    sns.heatmap(
        survival_pct,
        ax=axes[idx],
        square=True,
        annot=True,
        fmt='.2f',
        center=mean,
        linewidth=2,
        cbar=False,
        cmap=diverging_cmap,
    )

plt.show()

In [None]:
# SibSp and Parch are both head counts, so they can be added into one feature.
# +1 counts the passenger themselves.
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
# Fixed: the test feature must use the test set's own Parch column. The original
# added df_train['Parch'], mixing rows of two unrelated frames by index.
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

f,ax=plt.subplots(1, 3, figsize=(40,10))
# Columns are passed as x=...: seaborn >= 0.12 no longer accepts them positionally.
sns.countplot(x='FamilySize', data=df_train, ax=ax[0])
ax[0].set_title('(1) No. Of Passengers Boarded', y=1.02)

sns.countplot(x='FamilySize', hue='Survived', data=df_train, ax=ax[1])
ax[1].set_title('(2) Survived countplot depending on FamilySize',  y=1.02)

# Survival rate per family size, highest first.
df_train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=True).mean().sort_values(by='Survived', ascending=False).plot.bar(ax=ax[2])
ax[2].set_title('(3) Survived rate depending on FamilySize',  y=1.02)

plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()

# Feature importance

## Boruta

In [None]:
# Display the training frame as it stands after the EDA cells above.
df_train

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

### initialize Boruta
forest = RandomForestRegressor(
   n_jobs = -1, 
   max_depth = 5
)
boruta = BorutaPy(
   estimator = forest, 
   n_estimators = 'auto',
   max_iter = 100, # number of trials to perform
   verbose=1
)
print(df_train.isnull().sum(axis = 0))

# Keep only model features. 'Age band' is a string helper column that the EDA
# cells may have added; errors='ignore' makes the drop work whether or not it exists.
df_boruta = df_train.drop(['Name', 'PassengerId', 'Ticket', 'Cabin', 'Age band'],
                          axis=1, errors='ignore')
# Drop incomplete rows BEFORE encoding. Encoding first (as the original did) either
# turns a missing Embarked into an extra category that dropna can no longer see, or
# fails on the mixed str/NaN column, depending on the scikit-learn version.
df_boruta = df_boruta.dropna().copy()

# integer encode the two string columns
label_encoder = LabelEncoder()
df_boruta.Sex = label_encoder.fit_transform(df_boruta.Sex)
df_boruta.Embarked = label_encoder.fit_transform(df_boruta.Embarked)

print(df_boruta.head())

### fit Boruta (it accepts np.array, not pd.DataFrame)
boruta_features = df_boruta.drop(['Survived'], axis=1)
boruta.fit(np.array(boruta_features), np.array(df_boruta.Survived))
### print results
green_area = boruta_features.columns[boruta.support_].to_list()
blue_area = boruta_features.columns[boruta.support_weak_].to_list()
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

In [None]:
### store feature importances
# The fitted forest's importances are split at the number of real feature columns:
# the leading slice is kept for the real features, the remainder for the shadows.
n_real_features = len(df_boruta.drop(['Survived'], axis=1).columns)
all_importances = forest.feature_importances_
feat_imp_X = all_importances[:n_real_features]
feat_imp_shadow = forest.feature_importances_[n_real_features:]
### compute hits: real features that beat the strongest shadow feature
hits = feat_imp_X > feat_imp_shadow.max()
feat_imp_X

In [None]:
# Collect the stored importance of every feature that Boruta confirmed
# (positions where boruta.support_ is True) and print them.
# NOTE(review): assumes positions in feat_imp_X line up with boruta.support_ — confirm.
final_features = list()
indexes = np.where(boruta.support_ == True)
for x in np.nditer(indexes):
    final_features.append(feat_imp_X[x])
print(final_features)

## SHAP

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keep only model features. 'Age band' is a string helper column that the EDA
# cells may have added; errors='ignore' makes the drop work whether or not it exists.
df_shap = df_train.drop(['Name', 'PassengerId', 'Ticket', 'Cabin', 'Age band'],
                        axis=1, errors='ignore')
# Drop incomplete rows BEFORE encoding, so a missing Embarked is removed instead of
# being encoded as an extra category (or breaking the encoder on mixed str/NaN input).
df_shap = df_shap.dropna().copy()

# integer encode the two string columns
label_encoder = LabelEncoder()
df_shap.Sex = label_encoder.fit_transform(df_shap.Sex)
df_shap.Embarked = label_encoder.fit_transform(df_shap.Embarked)
print(df_shap.head(10))

# NOTE(review): test_size=0.02 holds out only 2% of the rows, while the LIME cell
# uses 0.2 — confirm which is intended.
x_train, x_test, y_train, y_test = train_test_split(df_shap.drop(['Survived'], axis=1), df_shap.Survived, test_size=0.02, random_state=0)

from sklearn.linear_model import LogisticRegression

# Baseline linear model explained by SHAP in the next cells; the cell displays
# its accuracy on the held-out split.
LR = LogisticRegression()
LR.fit(x_train, y_train)
LR.score(x_test, y_test)

In [None]:
import shap
# Explain the logistic regression, using the training split as background data.
explainer = shap.LinearExplainer(LR, x_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(x_test)
# Per-row SHAP values for the held-out rows.
shap.summary_plot(shap_values, x_test)
# Mean |SHAP| per feature. The feature matrix must be the one the SHAP values were
# computed for (x_test); the original paired them with x_train, whose rows differ.
shap.summary_plot(shap_values, x_test, plot_type="bar")

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()
# Force plot for a single held-out row (positional index 3), shown on the logit link.
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test.iloc[3,:], link="logit")

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()
# Force plot over every held-out row at once, shown on the logit link.
shap.force_plot(explainer.expected_value, shap_values, x_test, link="logit")

## Cramér's V

In [None]:
from scipy import stats

## Adapted from https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical variables.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series) of equal length
        Categorical observations; they are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        A value in [0, 1]; 0 means no measurable association. Degenerate input
        (fewer than two observations, or a variable with a single category, for
        which the statistic is 0/0) returns 0.0 instead of NaN or an exception.
    """
    confusion_matrix = pd.crosstab(x,y)
    n = confusion_matrix.sum().sum()
    # With no table or a single observation the (n - 1) corrections are undefined.
    if confusion_matrix.size == 0 or n < 2:
        return 0.0
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    denom = min((kcorr-1),(rcorr-1))
    # A single-category variable makes the denominator zero.
    if denom <= 0:
        return 0.0
    return np.sqrt(phi2corr/denom)

# Categorical features whose association with the target is reported below.
categoricals = ['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
print("Cramer's -V categorival features correlation with Survival ")
print('**********************************************************')
for cats in categoricals:
    association = cramers_v(df_train[cats], df_train['Survived'])
    print('Correlation between {} and survival is {:.2f}'.format(cats, association))

In [None]:
train=df_train
# Age values for every (Pclass, Survived) combination.
PClass1_1 = train[(train['Pclass'] == 1) &(train['Survived'] == 1)]['Age']
PClass1_0 = train[(train['Pclass'] == 1) &(train['Survived'] == 0)]['Age']
PClass2_1 = train[(train['Pclass'] == 2) &(train['Survived'] == 1)]['Age']
PClass2_0 = train[(train['Pclass'] == 2) &(train['Survived'] == 0)]['Age']
PClass3_1 = train[(train['Pclass'] == 3) &(train['Survived'] == 1)]['Age']
PClass3_0 = train[(train['Pclass'] == 3) &(train['Survived'] == 0)]['Age']


fig = go.Figure()

# One horizontal half-violin per combination; trace names end in [1] survived / [0] not.
fig.add_trace(go.Violin(x=PClass1_1, line_color='salmon', name='PClass1_[1]', ))
fig.add_trace(go.Violin(x=PClass1_0, line_color='lightsalmon', name= 'PClass1_[0]', ))
fig.add_trace(go.Violin(x=PClass2_1, line_color='seagreen', name='PClass2_[1]', ))
fig.add_trace(go.Violin(x=PClass2_0, line_color='lightseagreen', name='PClass2_[0]', ))
fig.add_trace(go.Violin(x=PClass3_1, line_color='gold', name= 'PClass3_[1]', ))
fig.add_trace(go.Violin(x=PClass3_0, line_color='silver', name='PClass3_[0]', ))

fig.update_traces(orientation='h', side='positive', width=3,
                  bandwidth = None, points=False, meanline_visible=True, scalemode='count')
fig.update_layout(xaxis_showgrid=True, xaxis_zeroline=False)

# `title_font` replaces the deprecated `titlefont` layout attribute, which recent
# plotly releases no longer accept.
fig.update_layout(title='Pclass-Age Survival',
                  font_family="San Serif",
                  xaxis_title='Age',
                  width=600,height=400,
                  template="plotly_dark",
                  showlegend=False,
                  title_font={'size': 24},
                  paper_bgcolor="black",
                  font=dict(
                      color ='white',
                      )
                  )
fig.show()

In [None]:
train=df_train
# Passenger count for every (Survived, Sex, Pclass) combination present in the data.
suv = train.groupby(['Survived', 'Sex', 'Pclass']).size().reset_index(name='count')
# Relabel by value, not by row position. The original overwrote rows 0-5 and 6+
# via iloc, which silently mislabels rows whenever a combination is missing and
# writes strings into integer columns (rejected by recent pandas versions).
suv['Survived'] = suv['Survived'].map({0: 'Not survived', 1: 'Survived'})
suv['Pclass'] = suv['Pclass'].astype(str) + ' class'

# Rings from the centre outwards: outcome -> sex -> ticket class.
fig = px.sunburst(suv, path = ['Survived', 'Sex', 'Pclass'], values = 'count', color = 'Survived',
                 color_discrete_map = {'Not survived': '#A01D26', 'Survived': '#ACBEBE'},
                 width = 700, height = 700)

fig.update_layout(annotations = [dict(text = 'Distribution of male and female survival rates by class', 
                                      x = 0.5, y = 1.1, font_size = 24, showarrow = False, 
                                      font_family = 'Calibri Black',
                                      font_color = 'black')])

fig.update_traces(textinfo = 'label + percent parent')
                  
fig.show()

## LIME

In [None]:
import lime
import lime.lime_tabular

# Keep only model features. 'Age band' is a string helper column that the EDA
# cells may have added; errors='ignore' makes the drop work whether or not it exists.
df_lime = df_train.drop(['Name', 'PassengerId', 'Ticket', 'Cabin', 'Age band'],
                        axis=1, errors='ignore')
# Drop incomplete rows BEFORE encoding, so a missing Embarked is removed instead of
# being encoded as an extra category.
df_lime = df_lime.dropna().copy()

# integer encode the two string columns
label_encoder = LabelEncoder()
df_lime.Sex = label_encoder.fit_transform(df_lime.Sex)
df_lime.Embarked = label_encoder.fit_transform(df_lime.Embarked)


# Note: this reassigns x_train / x_test / y_train / y_test from the SHAP section.
x_train, x_test, y_train, y_test = train_test_split(df_lime.drop(['Survived'], axis=1), df_lime.Survived, test_size=0.2, random_state=0)


def predict_fn(x):
    """Return an (n_samples, 2) array of [P(not survived), P(survived)] for LIME.

    NOTE(review): `clf` is not defined anywhere in the visible notebook. The call
    signature (`num_iteration`, `best_iteration`) looks like a LightGBM-style
    booster whose predict() returns the positive-class probability — a model
    must be trained and bound to `clf` before this cell can run.
    """
    preds = clf.predict(x, num_iteration=clf.best_iteration).reshape(-1,1)
    p0 = 1 - preds
    return np.hstack((p0, preds))


# Feature names must describe the columns of x_train. The original passed
# df_lime.columns, which still contains the target 'Survived' and is one entry
# longer than the feature matrix.
explainerLime = lime.lime_tabular.LimeTabularExplainer(
    x_train.values,
    mode='classification',
    feature_names=x_train.columns.tolist(),
   class_names=["NotSurvived", "Survived"],
   verbose=True
    )

np.random.seed(1)
i = 0
# Explain the i-th training row. The original indexed x_train with df_lime.columns,
# which raises KeyError because x_train has no 'Survived' column.
exp = explainerLime.explain_instance(x_train.values[i], predict_fn, num_features=10)
exp.show_in_notebook(show_all=True)

# Predictions

## mljar

In [None]:
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master
from supervised.automl import AutoML # mljar-supervised


In [None]:
train=df_train
# Features are every column after the first two (PassengerId, Survived), restricted
# to columns that also exist in df_test: helper columns added to the training frame
# during EDA would otherwise make prediction on the test set fail.
x_cols = [col for col in train.columns[2:] if col in df_test.columns]
y_col = train.columns[1]

automl = AutoML(
    mode="Compete", 
    eval_metric="auc",
    total_time_limit=2*360,   # seconds (12 minutes)
#    total_time_limit=2*3600,
    features_selection=False # switch off feature selection
)
automl.fit(train[x_cols], train[y_col])

In [None]:
# Show the report of the AutoML run fitted in the previous cell.
automl.report()

In [None]:
# Predict on the competition test set. The original referenced an undefined name
# `test`; the frame loaded at the top of the notebook is `df_test`.
preds = automl.predict(df_test[x_cols])
# The first column of the test frame is used as the PassengerId of the submission.
submission = pd.DataFrame({'PassengerId': df_test.iloc[:,0], 'Survived': preds})
submission.to_csv('1_submission.csv', index=False)


## Naive Bayes Classifier

Notice we have the Name of each passenger. We won't use that feature for our classifier because it is not significant for our problem. We'll also get rid of the Fare feature because it is continuous and our features need to be discrete.

There are Naive Bayes Classifiers that support continuous features. For example, the Gaussian Naive Bayes Classifier.

In [None]:
# Columns the hand-written Naive Bayes classifier does not use.
nbc_unused_columns = ['Name', 'Fare', 'PassengerId', 'Ticket', 'Cabin']

# Both frames are built from df_train, so the "test" frame holds the same rows as
# the training frame and the accuracy measured on it is a training accuracy.
df_nbc_train = df_train.drop(nbc_unused_columns, axis=1)
df_nbc_test = df_train.drop(nbc_unused_columns, axis=1)
#print(len(df_nbc_train.drop(['Survived'], axis=1)[0]))
# Feature values of the first training row, as a quick sanity check.
first_feature_row = df_nbc_train.drop(['Survived'], axis=1).values[0]
print(first_feature_row)

df_nbc_train.head()


In [None]:
class NaiveBayesClassifier:
    """Categorical Naive Bayes classifier built from plain occurrence counts.

    ``classify`` scores each class y with ``P(y) * prod_i P(X_i = x_i | y)`` and
    returns the class with the highest score. No smoothing is applied: a value
    never seen together with a class gives that class a score of 0.
    Feature values must be hashable, because they are used as dictionary keys.
    """

    def __init__(self, X, y):

        '''
        X and y denotes the features and the target labels respectively

        X is indexed as X[i][j] (row i, feature j) and y as y[i].
        '''
        self.X, self.y = X, y

        self.N = len(self.X) # Length of the training set

        self.dim = len(self.X[0]) if self.N else 0 # Dimension of the vector of features

        self.attrs = [[] for _ in range(self.dim)] # Distinct values seen per column, in order of first appearance

        self.output_dom = {} # Output classes with the number of occurrences in the training set

        self.data = [] # To store every row [Xi, yi]

        # (feature index, value, label) -> number of training rows with that
        # value and label. Lets classify() look counts up instead of rescanning
        # every training row for every feature of every prediction.
        self._joint_counts = {}

        # Per-column sets that mirror self.attrs for constant-time membership tests.
        seen_values = [set() for _ in range(self.dim)]

        for i in range(self.N):
            row, label = self.X[i], self.y[i]
            for j in range(self.dim):
                value = row[j]
                # first time this value shows up in column j: remember it
                if value not in seen_values[j]:
                    seen_values[j].add(value)
                    self.attrs[j].append(value)
                # NaN never compares equal to anything, so it can never match an
                # entry at prediction time; leave it out of the counts.
                if value != value:
                    continue
                key = (j, value, label)
                self._joint_counts[key] = self._joint_counts.get(key, 0) + 1

            # count one more occurrence of this output class
            self.output_dom[label] = self.output_dom.get(label, 0) + 1
            # store the row
            self.data.append([row, label])

    def classify(self, entry):
        """Return the most probable class for ``entry`` (one value per feature).

        Ties keep the class seen first during training. ``None`` is returned
        only when the classifier was built from an empty training set.
        """
        solve = None # Final result
        max_arg = -1 # partial maximum

        for y, class_count in self.output_dom.items():

            prob = class_count/self.N # P(y)

            for i in range(self.dim):
                value = entry[i]
                # training rows of class y with X_i == value (NaN matches nothing)
                n = 0 if value != value else self._joint_counts.get((i, value, y), 0)
                # P(X_i = x_i | y). The original divided by self.N, i.e. used the
                # joint probability, which multiplies in an extra factor P(y) per
                # feature and biases the decision toward the majority class.
                prob *= n/class_count

            # if we have a greater prob for this output than the partial maximum...
            if prob > max_arg:
                max_arg = prob
                solve = y

        return solve

In [None]:
# Feature rows to classify and the labels to compare the predictions against.
values = df_nbc_test.drop(['Survived'], axis=1).values
print (len(values))


## Creating the Naive Bayes Classifier instance with the training data
nbc_features = df_nbc_train.drop(['Survived'], axis=1).values
nbc_labels = df_nbc_train['Survived'].values
nbc = NaiveBayesClassifier(nbc_features, nbc_labels)


total_cases = len(df_nbc_test) # size of validation set
expected_labels = df_nbc_test.Survived.values

# Correctly and incorrectly classified examples
good = 0
bad = 0
for i, feature_row in enumerate(values[:total_cases]):
    predict = nbc.classify(feature_row)
    # progress marker every 100 rows
    if i % 100 == 0:
        print(i)
#     print(y_val[i] + ' --------------- ' + predict)
    if expected_labels[i] == predict:
        good += 1
    else:
        bad += 1

print('TOTAL EXAMPLES:', total_cases)
print('RIGHT:', good)
print('WRONG:', bad)
print('ACCURACY:', good/total_cases)

## LightAutoML

In [None]:
pip install -U lightautoml