In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Titanic EDA

The aim of this notebook is to visualize Titanic data in order to get some insights for the development of a predictive model.

The notebook is divided into three parts:

1- import of data

2- Missing value imputation 

3- Data visualization

In [None]:
# Load the train and test splits of the Tabular Playground Series (April 2021) competition
train, test = (
    pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv'),
)
print("Data imported")

First, after importing the data, let's take a look at train and test data types.

In [None]:
# Print the column dtypes of each dataset, train first and then test
for label, frame in (("Train dtypes :", train), ("Test dtypes :", test)):
    print(label, frame.dtypes)

Variables can be divided into categorical and numerical variables.
Categorical variables are:
* Pclass
* Sex
* Embarked
* Ticket
* SibSp
* Parch

Numerical variables are:
* Age
* Fare

The target variable is Survived: 1 if the passenger survived, 0 otherwise.


Before data visualization it's important to know how many null values there are in both the training and testing datasets.

In [None]:
# For each dataset: fraction of nulls per column, rounded to 3 decimals, shown as a percentage
null_reports = (
    ("Percentages of null values in training data :\n", train),
    ("Percentages of null values in testing data :\n", test),
)
for header, frame in null_reports:
    null_fraction = frame.isnull().sum() / len(frame)
    print(header, np.round(null_fraction, 3) * 100)

As you can see, some variables like Cabin have a high percentage of null values (67% for the training dataset and 70.8% for the testing dataset), whereas other variables like Fare have a very low rate of null values. Having a variable with a high rate of null values doesn't mean that you have to drop it. Instead, you can map the null values to a dedicated placeholder value and still use the variable in the model. In some cases null values can also be informative for the development of the model.
So let's take a look at some strategies to impute missing values.

## Dealing with missing values

#### Missing values in Cabin

In order not to drop the column, we can replace the missing values in Cabin with 'W' and create a new categorical variable, Cabin_initial, holding the first character of the Cabin value.

In [None]:
# Derive a coarse 'Cabin_initial' feature (first character of the cabin code) for both
# datasets, then drop the raw 'Cabin' column. Both frames are modified in place.
df = [train, test]

for d in df:
    # Once 'Cabin' has been consumed there is nothing left to do; skipping here keeps the
    # cell safe to re-run instead of failing with a KeyError on the missing column.
    if 'Cabin' not in d.columns:
        continue
    # Missing cabins get the placeholder 'W'; .str[0] takes the first character of each value.
    d['Cabin_initial'] = d['Cabin'].fillna('W').str[0]
    d.drop('Cabin', axis=1, inplace=True)

#### Missing values in Age, Embarked and Fare

We can use SimpleImputer to deal with missing values in Age, Fare and Embarked: for the first two the imputing strategy is 'mean', while for the latter the strategy is 'most_frequent'.
We will then drop the Ticket column.

In [None]:
from sklearn.impute import SimpleImputer


def impute_column(frame, column, strategy, decimals=None):
    """Fill the missing values of ``frame[column]`` in place using a SimpleImputer.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataset to modify in place.
    column : str
        Name of the column whose NaN values are imputed.
    strategy : str
        SimpleImputer strategy, e.g. 'mean' or 'most_frequent'.
    decimals : int, optional
        When given, the imputed column is rounded to this many decimals.

    The imputer is fitted on ``frame`` itself, so each dataset is filled with
    its own statistic.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    filled = imputer.fit_transform(frame[[column]]).ravel()
    if decimals is not None:
        filled = np.round(filled, decimals)
    # Drop and re-append so the imputed column ends up as the last column of the frame.
    frame.drop(column, axis=1, inplace=True)
    frame[column] = filled


df = [train, test]

for d in df:
    # Age: mean imputation, rounded to two decimals
    impute_column(d, 'Age', strategy='mean', decimals=2)
    # Embarked: categorical, so fill with the most frequent port
    impute_column(d, 'Embarked', strategy='most_frequent')
    # Fare: continuous, so fill with the mean (the mode of a continuous variable is arbitrary)
    impute_column(d, 'Fare', strategy='mean')
    # Ticket is not used in the rest of the analysis; errors='ignore' keeps the cell re-runnable
    d.drop('Ticket', axis=1, inplace=True, errors='ignore')


After imputation we can take a look at our training and test datasets to see if there are any other missing values.

In [None]:
# Total number of null cells left in each dataset, summed over all columns
print('Null values for training:', train.isnull().sum().sum())
print('Null values for testing:', test.isnull().sum().sum())

Everything seems ok after imputation so let's move on to data visualization!

## Visualizing data

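First of all we can define a function to label data in the graphs. This function takes the bars of a plot, a dataframe, an axes and a vertical offset, and writes above each bar (at a y_shift distance from its top) the bar's height as a percentage of the number of rows in the dataframe.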

In [None]:
def autolabel(rects,df,ax, y_shift):
    """Write on top of every bar its height as a percentage of ``len(df)``.

    Parameters
    ----------
    rects : iterable
        Bar patches; each must provide ``get_height()``, ``get_x()`` and ``get_width()``.
    df : sized object
        Reference dataset; ``len(df)`` is the denominator of the percentage.
    ax : axes-like object
        Target whose ``text`` method draws the label.
    y_shift : number
        Vertical offset added to the bar height to position the label.
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Horizontal centre of the bar
        x_center = bar.get_x()+ bar.get_width()/2
        # Bar height as a share of all rows, in percent with two decimals
        share = np.round(bar_height/len(df)*100,2)
        ax.text(x = x_center, y = bar_height+y_shift, s = f'{share}', horizontalalignment='center')
    

So let's first plot overall survival rate.

In [None]:
# Bar chart with the number of passengers for each value of the Survived target
fig, ax = plt.subplots()
sns.countplot(data=train, x='Survived', palette='pastel', ax=ax)
sns.despine()
plt.xticks([0,1],['Not survived', 'Survived'])
ax.set_title('Survival rate',fontsize = 15)
ax.set(xlabel='', ylabel='Passenger counts')
# Annotate every bar with its share of all training rows, 1500 counts above the bar top
autolabel(ax.patches, train, ax,1500)

Survival rate is 57.23 %. This value alone, however, does not provide enough information about the data. We need to see how the survival rate is related to the other variables.

Before plotting, let's map the Pclass values to their respective classes (First class, Second class and Third class) and the Embarked values to their respective ports of embarkation.

In [None]:
# Replace the raw codes with human-readable labels for plotting.
# Note: .map turns any value missing from the dictionary into NaN, so running this cell
# a second time on the already-mapped columns would blank them out.
pclass_labels = {1: 'First class', 2: 'Second class',3:'Third class'}
port_names = {'C' : 'Cherbourg', 'S' : 'Southampton', 'Q' : 'Queenstown'}
train['Pclass'] = train['Pclass'].map(pclass_labels)
train['Embarked'] = train['Embarked'].map(port_names)

A first insight can be obtained by plotting how survival varies across class, port of embarkation and cabin initial.

In [None]:
# Survived / not-survived counts for three categorical features, one panel per feature
fig, ax = plt.subplots(1,3, figsize = (10,5))

# (column to plot, whether to annotate the bars with percentages).
# The cabin panel is left without percentage labels, as in the original layout
# (presumably because it has many narrow bars -- confirm before enabling).
panels = [('Pclass', True), ('Embarked', True), ('Cabin_initial', False)]

for axis, (column, with_labels) in zip(ax, panels):
    sns.countplot(x = column, data = train, ax = axis, palette = 'pastel', hue = 'Survived')
    axis.set_xlabel('')
    plt.setp(axis.xaxis.get_majorticklabels(), rotation=30)
    if with_labels:
        # Percentages are relative to the total number of training rows
        autolabel(axis.patches, train, ax = axis, y_shift = 100)

# The third panel shows the cabin initial (Ticket is not plotted), so name it correctly
plt.suptitle('Number of survival for Class, Port of embark and Cabin initial', fontsize = 15)
sns.despine()
plt.tight_layout()

As we can see First class and Second Class have a higher survival rate than Third class.
There is also a difference in the port of embark: people who embarked in Southampton and Cherbourg have a higher chance of survival than people embarked in Queenstown.


Another column that we can take into account is Sex: let's see whether the survival rate differs between male and female passengers.

In [None]:
# Survival rate per sex, in percent: survivors of a sex divided by the passengers of that
# sex. (Dividing by len(train) instead would give each sex's share of the whole
# dataset, which mixes the survival rate with the male/female proportions.)
temp = np.round(train.groupby(['Sex'])['Survived'].mean()*100,2)
fig, ax  = plt.subplots(figsize = (6,4))
sns.barplot(x = temp.index, y = temp, palette = 'Pastel2', ax = ax)
sns.despine()
ax.set_title('Survival rate for sex')
ax.set_ylabel('Survival rate')
print(temp)

Women have a higher survival rate than men (about three times higher). So let's go deeper:
are there any differences between classes or ports of embarkation?

In [None]:
# Survival rate (in percent) for every (port, class, sex) group.
# The mean of the 0/1 Survived flag is taken within the same three-key group, so each
# rate is "survivors of the group / passengers of the group". Using a denominator grouped
# only by (Embarked, Pclass) would divide by passengers of both sexes.
group_keys = ['Embarked','Pclass','Sex']
combined = (
    train.groupby(group_keys, as_index = False)['Survived']
    .mean()
    .rename(columns = {'Survived': 'Survival_rate'})
)
combined['Survival_rate'] = np.round(combined['Survival_rate']*100,2)

In [None]:
# One small bar chart per (port of embarkation, class) pair, comparing female and male rates
grid = sns.FacetGrid(
    combined,
    row = 'Embarked',
    col = 'Pclass',
    height=2.5,
    margin_titles=True,
)
grid.map(sns.barplot, 'Sex','Survival_rate', order = ['female','male'], palette = 'Pastel2')
grid.set_axis_labels('','Survival rate')
grid.fig.subplots_adjust(wspace=.02, hspace=.02)
plt.tight_layout()

In [None]:
# Display the survival-rate table (one row per port, class and sex) built two cells earlier
combined

As we can see, the survival rate is higher for females. Southampton is the port of embarkation where the survival rate is lowest for both females and males. Second class seems to have the highest chances of survival. Cherbourg is the port of embarkation where the survival rate is highest. (Southampton was the first port of embarkation, followed by Cherbourg and Queenstown.) To understand the survival rate it's very important to study how the different passengers were distributed throughout the Titanic.

Let's now take into account another variable, Age, and plot how survival changes across different ages.

In [None]:
# Stacked KDE of Age split by the Survived flag, faceted by Sex (columns) and Pclass (rows)
age_grid = sns.displot(train ,x ='Age', hue ='Survived', kind ='kde', multiple = 'stack', col = 'Sex', row = 'Pclass', palette = 'Pastel1')
# Label the y axis through the grid so the label appears on the left-hand facets;
# plt.ylabel would only touch the single facet that happens to be the current axes.
age_grid.set_ylabels('Survival density')
# The facets are Sex and Pclass (not the port of embarkation), so title accordingly
plt.suptitle('Survival density for Sex and Pclass', y =1.009, fontsize = 15)

One thing that you will probably note is that first class has a higher average age than second and third class. Let's take a look to see if this is correct.

In [None]:
# Mean passenger age per class, rounded to one decimal
train.groupby('Pclass')['Age'].mean().round(1)

Yes, it's true, we can also split average Age between male and female.

In [None]:
# Mean passenger age per (class, sex) pair as a flat table, rounded to one decimal
train.groupby(['Pclass','Sex'], as_index = False)['Age'].mean().round(1)

Is there a relationship between survival and Fare?

In [None]:
# Fare against Age with marginal distributions, coloured by the Survived flag
sns.jointplot(
    x = 'Fare',
    y = 'Age',
    hue = 'Survived',
    data = train,
)

Is there a significant difference in the correlations between variables for passengers who survived versus those who did not? Let's take a look with a heatmap.

In [None]:
# Correlation matrices of the numeric features, computed separately for survivors and
# non-survivors and drawn side by side.
# Only numeric columns are kept: at this point train still holds string columns
# (Name, Sex, the mapped Pclass/Embarked labels, Cabin_initial), and DataFrame.corr()
# raises on non-numeric data in recent pandas versions instead of silently skipping it.
numeric_columns = train.select_dtypes(include='number').columns
col_heatmap = [col for col in numeric_columns if col not in ('Survived','PassengerId')]
train_survived = train[train['Survived'] ==1]
train_not_survived = train[train['Survived'] ==0]
corr_surv = train_survived[col_heatmap].corr()
corr_notsurv = train_not_survived[col_heatmap].corr()
# Hide the diagonal and the upper triangle, which only mirror the lower triangle
mask = np.triu(np.ones_like(corr_surv,dtype=bool))
fig,ax = plt.subplots(nrows = 1, ncols = 2,figsize = (10,6))
heatmaps = (
    (ax[0], corr_surv, 'Correlation for Survived'),
    (ax[1], corr_notsurv, 'Correlation for not survived'),
)
for axis, corr, title in heatmaps:
    sns.heatmap(corr, mask = mask, ax = axis, annot = True)
    axis.set_title(title)
plt.tight_layout()

For survivors there seems to be a higher correlation between Age and Fare: that is, most of the people who survived were in first class, where the fare is higher and the average age is also higher.

Let's now take a look at SibSp and Parch by creating a new variable that sums the two.

In [None]:
# Family size on board: SibSp plus Parch, element-wise
train['Sib+Parch'] = train['SibSp'].add(train['Parch'])

In [None]:
# KDE of the family-size feature by Survived flag, faceted by port (columns) and class (rows)
sns.displot(
    train,
    x = 'Sib+Parch',
    hue = 'Survived',
    kind = 'kde',
    row = 'Pclass',
    col = 'Embarked',
    palette = 'Pastel1',
)

## Comparison between training and testing dataset

Now we can compare the training and testing datasets to see if there are any differences in the distribution of data which could affect the development of a machine learning model.

We can create a new dataframe by concatenating training and testing.

In [None]:
# Remove the columns that are not compared between the two datasets (in place)
train.drop(columns = ['PassengerId','Survived','Name'], inplace = True)
test.drop(columns = ['PassengerId','Name'], inplace = True)

# Convert the display labels on train back to the raw codes that test still uses
class_codes = {'First class':1, 'Second class':2, 'Third class':3}
port_codes = {'Cherbourg':'C',  'Southampton':'S',  'Queenstown':'Q'}
train['Pclass'] = train['Pclass'].map(class_codes)
train['Embarked'] = train['Embarked'].map(port_codes)

# test does not have the family-size feature yet
test['Sib+Parch'] = test['SibSp'].add(test['Parch'])

# Tag every row with its origin, then stack train on top of test with a fresh index
train['data'] = 'train'
test['data'] ='test'
df = pd.concat([train,test],axis = 0, ignore_index = True)

In [None]:
# 2x3 grid comparing the distribution of each feature between the train and test rows of df
fig , ax = plt.subplots(nrows = 2, ncols = 3, figsize = (12,10))
fig.suptitle('Distribution of data between train and test dataset', size = 20)

# Categorical features: (target axes, column, palette, panel title)
count_panels = [
    (ax[0][0], 'Sex', 'pastel', 'Male and female'),
    (ax[0][2], 'Sib+Parch', 'Pastel1', 'Sib+Parch'),
    (ax[1][0], 'Embarked', 'Pastel1', 'Embarked'),
    (ax[1][1], 'Cabin_initial', 'Pastel1', 'Cabin initial'),
]
for axis, column, palette, title in count_panels:
    sns.countplot(data = df, x = column, hue = 'data', palette = palette, ax = axis)
    axis.set_title(title)

# Continuous features: histograms (Fare uses 5 bins with the two datasets side by side)
sns.histplot(data=df, x="Age", hue = 'data', palette = 'Pastel1', ax =ax[0][1])
ax[0][1].set_title('Age distribution')
sns.histplot(data=df, x="Fare", hue = 'data', palette = 'Pastel1', multiple = 'dodge', bins = 5, ax =ax[1][2])
ax[1][2].set_title('Fare distribution')

plt.tight_layout()

There seems to be a difference in the Age distribution between training and testing: in the testing dataset there seems to be a higher concentration of ages between 20 and 30. The male and female distribution also seems slightly different.

Next steps: in another notebook I will develop a predictive model trying to exploit the intuitions about data distribution and relationship of features with the target variable gained in this notebook.