In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<br>
<h1 style = "font-size:35px; font-family:cursive ; font-weight : bold; color : #AA3073; text-align: center; border-radius: 10px 100px;">TITANIC EDA </h1>
<br>

In [None]:
from IPython.display import HTML
# Build an HTML object carrying a <style> rule for h1/h2/h3 headings (margins,
# font, inset box-shadow, rounded corners). It is the last expression in the
# cell, so the notebook displays it as the cell output.
HTML("""
<style>
h1,h2,h3 {
    margin: 1em 0 0.5em 0;
    font-weight: 600;
    font-family: 'Titillium Web', sans-serif;
    position: relative;  
    font-size: 36px;
    line-height: 40px;
    padding: 15px 15px 15px 2.5%;
    color: #13003A;
    box-shadow:
        inset 0 0 0 1px rgba(53,86,129, 1), 
		inset 0 0 5px rgba(53,86,129, 1),
		inset -285px 0 35px white;;
    border-radius: 0 10px 0 15px;
    background: #fff
    
}
</style>
""")

<h1 style="background-color:#9370DB;font-size:20px;color:#00033E;font-weight : bold">Importance of EDA</h1>

**Exploratory Data Analysis (EDA), also known as Data Exploration, is a step in the Data Analysis Process, where a number of techniques are used to better understand the dataset being used**.
1. Detect outliers and anomalies
2. Determine the quality of data
3. Determine what statistical models can fit the data
4. Find out whether the assumptions about the data that you or your team started out with are correct or way off.
5. Extract variables or dimensions on which the data can be pivoted.
6. Determine whether to apply univariate or multivariate analytical techniques.

<h1 style="background-color:#9370DB;font-size:20px;color:#00033E;font-weight : bold">Loading Data</h1>

In [None]:
# Locations of the competition CSV files, relative to the notebook
train_path = '../input/titanic/train.csv'
test_path = '../input/titanic/test.csv'

# Parse each comma-separated file into its own pandas DataFrame
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report (rows, columns) of each frame
for caption, frame in (('Train shape: ', train), ('Test shape: ', test)):
    print(caption, frame.shape)



In [None]:
# Preview the first five rows of the training set (displayed as the cell output)
train.head()

<h1 style="background-color:#9370DB;font-size:20px;color:#00033E;font-weight : bold">Types of Variables</h1>

In [None]:
# Stack train and test into a single frame for whole-dataset exploration.
# ignore_index=True gives the result a fresh 0..n-1 index: both inputs come from
# read_csv with a default index starting at 0, so without it the row labels of
# `test` would repeat those of `train`, and duplicate labels can make later
# pandas/seaborn alignment fail ("cannot reindex on an axis with duplicate labels").
data = pd.concat([train, test], sort=False, ignore_index=True)
# Column dtypes and non-null counts of the combined frame
data.info()

Age and Fare are both numeric variables with a continuous value.
Pclass appears to be an integer, but it is actually a categorical variable with three numbers.
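The Survived variable has type 'float' although it is really a binary label — the rows that come from the test set have no Survived value, and the resulting NaN forces a float dtype — but we will leave it as float for now.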

<h1 style="background-color:#9370DB;font-size:20px;color:#00033E;font-weight : bold">Missing Values</h1>

In [None]:
# Heatmap of the boolean null-mask of the training set: cells where isnull() is
# True are drawn in the contrasting colour, so missing data shows up as stripes.
# The colormap must be passed to heatmap itself; a bare sns.color_palette(...)
# call only builds a colormap object and throws it away.
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='magma')

The Cabin column has so much missing data that it might be difficult to get any information from it, while Age also has some missing values.

<h1 style="background-color:#9370DB;font-size:20px;color:#00033E;font-weight : bold">Data Exploration and Visualization</h1>

SURVIVED

In [None]:
# Bar chart of the number of training passengers per Survived value
sns.countplot(data=train, x='Survived', palette='magma')

In [None]:
# Overall survival ratio: sum of the Survived column divided by its non-null count
survived_column = train['Survived']
print(survived_column.sum() / survived_column.count())

Implies that more people died than survived

Females have higher chances to survive

In [None]:
# Survival rate per sex, printed as an actual percentage so the value matches
# its "% of ..." label (a bare sum/count ratio is a 0-1 fraction, not a percent).
for label, sex in (("% of women survived: ", 'female'), ("% of men survived:   ", 'male')):
    outcomes = train.loc[train['Sex'] == sex, 'Survived']
    rate = outcomes.sum() / outcomes.count()
    print(label, f"{rate:.2%}")

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side pie charts: survival split for men (left) and women (right).
f, ax = plt.subplots(1, 2, figsize=(16, 7))
colors = ['#eed4d0', '#cda0aa']

for pie_ax, sex in zip(ax, ('male', 'female')):
    # value_counts() orders by frequency, so the slice order (and with it which
    # slice gets which colour and the explode offset) can differ between the two
    # subsets. Sorting by the Survived value keeps both pies consistent.
    # .loc with a boolean mask replaces the chained train['Survived'][mask] lookup.
    outcome_counts = train.loc[train['Sex'] == sex, 'Survived'].value_counts().sort_index()
    outcome_counts.plot.pie(explode=[0, 0.2], colors=colors, autopct='%1.1f%%',
                            ax=pie_ax, shadow=True)
    pie_ax.set_title(f'Survived ({sex})')

plt.show()

In [None]:
# Point plot of Survived against ticket class. x and y are passed by keyword:
# seaborn deprecated positional x/y in 0.11 and rejects them from 0.12 on.
sns.catplot(x='Pclass', y='Survived', kind='point', data=train, palette="tab10")

**Passenger Class and Sex**

In [None]:
# Bar chart of passenger counts per ticket class (combined train + test data).
# The column is named via x=/data= because recent seaborn versions no longer
# accept a positional data vector as the x variable.
ax = sns.countplot(x='Pclass', data=data, palette=['#eed4d0', '#cda0aa', '#a2708e'])
# Passengers per class, sorted by class value so the i-th count belongs to the
# i-th bar (value_counts(sort=False) does not guarantee that order).
labels = data['Pclass'].value_counts().sort_index()
# Write each count just above its bar
for i, v in enumerate(labels):
    ax.text(i, v + 2, str(v), horizontalalignment='center', size=12, color='black', fontweight='bold')


# The chart is grouped by ticket class, not family size
plt.title('Passengers distribution by ticket class')
plt.ylabel('Number of passengers')
plt.tight_layout()

Most of the Titanic's passengers were traveling third class and the second class is the smallest in terms of the number of passengers.

Class vs. Gender vs. Age -> Surviving status
To better understand how a combination of factors influences the chance of survival, let's break the passengers into 18 imaginary groups separated by:

Class (1 / 2 / 3)
Gender (male / female)
Age ( <16 / 16-40 / 40<)
To do so I will create 6 stripplots (3 for male, 3 for female), with values grouped by Surviving status and add background color to separate age groups:

In [None]:
plt.figure(figsize=(20, 6))

# Shared palette for the left-hand panel
palette = sns.cubehelix_palette(5, start=3)

# Left panel: box plot of Age per ticket class with the individual passengers
# overlaid as a strip plot (fliersize=0 hides the box-plot outlier markers so
# the points are not drawn twice).
plt.subplot(1, 2, 1)
sns.boxplot(x='Pclass', y='Age', data=data,
            palette=palette, fliersize=0)

sns.stripplot(x='Pclass', y='Age', data=data,
              linewidth=0.6, palette=palette)
plt.xticks(np.arange(3), ['1st class', '2nd class', '3rd class'])
plt.title('Age distribution grouped by ticket class (all data)', fontsize=16)
plt.xlabel('Ticket class')


# Right panel: one age density curve per ticket class
plt.subplot(1, 2, 2)

# Rows of each class that have a positive Age (the comparison also drops NaN ages)
age_1_class = data[(data["Age"] > 0) & (data["Pclass"] == 1)]
age_2_class = data[(data["Age"] > 0) & (data["Pclass"] == 2)]
age_3_class = data[(data["Age"] > 0) & (data["Pclass"] == 3)]

# fill=True replaces the deprecated shade=True, matching the kdeplot call used
# elsewhere in this notebook.
for class_rows, curve_color, curve_label in (
    (age_1_class, '#eed4d0', '1st class'),
    (age_2_class, '#cda0aa', '2nd class'),
    (age_3_class, '#a2708e', '3rd class'),
):
    sns.kdeplot(x=class_rows["Age"], fill=True, color=curve_color, label=curve_label)

# Current seaborn does not draw a legend for label= on its own, so request it
plt.legend()
plt.title('Age distribution grouped by ticket class (all data)', fontsize=16)
plt.xlabel('Age')
plt.xlim(0, 90)
plt.tight_layout()
plt.show()

**EMBARKED**

In [None]:
# Count of each Survived value, drawn as one facet per Embarked value
sns.catplot(data=data, x='Survived', col='Embarked', kind='count', color='#eed4d0')

In [None]:
# Point plot of Survived against port of embarkation, split by sex.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 on.
sns.catplot(x='Embarked', y='Survived', hue='Sex', color='#a2708e', kind='point', data=train)

**FARE**
Let's see how fare is distributed using a dist plot and a swarm plot.

In [None]:
# Histogram of Fare with a KDE curve on a density scale. sns.distplot is
# deprecated (and removed in newer seaborn releases); histplot with kde=True and
# stat='density' is its documented replacement.
# Missing fares are dropped and the index reset so the plotted vector has unique
# row labels even though `data` is a concatenation of two frames.
fare_values = data['Fare'].dropna().reset_index(drop=True)
sns.histplot(x=fare_values, kde=True, stat='density', color='#a2708e')

In [None]:
# Swarm plot of Fare per ticket class, coloured by Survived
swarm_colors = sns.cubehelix_palette(5, start=3)
sns.catplot(
    data=data,
    x="Pclass",
    y="Fare",
    hue="Survived",
    kind="swarm",
    palette=swarm_colors,
    height=6,
)

plt.tight_layout()

It can be clearly seen that the distribution of prices for the second and third class is very similar. The distribution of first-class prices is very different, has a larger spread, and on average prices are higher.
Also, the chance of survival increases with a higher price paid.


In [None]:
# Three views of the Fare distribution by sex, side by side:
# density curves, a boxen plot and a violin plot.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))
kde_ax, boxen_ax, violin_ax = ax

sns.kdeplot(data=train, x='Fare', hue='Sex', fill=True, color='#a2708e', ax=kde_ax)
sns.boxenplot(data=train, x='Sex', y='Fare', palette='BrBG', ax=boxen_ax)
sns.violinplot(
    data=train,
    x='Sex',
    y='Fare',
    palette=sns.cubehelix_palette(5, start=3),
    ax=violin_ax,
)

# Remove the top and right spines before rendering
sns.despine()
plt.show()

Visualisation of fare distributions