# Exploratory Data Analysis
Data Set: Tabular Playground Series April 2021

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Graphic style settings
# The top/right spines are switched off through the style's rc overrides so the
# setting applies to every figure drawn afterwards. A bare sns.despine() call at
# this point would only act on an empty figure created on the spot and would not
# affect any later plot.
sns.set_style('dark', {'axes.spines.top': False, 'axes.spines.right': False}) # base styles: darkgrid, whitegrid, dark, white and ticks
plt.rc('axes', titlesize=18)     # fontsize of the axes title
plt.rc('axes', labelsize=14)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=13)    # fontsize of the x tick labels
plt.rc('ytick', labelsize=13)    # fontsize of the y tick labels
plt.rc('legend', fontsize=13)    # legend fontsize
plt.rc('font', size=13)          # default text size

In [None]:
# Load the competition's train and test CSV files.
train_path = '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
test_path = '/kaggle/input/tabular-playground-series-apr-2021/test.csv'

# Only the training frame uses PassengerId as its index; the test frame keeps it as a column.
train_df = pd.read_csv(train_path, index_col='PassengerId')
test_df = pd.read_csv(test_path)

## Data Exploration

In [None]:
# Preview the first ten rows of the training data
train_df.head(n=10)

In [None]:
# (rows, columns) of the training data
n_rows, n_cols = train_df.shape
(n_rows, n_cols)

In [None]:
# Summary of the data set: columns, non-null counts and dtypes.
# info must be *called*; the bare attribute only shows the bound-method repr.
train_df.info()

In [None]:
# Descriptive statistics of the training data (pandas' default column selection)
summary_stats = train_df.describe()
summary_stats

In [None]:
# Frequency of each distinct value in the Survived column
survived_counts = train_df['Survived'].value_counts()
survived_counts

In [None]:
# Frequency of each distinct value in the Pclass column
pclass_counts = train_df['Pclass'].value_counts()
pclass_counts

In [None]:
# Frequency of each distinct value in the Age column
age_counts = train_df['Age'].value_counts()
age_counts

In [None]:
# Frequency of each distinct value in the SibSp column
sibsp_counts = train_df['SibSp'].value_counts()
sibsp_counts

In [None]:
# Frequency of each distinct value in the Parch column
parch_counts = train_df['Parch'].value_counts()
parch_counts

In [None]:
# Frequency of each distinct value in the Fare column
fare_counts = train_df['Fare'].value_counts()
fare_counts

In [None]:
# Frequency of each distinct value in the Sex column
sex_counts = train_df['Sex'].value_counts()
sex_counts

In [None]:
# Frequency of each distinct value in the Embarked column
embarked_counts = train_df['Embarked'].value_counts()
embarked_counts

### Observations:
* Most people travelled in 3rd class (3 > 1 > 2).
* Almost half of the people were female.
* Almost half of the people survived.
* Name, Ticket and Cabin are the least usable columns.

### Implications:
* Embarked can be converted into numerical values: S:1, C:2, Q:3.
* Sex can be converted into boolean value: female:0, male:1.

In [None]:
# Convert Embarked
# Encode the embarkation code as a number (S:1, C:2, Q:3); values that are not
# keys of the mapping (including missing ones) become NaN.
# The dtype guard keeps the cell safe to re-run: mapping an already-encoded
# column again would match none of the keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(train_df['Embarked']):
    train_df['Embarked'] = train_df['Embarked'].map({'S':1, 'C':2, 'Q':3})

In [None]:
# Convert Sex
# Encode Sex as a number (female:0, male:1); values that are not keys of the
# mapping become NaN.
# The dtype guard keeps the cell safe to re-run: mapping an already-encoded
# column again would match none of the keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(train_df['Sex']):
    train_df['Sex'] = train_df['Sex'].map({'female':0, 'male':1})

## Univariate Analysis

In [None]:
#Distribution Plot of Survived vs Gender
# Count passengers per (Sex, Survived) pair with size(), which counts rows.
# count() instead reports the non-null entries of whichever column is read back,
# so the bar heights would depend on that column having no missing values.
sex_survival_counts = (
    train_df
    .groupby(['Sex', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

fig, ax = plt.subplots(tight_layout = True)
sns.barplot(data = sex_survival_counts, x = 'Sex', y = 'Count', hue = 'Survived', palette = 'pastel', ax = ax)
ax.set(title = 'Gender vs Survived', xlabel = 'Sex', ylabel = 'Count')

plt.show()

In [None]:
#Distribution Plot of Survived vs Number of Parch
# Count passengers per (Parch, Survived) pair with size(), which counts rows.
# Reading the 'Age' column of a count() result would only count rows whose Age
# is present, undercounting every group that contains missing ages.
parch_survival_counts = (
    train_df
    .groupby(['Parch', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

fig, ax = plt.subplots(tight_layout = True)
sns.barplot(data = parch_survival_counts, x = 'Parch', y = 'Count', hue = 'Survived', palette = 'pastel', ax = ax)
ax.set(title = 'Number of parent/children vs Survived', xlabel = 'Parch', ylabel = 'Count')

plt.show()

In [None]:
#Distribution Plot of Survived vs Embarked
# Count passengers per (Embarked, Survived) pair with size(), which counts rows.
# Reading the 'Age' column of a count() result would only count rows whose Age
# is present, undercounting every group that contains missing ages.
embarked_survival_counts = (
    train_df
    .groupby(['Embarked', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

fig, ax = plt.subplots(tight_layout = True)
sns.barplot(data = embarked_survival_counts, x = 'Embarked', y = 'Count', hue = 'Survived', palette = 'pastel', ax = ax)
ax.set(title = 'Embarked vs Survived', xlabel = 'Embarked', ylabel = 'Count')

plt.show()

In [None]:
#Distribution Plot of Survived vs Number of siblings/spouse
# Count passengers per (SibSp, Survived) pair with size(), which counts rows.
# Reading the 'Age' column of a count() result would only count rows whose Age
# is present, undercounting every group that contains missing ages.
sibsp_survival_counts = (
    train_df
    .groupby(['SibSp', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

fig, ax = plt.subplots(tight_layout = True)
sns.barplot(data = sibsp_survival_counts, x = 'SibSp', y = 'Count', hue = 'Survived', palette = 'pastel', ax = ax)
ax.set(title = 'Number of siblings/spouses vs Survived', xlabel = 'SibSp', ylabel = 'Count')

plt.show()

In [None]:
#Distribution Plot of Survived vs Socio-economic Status
# Count passengers per (Pclass, Survived) pair with size(), which counts rows.
# Reading the 'Age' column of a count() result would only count rows whose Age
# is present, undercounting every group that contains missing ages.
pclass_survival_counts = (
    train_df
    .groupby(['Pclass', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

fig, ax = plt.subplots(tight_layout = True)
sns.barplot(data = pclass_survival_counts, x = 'Pclass', y = 'Count', hue = 'Survived', palette = 'pastel', ax = ax)
ax.set(title = 'Socio-Economic Status vs Survived', xlabel = 'Pclass', ylabel = 'Count')

plt.show()

### Observations:
* Most males died.
* Most females survived.
* People travelling with no parents/children, siblings or spouses died more often.
* People who embarked at C and Q survived more often.
* People in 3rd class died more often.


## Bivariate Analysis:


In [None]:
# Box plot of the Fare distribution for each value of Survived
fig, ax = plt.subplots(figsize = (10, 10), tight_layout = True)
sns.boxplot(
    x = 'Survived',
    y = 'Fare',
    data = train_df,
    linewidth = 2.5,
    palette = 'Set2',
    ax = ax,
)
ax.set_title('Fare vs Survived')
ax.set_xlabel('Survived')
ax.set_ylabel('Fare')
plt.show()

In [None]:
# Box plot of the Age distribution for each value of Pclass
ax = sns.boxplot(
    x = 'Pclass',
    y = 'Age',
    data = train_df,
    linewidth = 2.5,
    palette = 'Set2',
)
ax.set_title('Pclass vs Age')
ax.set_xlabel('Pclass')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Box plot of the Fare distribution for each value of Embarked
fig, ax = plt.subplots(figsize = (10, 10), tight_layout = True)
sns.boxplot(
    x = 'Embarked',
    y = 'Fare',
    data = train_df,
    linewidth = 2.5,
    palette = 'Set2',
    ax = ax,
)
ax.set_title('Embarked vs Fare')
ax.set_xlabel('Embarked')
ax.set_ylabel('Fare')
plt.show()

In [None]:
#Boxplot of Survived vs Age
# Box plot of the Age distribution for each value of Survived.
ax = sns.boxplot(data = train_df, x = 'Survived', y = 'Age', palette = 'Set2', linewidth = 2.5)
# The title names Age, the variable actually plotted on the y axis.
ax.set(title = 'Survived vs Age', xlabel = 'Survived', ylabel = 'Age')
plt.show()

### Observations:
* People who paid a higher fare survived more often.
* People in first class were older.
* People who embarked at ports 2 (C) and 3 (Q) paid more.
* People with more expensive fares survived more often.


In [None]:
# Pairwise relationships between the selected columns, coloured by Survived.
pairplot_cols = ['Survived', 'Age', 'Pclass', 'Embarked', 'Parch', 'SibSp', 'Sex']
pairplot_df = train_df[pairplot_cols]
# pairplot() draws the hue legend itself, so add_legend() is not chained on the
# returned grid: doing so would add a second legend to the figure.
sns.pairplot(pairplot_df, hue = "Survived", height = 5, palette = 'Set2')
plt.show()