# CONTENT
> 
<font color = 'red'>
    
1. [Load and Check Data](#1)
2. [Variable Description](#2)
    *    [Univariate Variable Analysis](#3) 
            * [Categorical Variable](#4) 
            * [Numerical Variable](#5) 
3. [Basic Data Analysis](#6)
4. [Outlier Detection](#7)
5. [Missing Value](#8)
    * [Find Missing Value](#9)
    * [Fill Missing Value](#10)
6. [Visualization](#11)
    * [Correlation Between Sibsp & Parch & Age & Fare & Survived](#12)
    * [Sibsp & Survived](#13)
    * [Pclass & Survived](#14)
    * [Age & Survived](#15)
    * [Survived & Age & Pclass](#16)
    * [Embarked & Sex & Pclass & Survived](#17)
    * [Embarked & Sex & Fare & Survived](#18)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
# White-grid plotting style. matplotlib 3.6 renamed its bundled seaborn styles to
# "seaborn-v0_8-*" and later dropped the old names, so use whichever name this
# installation provides (plt.style.available lists every usable style).
plt.style.use("seaborn-v0_8-whitegrid" if "seaborn-v0_8-whitegrid" in plt.style.available else "seaborn-whitegrid")

import seaborn as sns

import warnings 
warnings.filterwarnings("ignore")

from collections import Counter


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id = "1"></a><br>
# Load and Check Data

In [None]:
# Load the training CSV into `df`, the frame used by the exploration cells below
df = pd.read_csv("/kaggle/input/titanicdataset-traincsv/train.csv")

In [None]:
# List the column names that were parsed from the CSV
df.columns

In [None]:
# Preview the first rows of the frame
df.head()

In [None]:
# Summary statistics for the frame's columns
df.describe()

<a id = "2"></a><br>
# Variable Description
1. PassengerId : unique id number to each passenger
1. Survived : whether the passenger survived (1) or died (0)
1. Pclass : passenger class
1. Name : name
1. Sex : gender of passenger
1. Age : age of passenger
1. SibSp : number of siblings/spouses 
1. Parch : number of parents/children
1. Ticket : ticket number
1. Fare : amount of money spent on ticket
1. Cabin : cabin category
1. Embarked : port where passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# Column dtypes and non-null counts (shows which columns contain missing values)
df.info()

 <a id = "3"></a><br>
 # Univariate Variable Analysis
* Categorical Variable : Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch
* Numerical Variable : Fare, Age and PassengerId

<a id = "4"></a><br>
## Categorical Variable


In [None]:
def bar_plot(variable):
    """
        Draw a bar chart of how often each value of a column occurs, then
        print the same counts.

        Input : variable -- name of a column of the global ``df``, ex : "Survived"
        Output : bar plot & value count (nothing is returned)
    """

    # occurrences of each distinct value, most frequent first
    counts = df[variable].value_counts()
    categories = counts.index

    # one bar per distinct value, labelled with the value itself
    plt.figure(figsize = (9,3))
    plt.bar(categories, counts)
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print("{}: \n {}".format(variable,counts))
    

In [None]:
# Categorical columns with few distinct values: one bar chart each
category1 = ["Survived","Sex","Pclass","Embarked","SibSp","Parch"]

for feature in category1:
    bar_plot(feature)



In [None]:
# These columns are only summarised as printed value counts, not plotted
category2 = ["Cabin","Name","Ticket"]

for feature in category2:
    counts = df[feature].value_counts()
    print("{} \n ".format(counts))

<a id = "5"></a><br>
## Numerical Variable

In [None]:
def hist_plot(variable):
    """
        Show a 50-bin histogram of one column of the global ``df``.

        Input : variable -- column name, ex : "Fare"
        Output : histogram (nothing is returned)
    """
    values = df[variable]
    title = "{} distribution with hist".format(variable)

    plt.figure(figsize = (9,3))
    plt.hist(values, bins = 50)
    plt.title(title)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Numerical columns: one histogram each
numericVar = ["Fare","Age","PassengerId"]

for feature in numericVar:
    hist_plot(feature)


<a id = "6" ></a><br>
# Basic Data Analysis
* Pclass - Survived
* Sex - Survived
* SibSp - Survived
* Parch - Survived

In [None]:
# Pclass vs Survived: mean of Survived per passenger class, highest first
df.groupby("Pclass", as_index = False)["Survived"].mean().sort_values(by = "Survived", ascending = False)

In [None]:
# Sex vs Survived: mean of Survived per sex, highest first
df.groupby("Sex", as_index = False)["Survived"].mean().sort_values(by = "Survived", ascending = False)

In [None]:
# SibSp vs Survived: mean of Survived per number of siblings/spouses, highest first
df.groupby("SibSp", as_index = False)["Survived"].mean().sort_values(by = "Survived", ascending = False)

In [None]:
# Parch vs Survived: mean of Survived per number of parents/children, highest first
df.groupby("Parch", as_index = False)["Survived"].mean().sort_values(by = "Survived", ascending = False)

<a id = "7"></a><br>
# Outlier Detection

In [None]:
def detect_outliers(dataFrame, features, n=2):
    """
    Find rows that are 1.5 * IQR outliers in several features at once.

    For every column named in ``features`` the 25th and 75th percentiles are
    computed while ignoring missing values. A row counts as an outlier for
    that column when its value is below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data to scan.
    features : iterable of str
        Names of the numeric columns to check.
    n : int, default 2
        A row is returned only when it is an outlier in MORE than ``n`` of
        the given features.

    Returns
    -------
    list
        Index labels of the flagged rows, in the order they were first seen.
    """
    outlier_counts = Counter()

    for feature in features:
        column = dataFrame[feature]
        # nanpercentile, not percentile: np.percentile returns NaN as soon as
        # the column holds a missing value, which makes both comparisons below
        # False for every row and silently disables the check for that feature.
        Q1, Q3 = np.nanpercentile(column, [25, 75])
        outlier_step = (Q3 - Q1) * 1.5
        is_outlier = (column < Q1 - outlier_step) | (column > Q3 + outlier_step)
        # tally one hit per flagged row for this feature
        outlier_counts.update(column[is_outlier].index)

    return [index for index, hits in outlier_counts.items() if hits > n]

In [None]:
# Display the rows that detect_outliers flags across Age, SibSp, Parch and Fare
df.loc[detect_outliers(df,["Age","SibSp","Parch","Fare"])]

In [None]:
# drop the flagged rows, then renumber the index from 0
outlier_labels = detect_outliers(df,["Age","SibSp","Parch","Fare"])
df = df.drop(outlier_labels, axis = 0).reset_index(drop = True)

In [None]:
# Check the row count after dropping outliers (in the author's run: 891 --> 881 passengers, i.e. 10 deleted)
df.info()

<a id = "8"></a><br>
# Missing Value
* Find Missing Value
* Fill Missing Value

In [None]:
# Read the training file again and load the test file.
# NOTE(review): train_df starts from the original CSV, so the outlier rows dropped
# from `df` above are still present here -- confirm that this is intended.
train_df = pd.read_csv("/kaggle/input/titanicdataset-traincsv/train.csv")
test_df = pd.read_csv("/kaggle/input/testtitanic/titanic_data.csv")

In [None]:
# Remember how many rows came from the training file, then stack the test rows
# underneath them with a fresh 0..n-1 index
train_df_len = len(train_df)
train_df = pd.concat([train_df, test_df], axis = 0, ignore_index = True)


In [None]:
# Preview the first rows of the combined frame
train_df.head()

<a id = "9"></a><br>
## Find Missing Value

In [None]:
# Names of the columns that contain at least one missing value
train_df.columns[train_df.isnull().any()]

In [None]:
# Number of missing values in each column
train_df.isnull().sum()

<a id = "10"></a><br>
## Fill Missing Value
* Embarked has 4 missing values
* Age has 354 missing values

In [None]:
# Show the rows whose Embarked value is missing
train_df[train_df["Embarked"].isnull()]

In [None]:
# Box plot of Fare for each Embarked value
train_df.boxplot(column="Fare",by = "Embarked")
plt.show()

In [None]:
# Replace every missing Embarked value with "C"
# NOTE(review): presumably chosen from the Fare-by-Embarked box plot above -- confirm the reasoning
train_df["Embarked"] = train_df["Embarked"].fillna("C")

In [None]:
# Re-check: lists any rows that still have a missing Embarked value
train_df[train_df["Embarked"].isnull()]

In [None]:
# Show the rows whose Age value is missing
train_df[train_df["Age"].isnull()]

In [None]:
# Mean of the Age values that are present
age_known = train_df.loc[train_df["Age"].notnull(), "Age"]
np.mean(age_known)

In [None]:
# Fill every missing Age with the mean of the ages that are present
age_known = train_df.loc[train_df["Age"].notnull(), "Age"]
train_df["Age"] = train_df["Age"].fillna(np.mean(age_known))

In [None]:
# Re-check: lists any rows that still have a missing Age value
train_df[train_df["Age"].isnull()]

<a id = "11"></a><br>
# Visualization

<a id = "12"></a><br>
## Correlation Between Sibsp & Parch & Age & Fare & Survived
    
* As the heatmap shows, the most strongly correlated pair of features is SibSp & Parch (0.41).
* For Survived, the most correlated feature is Fare (0.26).

In [None]:
# Pairwise correlation of the selected columns, drawn as an annotated heatmap
list_1 = ["SibSp", "Parch", "Age", "Fare", "Survived"]
correlations = train_df[list_1].corr()
sns.heatmap(correlations, annot = True, fmt = ".2f")
plt.show()

<a id = "13"></a><br>
## Sibsp & Survived
Passengers with 2 or fewer siblings/spouses (SibSp) have a higher chance of survival.

In [None]:
# Bar chart of the mean of Survived for each SibSp value.
# sns.factorplot was renamed to sns.catplot (and its `size` argument to `height`)
# in seaborn 0.9; the old names are no longer available in current releases.
g = sns.catplot(x = "SibSp", y = "Survived", data = train_df, kind = "bar", height = 5)
g.set_ylabels("Survived Probability")
plt.show()

<a id = "14"></a><br>
## Pclass & Survived

In [None]:
# Bar chart of the mean of Survived for each Pclass value.
# sns.factorplot was renamed to sns.catplot (and its `size` argument to `height`)
# in seaborn 0.9; the old names are no longer available in current releases.
g = sns.catplot(x = "Pclass", y = "Survived", data = train_df, kind = "bar", height = 6)
g.set_ylabels("Survived Probability")
plt.show()

<a id = "15"></a><br>
## Age & Survived

In [None]:
# One panel per Survived value, each showing the Age distribution in 25 bins.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm it
# is still available in the target environment.
g = sns.FacetGrid(train_df, col = "Survived")
g.map(sns.distplot, "Age", bins = 25)
plt.show()

<a id = "16"></a><br>
## Survived & Age & Pclass

In [None]:
# Grid of Age histograms (25 bins): one row per Pclass, one column per Survived value.
# `height` replaces FacetGrid's old `size` argument, which was renamed in seaborn 0.9.
g = sns.FacetGrid(train_df, col = "Survived", row = "Pclass", height = 2)
g.map(plt.hist, "Age", bins = 25)
g.add_legend()
plt.show()

<a id = "17"></a><br>
## Embarked & Sex & Pclass & Survived


In [None]:
# One row per Embarked value: mean of Survived by Pclass, with one line per Sex.
# Fix the category orders up front: FacetGrid.map calls pointplot separately on each
# facet's subset, so without explicit order/hue_order the same class or sex can be
# drawn in a different position or colour from one panel to the next.
pclass_order = sorted(train_df["Pclass"].dropna().unique())
sex_order = sorted(train_df["Sex"].dropna().unique())

# `height` replaces FacetGrid's old `size` argument, which was renamed in seaborn 0.9.
g = sns.FacetGrid(train_df, row = "Embarked", height = 2)
g.map(sns.pointplot, "Pclass", "Survived", "Sex", order = pclass_order, hue_order = sex_order)
g.add_legend()
plt.show()

<a id = "18"></a><br>
## Embarked & Sex & Fare & Survived

In [None]:
# Grid of bar charts of mean Fare by Sex: one row per Embarked value, one column per
# Survived value.
# Fix the bar order up front: FacetGrid.map calls barplot separately on each facet's
# subset, so without an explicit order the sexes can swap places between panels.
sex_order = sorted(train_df["Sex"].dropna().unique())

# `height` replaces FacetGrid's old `size` argument, which was renamed in seaborn 0.9.
g = sns.FacetGrid(train_df, row = "Embarked", col = "Survived", height = 2.3)
g.map(sns.barplot, "Sex", "Fare", order = sex_order)
g.add_legend()
plt.show()