In [None]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Hi all! Here we will perform an extensive exploratory data analysis of the Titanic data. I will keep updating this notebook in the future; until then, enjoy the code!

In [None]:
# Both competition splits live in the same input directory.
titanic_dir = "/kaggle/input/titanic"
train = pd.read_csv(titanic_dir + "/train.csv")
test = pd.read_csv(titanic_dir + "/test.csv")

In [None]:
train.head()

In [None]:
train.isnull().sum().sort_values(ascending = False)

In [None]:
train.info()

### Exploring Survived column:

In [None]:
train.Survived.value_counts()

In [None]:
train.Survived.value_counts(normalize = True).mul(100).round(2).astype("str").add("%")

In [None]:
def uni_plot(feature,x=6,y=5,data=None):
    """Draw a bar chart of the percentage share of each value of ``feature``.

    Parameters
    ----------
    feature : str
        Column whose value distribution is plotted.
    x, y : float
        Figure width and height in inches.
    data : pandas.DataFrame, optional
        Frame to read ``feature`` from. Defaults to the notebook-level ``train``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the bars, so callers can tweak labels afterwards.

    Notes
    -----
    Calls ``sns.set``, so the global seaborn theme is changed as a side effect.
    """
    frame = train if data is None else data

    # A single call sets both the style and the font scale.
    sns.set(style="darkgrid", font_scale=1.5)

    # Percentage of rows per distinct value, ordered by the value itself.
    shares = frame[feature].value_counts(normalize=True).mul(100).round(2).sort_index()

    _, ax = plt.subplots(figsize=(x, y))
    shares.plot(ax=ax, kind="bar", color="skyblue")

    for p in ax.patches:
        # get_height() is not guaranteed to be a NumPy scalar, so convert with
        # float() instead of calling .astype(str) on it.
        height = float(p.get_height())
        ax.annotate(f"{height}%", (p.get_x() + p.get_width() / 2, 1 + height), ha="center")

    ax.set_ylim(0, 100)
    ax.set_xlabel(feature, fontsize=20)
    ax.set_ylabel("Percentage", fontsize=20)
    ax.set_title("{} distribution".format(feature), fontsize=20)
    # Label the ticks from the plotted series so labels always match the bars.
    ax.set_xticklabels(shares.index, rotation=0, fontsize=15)
    plt.setp(ax.get_yticklabels(), fontsize=15)

    return ax

In [None]:
# Replace the raw 0/1 tick labels with readable names and relabel the x axis.
g = uni_plot("Survived")
g.set(xticklabels=["Not Survived","Survived"], xlabel="Survival status");

### Approximately 61.62% of the passengers did not survive; only about 38.38% were lucky enough to survive.

### Let's have a look at Sex column:

In [None]:
train.Sex.value_counts(normalize=True).mul(100).round(2).astype("str").add("%")

In [None]:
g = uni_plot("Sex");

### Most of the passengers i.e., approximately 64.76% were males and 35.24% were females. 
### Let's see how the gender affected the survival:

In [None]:
def feat_survived(feature, data=None):
    """Return the survival rate, in percent, for each value of ``feature``.

    Parameters
    ----------
    feature : str or list of str
        Column(s) to group by; passed as the pivot table index.
    data : pandas.DataFrame, optional
        Frame containing ``feature`` and a ``Survived`` column. Defaults to the
        notebook-level ``train`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping column(s) and ``Survived`` holding
        the mean of ``Survived`` multiplied by 100 and rounded to 2 decimals.
    """
    frame = train if data is None else data
    return frame.pivot_table("Survived", index=feature).mul(100).round(2).reset_index()

In [None]:
feat_survived("Sex")

In [None]:
def surv_rate(feature,h=5,a=1):
    """Plot the survival percentage for each value of ``feature`` as a bar chart.

    Parameters
    ----------
    feature : str
        Column to group by; forwarded to ``feat_survived``.
    h : float
        Height of the facet, passed to ``sns.catplot`` as ``height``.
    a : float
        Aspect ratio of the facet, passed to ``sns.catplot`` as ``aspect``.

    Returns
    -------
    seaborn.FacetGrid
        The grid returned by ``sns.catplot``, with annotated bars.

    Notes
    -----
    Calls ``sns.set``, so the global seaborn theme is changed as a side effect.
    """
    sns.set(style="whitegrid", color_codes=True, font_scale=1.5)

    # Compute the per-group rates once and reuse them for the palette and the plot.
    rates = feat_survived(feature)

    # Reorder the "crest" palette by each bar's rank of survival rate.
    pal = sns.color_palette("crest", len(rates))
    rank = rates["Survived"].argsort().argsort().to_numpy()

    g = sns.catplot(data=rates, x=feature, y="Survived", kind="bar",
                    dodge=False, hue=feature, palette=np.array(pal)[rank], height=h, aspect=a)

    for p in g.ax.patches:
        # get_height() is not guaranteed to be a NumPy scalar, so convert with
        # float() instead of calling .astype(str) on it.
        height = float(p.get_height())
        g.ax.annotate(f"{height}%", (p.get_x() + p.get_width() / 2, 1 + height), ha="center")

    g.ax.set_ylim(0, 100)
    g.add_legend()
    g.set_xlabels(feature, fontsize=20)
    g.set_ylabels("Survival Percent", fontsize=20)
    plt.setp(g.ax.get_yticklabels(), fontsize=15)
    g.ax.set_title("Survival rate by {}".format(feature), fontsize=20)

    return g

In [None]:
feat_survived("Sex")

In [None]:
surv_rate("Sex");

### Here, we can observe that 74.2% of the females survived but only 18.89% of the males survived. 
### That means that approximately 3 of every 4 females survived but only 1 of every 5 males survived. 
### The higher survival rate among females is striking: even though there were more males than females on the ship, a much larger share of the females was saved.

### Now, let's explore the Pclass column: 

In [None]:
train.Pclass.value_counts(normalize=True)

In [None]:
uni_plot("Pclass");

### Most of the passengers i.e., approximately 55.11% were in the Pclass "3". This might be because the Pclass "3" had the cheapest fare. Let's have a look at the fare for these classes.

In [None]:
train.pivot_table("Fare",index="Pclass",aggfunc="mean")

### The average fare for the 3rd passenger class was the lowest, whereas the 1st passenger class had the highest fare. This may be why most people travelled in the 3rd passenger class: its tickets were cheaper.

In [None]:
sns.catplot(data=train,x="Pclass",y="Fare");

In [None]:
train.pivot_table("Survived",index="Pclass")

In [None]:
surv_rate("Pclass");

### The survival rate was highest in passenger class 1 having a survival rate of approximately 62.96% followed by passenger class 2 and 3. 

In [None]:
train.pivot_table("Survived",index=["Sex","Pclass"])

In [None]:
sns.catplot(data=train,x="Pclass",y="Fare",hue="Survived",col="Sex",);

### Approximately 97% of the females from passenger class 1 survived. Females from passenger class 1 and 2 were most likely to survive.

### Let's have a look at the Embarked column now:

In [None]:
train.Embarked.value_counts(normalize=True)

In [None]:
uni_plot("Embarked");

### Most of the passengers embarked the ship from Southampton followed by Cherbourg and Queenstown.

In [None]:
train.pivot_table("Survived",index="Embarked")

In [None]:
surv_rate("Embarked");

### Cherbourg port had the highest survival rate but it seems here that the port of embarkation does not significantly affect the survival of a passenger.

### Now, let's explore the Age column:

In [None]:
train["Age"].isnull().sum()

In [None]:
train.Age.describe()

In [None]:
sns.displot(train,x="Age",kde=True,stat="probability",alpha=0.8);

### The 20-30 age range was the most common among the passengers.

In [None]:
sns.displot(train,x="Age",kde=True,stat="probability",col="Survived",hue="Survived");

### Younger passengers were more likely to survive.

In [None]:
train_2= train.copy()

In [None]:
# Bucket ages into 10-year brackets with edges at 0, 10, ..., 80.
train_2["Age_bracket"] = pd.cut(train["Age"], bins=list(range(0, 81, 10)))

In [None]:
train_2.loc[:,["Age","Age_bracket"]].head()

In [None]:
train_2.head()

In [None]:
# Stacked KDE of Age with one layer per Age_bracket category, drawn with the "crest" palette.
g = sns.displot(data=train_2,x="Age",hue="Age_bracket",kind="kde",multiple="stack",
            alpha=1,palette="crest",height=6,aspect=2);
g.add_legend();

In [None]:
# Survival percentage per age bracket, highest first.
# Age_bracket is categorical (built with pd.cut), so `observed` is passed
# explicitly instead of relying on the implicit default.
Age_Surv_df = (
    train_2
    .pivot_table("Survived", index="Age_bracket", observed=False)
    .mul(100)
    .round(2)
    .sort_values(by="Survived", ascending=False)
    .reset_index()
)

In [None]:
Age_Surv_df

In [None]:
# Bar chart of survival percentage per age bracket.
sns.set(style="whitegrid",font_scale = 1.2)

# Reorder the "crest" palette by each bar's rank of survival rate.
pal = sns.color_palette("crest", len(Age_Surv_df))
rank = Age_Surv_df["Survived"].argsort().argsort().to_numpy()

g = sns.catplot(data=Age_Surv_df,x="Age_bracket", y="Survived",kind = "bar",
                height=8,aspect=1,hue ="Age_bracket",dodge=False,palette=np.array(pal)[rank])
for p in g.ax.patches:
    # get_height() is not guaranteed to be a NumPy scalar, so convert with
    # float() instead of calling .astype(str) on it.
    height = float(p.get_height())
    g.ax.annotate(f"{height}%",(p.get_x()+p.get_width()/2,1 + height),ha = "center" )

g.ax.set_ylim(0,100)
g.add_legend()
g.set_xlabels("Age_bracket",fontsize=20)
g.set_ylabels("Survival Percent",fontsize=20)
plt.setp(g.ax.get_yticklabels(),fontsize=15)
g.ax.set_title("Survival rate by {}".format("Age_bracket"),fontsize=20);

### Children aged 0-10 were most likely to be saved and older people aged 60-80 were the least likely to survive.

### Let's explore the SibSp column:

In [None]:
train.SibSp.value_counts(normalize=True)

In [None]:
uni_plot("SibSp",9);

### Most of the passengers had no siblings or spouses aboard.

In [None]:
g=surv_rate("SibSp",6.5,1)

### Passengers with 1 sibling or spouse aboard were most likely to survive. Except for passengers with none, the survival rate decreases as the number of siblings/spouses increases: the more siblings or spouses aboard, the lower the chance of survival.

### Let's have a look at the Parch column:

In [None]:
train.Parch.value_counts(normalize=True)

In [None]:
uni_plot("Parch",9);

### Most (approximately 76.09%) of the passengers were without parents or children on the ship.

In [None]:
surv_rate("Parch",7);

### Passengers with fewer parents or children aboard were more likely to survive (except for passengers travelling without any).