# importing modules
### let's import some modules to get started!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px 

# Reading the data
### let's start by taking titanic dataset as .csv file

In [None]:
# Load the Titanic training set into a DataFrame.
# NOTE(review): the relative ../input/... path looks like a Kaggle dataset mount — confirm
# it resolves in the environment where this notebook is run.
train_df=pd.read_csv('../input/titanicdataset-traincsv/train.csv')
# Bare last expression: the notebook renders the loaded frame as the cell output.
train_df

In [None]:
# Preview the first rows of the frame (head() with its default row count).
train_df.head()

# description of data

In [None]:
# Print the column names, dtypes and non-null counts.
train_df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by describe().
train_df.describe()

# Exploratory Data Analysis
### let's find out the missing values and fill them with appropriate values

In [None]:
# Count the missing (NaN/None) entries in every column.
train_df.isna().sum()

### Only 3 columns have null/NaN values in the dataset; of these, around 70% of the Cabin entries and around 20% of the Age entries are missing


## Finding missing values

In [None]:
# Boolean frame of the same shape as train_df: True wherever a value is missing.
train_df.isna()

# representing null/NaN values using seaborn plotting techniques
### representing using heatmap()

In [None]:
# Heatmap of the missing-value mask: each cell is coloured by whether the entry is null.
missing_mask = train_df.isna()
sns.heatmap(missing_mask)

## looking at the heatmap we can conclude that majority of the cabin values are missing and some values of age are missing

# No. of males and females travelled

In [None]:
# Passenger head-count per gender.
sex_counts = train_df['Sex'].value_counts()
print("no. of males in the titanic:", sex_counts['male'])
print("no. of females in the titanic:", sex_counts['female'])

# Counts of each Survived value per gender. The rows are filtered by gender first and
# only then counted: indexing the two-row value_counts() result with a per-passenger
# boolean mask cannot be aligned and raises. Each result is printed explicitly because
# a notebook cell only displays its last bare expression.
for gender in ('male', 'female'):
    print("Survived value counts for", gender, "passengers:")
    print(train_df.loc[train_df['Sex'] == gender, 'Survived'].value_counts())

## we can also visualise the survival rate in gender category using count plot

In [None]:
# Two side-by-side bar charts: passengers per Sex (left) and per Survived value (right).
fig, (sex_ax, survived_ax) = plt.subplots(1, 2)
sns.countplot(data=train_df, x='Sex', ax=sex_ax)
sns.countplot(data=train_df, x='Survived', ax=survived_ax)

## If we look at the above countplot(), the number of people who survived and the total number of people who travelled are comparable
## so we might expect that an equal proportion of males and females were killed

In [None]:
# Survived counts, with one bar per Sex inside each Survived group.
sns.countplot(
    data=train_df,
    x='Survived',
    hue='Sex',
    palette='rainbow',
)

### But this graph shows that the majority of those killed in the disaster were males: the survival rate of females is higher, while the death rate of males is higher than that of females

In [None]:
# Survived counts, with one bar per Pclass inside each Survived group.
sns.countplot(
    data=train_df,
    x='Survived',
    hue='Pclass',
    palette='rainbow',
)

## Passengers in the better passenger classes were more likely to survive, while most passengers in the lower class died

In [None]:
# Bucket the fares into bands of 100 for this plot only. The band lives in its own Series
# instead of being written back with `train_df['Fare'] //= 100`, which destroyed the raw
# fares for every later cell and shrank them again each time the cell was re-run.
# The Series keeps the name 'Fare' so the legend title is unchanged.
fare_band = (train_df['Fare'] // 100).rename('Fare')
sns.countplot(data=train_df, x='Survived', hue=fare_band, palette='rainbow')

## From the above plot we can also conclude that people who paid higher fares managed to survive 

# Now let's plot the distribution of age using distplot(), since age and cabin contribute most of the missing values

In [None]:
# Switch seaborn to the 'whitegrid' style; set_style changes the global style settings.
sns.set_style('whitegrid')
# Histogram of the Age column in 20 bins, drawn in green, without a density curve (kde=False).
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot — consider
# migrating once the minimum supported seaborn version is confirmed.
sns.distplot(train_df['Age'],kde=False,bins=20,color='g')

## From the missing-value counts above we learnt that around 20% of the age values are missing. From the distribution, the largest groups of passengers are 15 to 35 years old

# Number of survivors of the titanic accident

In [None]:
# Number of rows whose Survived value is 1, looked up by label in the value counts.
survived_counts = train_df['Survived'].value_counts()
survived_counts.loc[1]

# Survived people vs Siblings/Spouses aboard the Titanic

In [None]:
# Mean of the Survived column for each SibSp value, listed from the largest SibSp down.
(
    train_df
    .groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values('SibSp', ascending=False)
)

In [None]:
# Survived counts, with one bar per SibSp value inside each Survived group.
sns.countplot(data=train_df,x='Survived',hue='SibSp',palette='rainbow')
# Draw the legend again through pyplot on the current axes.
plt.legend()

### From the above observation, passengers with 1 sibling/spouse aboard account for a large share of both the survivors and the deaths

In [None]:
# Three views of SibSp against Survived, one panel per plot type.
x=train_df['SibSp']
y=train_df['Survived']

# Ask for the 1x3 grid up front so every panel has its own Axes object. Previously a single
# axes came back from plt.subplots(), received all three set_title calls, and was then
# drawn over by the panels created with plt.subplot, which never got a title.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('SibSp vs Survived')
scatter_ax, line_ax, bar_ax = axes

scatter_ax.scatter(x, y, marker='*', color='r', linewidth=5, s=25, edgecolor='g')
scatter_ax.set_title('using scatterplot')

line_ax.plot(x, y, 'g*', linestyle='dashdot', linewidth=2, markersize=10)
line_ax.set_title('using plot ')

bar_ax.bar(x, y, align='center', color='black')
bar_ax.set_title('using bar')

# Label every panel (the scatter panel used to be left without axis labels).
for panel in axes:
    panel.set_xlabel('SibSp')
    panel.set_ylabel('Survived')

# Keep titles and labels of neighbouring panels from overlapping.
fig.tight_layout()


**Survived based on their gender**

In [None]:
# Mean of the Survived column for each Sex value, sorted by Sex in descending order.
(
    train_df
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values('Sex', ascending=False)
)

# cleansing of the data

In [None]:
# Box plot of Age for every SibSp value.
sns.boxplot(x='SibSp',y='Age',data=train_df)

### From the above observation we can map an estimated age to the null values based on SibSp

In [None]:
def fill_age(cols, default_age=28):
    """Return the age to store for one passenger row, estimating it when missing.

    Parameters
    ----------
    cols : pandas.Series or mapping
        One row exposing the labels ``'Age'`` and ``'SibSp'``. The two values are
        read by label, so the order of the columns selected by the caller does not
        matter (positional unpacking previously swapped them when the caller passed
        ``["Age", "SibSp"]``).
    default_age : number, optional
        Estimate used when the age is missing and ``SibSp`` has no entry in the
        lookup table below.
        NOTE(review): 28 is a placeholder in the range of the other estimates —
        confirm against the data. The previous fallback returned a whole DataFrame.

    Returns
    -------
    number
        The original age when it is present, otherwise the estimate for the row's
        ``SibSp`` value (or ``default_age`` when there is none).
    """
    # Estimated age for each SibSp value that has one.
    estimated_age_by_sibsp = {0: 29, 1: 30, 2: 23, 3: 10, 4: 7, 5: 11}

    age = cols['Age']
    if not pd.isnull(age):
        return age
    return estimated_age_by_sibsp.get(cols['SibSp'], default_age)
    

In [None]:
# Hand fill_age the sibling/spouse count first and the age second. With the columns passed
# as (Age, SibSp), a function that unpacks the row positionally as (SibSp, Age) tests the
# wrong value for nulls and overwrites Age with the sibling counts.
train_df['Age']=train_df[["SibSp","Age"]].apply(fill_age,axis=1)

In [None]:
# Redraw the missing-value heatmap now that Age has been filled.
missing_mask = train_df.isna()
sns.heatmap(missing_mask)

In [None]:
# Per-column count of the entries that are still missing.
train_df.isna().sum()

# From the above heatmap the missing values appear to be resolved, but isnull().sum() shows that there are still some missing values in the Embarked column
## so we will fill the missing Embarked values with a backward or forward fill

In [None]:
# NOTE(review): fillna's first argument is the replacement value, so this writes the
# literal string 'bfill' into the missing Embarked cells rather than back-filling from
# the following row. The 'bfill' dummy column dropped from `train` further down is
# produced by that placeholder; switching to a real back-fill (Series.bfill()) would
# also require removing that later drop.
train_df['Embarked'].fillna('bfill',inplace=True)

In [None]:
# Re-check the per-column missing counts after filling Embarked.
train_df.isna().sum()

# so we now resolved all the missing values in the dataset

# we can use the data provided efficiently only if the data is categorical format

In [None]:
# Column dtypes and non-null counts after the fills above.
train_df.info()

In [None]:
# All columns except Name, Sex, Embarked and Ticket can be used as they are.
# The object-dtype columns are either converted to categorical (dummy) columns where
# possible, or dropped when they are not useful.
# Preview only: the encoded frame returned by get_dummies is displayed, not stored.
pd.get_dummies(train_df)

In [None]:
# Work on a copy of train_df from here on, so the column drops below do not touch train_df.
train_copy=train_df.copy()
# Display the copy as the cell output.
train_copy

In [None]:
# Drop the Name and Ticket columns because they can't be converted into valid categorical columns.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to
# re-run: a second in-place drop raised KeyError once the columns were already gone.
train_copy = train_copy.drop(columns=['Name', 'Ticket'], errors='ignore')

In [None]:
# Display train_copy to check the result of the drop.
train_copy

In [None]:
# Convert Sex and Embarked to indicator (dummy) columns with get_dummies().
# drop_first=True leaves out the first level of each column, so k levels become k-1 columns.
Sex_category=pd.get_dummies(train_copy['Sex'],drop_first=True)
Embarked_category=pd.get_dummies(train_copy['Embarked'],drop_first=True)

In [None]:
# Drop the original Sex and Embarked columns now that their dummy columns exist.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell safe to
# re-run: a second in-place drop raised KeyError once the columns were already gone.
train_copy = train_copy.drop(columns=['Sex', 'Embarked'], errors='ignore')

In [None]:
# Append the Sex_category and Embarked_category dummy columns to train_copy.
# axis=1 concatenates column-wise; the result is stored as the new frame `train`.
train=pd.concat([train_copy,Sex_category,Embarked_category],axis=1)

In [None]:
# Preview the first rows of the combined frame.
train.head()

In [None]:
# Remove the 'bfill' dummy column created from the 'bfill' placeholder value in Embarked.
# errors='ignore' plus reassignment makes the cell re-runnable: the in-place drop raised
# KeyError on a second run because the column was already gone.
train = train.drop(columns=['bfill'], errors='ignore')

In [None]:
# Column dtypes and non-null counts of the prepared frame.
train.info()

# Visualise using some seaborn plotting techniques

In [None]:
# Rug plot of the boolean is-null mask of Age (one tick per row, at the mask value).
# NOTE(review): presumably ticks only at False/0 are read as "no missing ages" — confirm
# that rugplot accepts a boolean Series in the seaborn version in use.
sns.rugplot(train['Age'].isnull())

## the above figure depicts that there are no missing values present in Age column

In [None]:
# Joint KDE plot of Survived (x) against Pclass (y), columns looked up by name in `train`.
sns.jointplot(
    x='Survived',
    y='Pclass',
    data=train,
    kind='kde',
)

In [None]:
# Grid of pairwise plots across the columns of `train`.
sns.pairplot(train)

In [None]:
# Distribution plot (10 bins, with KDE) fed with the two columns Survived and Pclass at once.
# NOTE(review): distplot is documented for a single 1-D variable and is deprecated in
# seaborn — verify this call renders as intended, or plot each column separately.
sns.distplot(train[['Survived','Pclass']],kde=True,bins=10)

In [None]:
# Joint KDE plot of the 'male' indicator column (x) against Pclass (y).
sns.jointplot(
    data=train,
    x='male',
    y='Pclass',
    kind='kde',
)

# correlation of the train data

In [None]:
# Correlate only the numeric and boolean columns. `train` can still carry text columns
# (Cabin is never dropped or encoded above), and DataFrame.corr() raises on those in
# pandas 2.x instead of silently skipping them as older versions did.
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr())

## visualize through categorical plottings

In [None]:
# Both calls draw Pclass per 'male' value; no ax is passed, so the second call
# (boxen plot, green) draws on the same current axes as the first (box plot, black).
sns.boxplot(x='male',y='Pclass',data=train,color='k')
sns.boxenplot(x='male',y='Pclass',data=train,color='g')

In [None]:
# Swarm (black), violin (green) and strip (red) plots of Pclass per 'male' value.
# No ax is passed, so all three calls draw on the same current axes.
sns.swarmplot(x='male',y='Pclass',data=train,color='k')
sns.violinplot(x='male',y='Pclass',data=train,color='g')
sns.stripplot(x='male',y='Pclass',data=train,color='r')

# Using all the categorical plotting in a single figure

In [None]:
# One panel per categorical plot type, all inside a single figure. Drawing all seven
# plots onto the same axes hid the earlier ones behind the later ones, and the final
# count plot (SibSp on the y axis, counts on x) does not share the Survived/SibSp axes
# used by the other six.
categorical_panels = [
    ('stripplot', sns.stripplot, dict(x='Survived', y='SibSp', color='b')),
    ('swarmplot', sns.swarmplot, dict(x='Survived', y='SibSp', color='k')),
    ('violinplot', sns.violinplot, dict(x='Survived', y='SibSp', palette='rainbow')),
    ('boxenplot', sns.boxenplot, dict(x='Survived', y='SibSp', color='m')),
    ('barplot', sns.barplot, dict(x='Survived', y='SibSp', color='y')),
    ('boxplot', sns.boxplot, dict(x='Survived', y='SibSp', palette='dark')),
    ('countplot', sns.countplot, dict(y='SibSp', color='red')),
]

fig, axes = plt.subplots(2, 4, figsize=(20, 9))
panel_axes = axes.ravel()

for panel_ax, (panel_title, plot_fn, plot_kwargs) in zip(panel_axes, categorical_panels):
    plot_fn(data=train, ax=panel_ax, **plot_kwargs)
    panel_ax.set_title(panel_title)

# The 2x4 grid has one more slot than there are plots; hide the unused axes.
for spare_ax in panel_axes[len(categorical_panels):]:
    spare_ax.set_visible(False)

fig.tight_layout()

In [None]:
# factorplot was renamed to catplot and is no longer available in current seaborn.
# kind='point' keeps the point plot that factorplot drew by default, because catplot's
# own default is a strip plot.
sns.catplot(x='Pclass', y='SibSp', data=train, kind='point')

## after data analysis we got the data that can be used for machine learning algorithms

In [None]:
# Show the first 10 rows of the prepared frame.
train.head(10)