In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Titanic - Machine Learning from Disaster

## Business Problem:
- To use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

### Column Profiling:

**pclass:** Ticket class. A proxy for socio-economic status (SES)
- 1st = Upper
- 2nd = Middle
- 3rd = Lower

**sibsp:** Number of siblings / spouses aboard the Titanic. The dataset defines family relations in this way...
- Sibling = brother, sister, stepbrother, stepsister
- Spouse = husband, wife (mistresses and fiancés were ignored)

**parch:** Number of parents / children aboard the Titanic. The dataset defines family relations in this way...
- Parent = mother, father
- Child = daughter, son, stepdaughter, stepson
- Some children travelled only with a nanny, therefore parch=0 for them.

**survival**
- 0 = No, 1 = Yes

**embarked** : Port of Embarkation
- C = Cherbourg 
- Q = Queenstown,
- S = Southampton

**age**: 
- Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

**sex**:
- Passenger Gender 

**ticket** :
- Ticket number

**fare** : 
- Passenger fare

**cabin** : 
- Cabin number


#### Importing required packages:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
# NumPy 2.0 removed the `NaN` and `NAN` aliases, so importing them directly
# fails on current releases. Import the canonical `nan` and re-create the
# legacy names so any cell that refers to them keeps working.
from numpy import nan
NaN = NAN = nan
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

#### Loading train data into Dataframe:

In [None]:
titanic_df = pd.read_csv('../input/titanic/train.csv')
titanic_df

#### Identification of variables and data types:

In [None]:
titanic_df.shape

In [None]:
# Getting a list of columns available in the titanic dataset 
titanic_df.columns

In [None]:
# Take the column list from the frame itself so the report cannot drift out of
# sync with the data (the previous hard-coded list duplicated titanic_df.columns).
cols = list(titanic_df.columns)
for col in cols:
    print(f" Unique value count in {col} is {titanic_df[col].nunique()}")

In [None]:
titanic_df.info()

#### Analysing the basic metrics:

In [None]:
titanic_df.describe()

In [None]:
# Missing values

In [None]:
def missingValue(df):
    """Summarise the missing values of every column in ``df``.

    Prints the total number of rows, then returns a DataFrame indexed by
    column name with two columns:

    * ``Total Missing`` - number of null entries in the column
    * ``In Percent``    - the same figure as a percentage of all rows,
      rounded to two decimals

    Both series are sorted from most to least missing before being combined.
    """
    row_count = df.shape[0]
    null_mask = df.isnull()

    total_null = null_mask.sum().sort_values(ascending=False)
    percent = (null_mask.sum() / len(df) * 100).sort_values(ascending=False)

    print(f"Total records in our data =  {row_count} where missing values are as follows:")

    return pd.concat(
        [total_null, percent.round(2)],
        axis=1,
        keys=['Total Missing', 'In Percent'],
    )

In [None]:
missingValue(titanic_df)

In [None]:
print(f"Columns with category datatypes (Categorical Features) are : \
{list(titanic_df.select_dtypes('object').columns)}")
print(f"Columns with integer and float datatypes (Numerical Features) are: \
{list(titanic_df.select_dtypes(['int64','float64']).columns)}")

In [None]:
actual_numerical_cols = ['Age', 'Fare']

In [None]:
actual_categorical_cols = ['Survived', 'Pclass', 'SibSp','Parch','Sex', 'Embarked']

In [None]:
titanic_df['Cabin'] # Too many missing values, hence we can ignore/drop.

# Univariate Analysis:

In [None]:
def numerical_feat(df,colname,nrows=2,mcols=2,width=20,height=5):
    """Plot a boxplot and a distribution plot for each numerical column.

    One grid row is used per entry of ``colname``: grid column 0 receives a
    boxplot of the values, grid column 1 a density histogram with a KDE curve
    and vertical markers for the mean, median and (first) mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    colname : iterable of str
        Column names to plot; must not contain more than ``nrows`` entries.
    nrows, mcols : int
        Grid shape handed to ``plt.subplots``. Only grid columns 0 and 1 are
        drawn on, so ``mcols`` has to be at least 2.
    width, height : number
        Figure size in inches.
    """
    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # ax[row][col] indexing below also works when only one column is plotted.
    fig, ax = plt.subplots(nrows, mcols, figsize=(width, height), squeeze=False)
    fig.set_facecolor("lightgrey")

    for row, var in enumerate(colname):
        box_ax, dist_ax = ax[row][0], ax[row][1]
        values = df[var]

        sns.boxplot(y=values, color='green', ax=box_ax)
        box_ax.set_title("Boxplot for Outlier Detection ", fontweight="bold")
        # Label the boxplot's own axes; plt.ylabel() would label whichever
        # axes happens to be "current" (the last one created).
        box_ax.set_ylabel(var, fontsize=12)

        # histplot(kde=True, stat="density") is the maintained replacement
        # for the deprecated sns.distplot.
        sns.histplot(values, kde=True, stat="density", color='green', ax=dist_ax)
        dist_ax.axvline(values.mean(), color='r', linestyle='--', label="Mean")
        dist_ax.axvline(values.median(), color='m', linestyle='-', label="Median")
        modes = values.mode()
        if not modes.empty:  # an all-NaN column has no mode to draw
            dist_ax.axvline(modes.iloc[0], color='royalblue', linestyle='-', label="Mode")
        dist_ax.set_title("Outlier Detection ", fontweight="bold")
        # Build the legend from the labels attached to the lines above.
        # Passing a dict to legend() assigned its keys to the first artists
        # on the axes, which are not necessarily the three marker lines.
        dist_ax.legend()

    plt.show()

In [None]:
actual_numerical_cols = ['Age', 'Fare']

In [None]:
numerical_feat(titanic_df,actual_numerical_cols,len(actual_numerical_cols),2,14,10)

In [None]:
# Frequency of each feature in percentage.
def categorical_features(df, colnames, nrows=2,mcols=2,width=20,height=30, sortbyindex=False):
    """Plot the relative frequency of every category as a bar and a pie chart.

    One grid row is used per entry of ``colnames``: grid column 0 receives a
    bar chart of the category shares (in percent), grid column 1 a pie chart
    of the same shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    colnames : iterable of str
        Column names to plot; must not contain more than ``nrows`` entries.
    nrows, mcols : int
        Grid shape handed to ``plt.subplots``. Only grid columns 0 and 1 are
        drawn on, so ``mcols`` has to be at least 2.
    width, height : number
        Figure size in inches.
    sortbyindex : bool
        When True, categories are ordered by their label instead of by
        descending frequency.
    """
    # squeeze=False keeps `ax` two-dimensional even for a single row, so the
    # ax[row][col] indexing below also works when only one column is plotted.
    fig, ax = plt.subplots(nrows, mcols, figsize=(width, height), squeeze=False)
    fig.set_facecolor(color='lightgrey')
    palette = sns.color_palette("crest")

    for row, colname in enumerate(colnames):
        bar_ax, pie_ax = ax[row][0], ax[row][1]

        count = df[colname].value_counts(normalize=True) * 100
        if sortbyindex:
            count = count.sort_index()

        count.plot.bar(color=palette, ax=bar_ax)
        bar_ax.set_ylabel("Frequency of " + colname + ' in (%)', fontsize=14)
        bar_ax.set_xlabel(colname, fontsize=14)
        bar_ax.set_title("Frequency wise " + colname, fontweight="bold")

        count.plot.pie(colors=palette, autopct='%0.0f%%',
                       textprops={'fontsize': 14}, shadow=True, ax=pie_ax)

    plt.show()

In [None]:
actual_categorical_cols = ['Survived', 'Pclass', 'SibSp','Parch','Sex', 'Embarked']

In [None]:
categorical_features(titanic_df,actual_categorical_cols,len(actual_categorical_cols),2,14,30)

### Splitting into train and validation sets

In [None]:
# Shuffling the dataset and then splitting it into train and validation sets

train_df, validation_df = train_test_split(titanic_df, train_size = 0.8, random_state = 100)

80% of the data is used for training and the remaining 20% is used for validation.
Holding out a validation set lets us check how well the tuned model generalises to unseen data.

In [None]:
# Compare the class balance of the target in the train and validation splits.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,5))
for ax, split_df, title in (
    (ax1, train_df, "Distribution of survivors in Train"),
    (ax2, validation_df, "Distribution of survivors in Validation"),
):
    # Use the value_counts() Series directly: its index holds the class labels
    # and its values the percentages. Going through reset_index() breaks on
    # pandas >= 2.0, where the resulting columns are named "Survived"/"count"
    # instead of "index"/"Survived".
    survival_stats = split_df["Survived"].value_counts() * 100 / split_df.shape[0]
    ax.pie(survival_stats.values, labels=survival_stats.index, autopct="%1.2f%%",
           colors=sns.color_palette("crest"), textprops={'fontsize': 14}, shadow=True)
    ax.title.set_text(title)
plt.show()

The percentage of survivors in the validation table is slightly higher compared to the train data set

In [None]:
### Creating Stacked Bar Chart - Helps us understand the probability of survival given a condition

def create_StackedBar(df,col1, perc = True):
    """Draw a stacked bar chart of survival outcomes for each value of ``col1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col1`` and a ``Survived`` column coded 0 / 1.
    col1 : str
        Column whose distinct values become the bars.
    perc : bool
        When True (default) each bar shows the share of class 0 and class 1
        within the group (bars sum to 1); when False the raw row counts are
        stacked instead.

    Rows are counted with ``groupby(...).size()``, so the frame does not need
    a ``PassengerId`` column. Rows whose ``col1`` or ``Survived`` value is
    null are left out by the groupby.
    """
    counts = (
        df.groupby([col1, "Survived"])
        .size()
        # A group with no survivors (or no casualties) must contribute 0, not
        # NaN: a NaN would make the group's total NaN and its bar disappear.
        .unstack("Survived", fill_value=0)
        # Guarantee both outcome columns exist even if one class is absent.
        .reindex(columns=[0, 1], fill_value=0)
    )

    if perc:
        counts = counts.div(counts.sum(axis=1), axis=0)

    bar_ax = counts.plot(kind="bar", stacked=True)
    bar_ax.set_ylabel("Share of passengers" if perc else "Number of passengers")
    bar_ax.set_title(f"Survival by {col1}")
    plt.show()

In [None]:
# Report missing values first: create_StackedBar silently leaves out null groups.
print("Total null values in Pclass:", train_df["Pclass"].isnull().sum())
create_StackedBar(train_df,"Pclass")

#### Analysing Survival based on Pclass

- From the plot above it is evident that the passengers' chance of survival follows the order Class1 > Class2 > Class3
- Chances of survival
- Class1 ~60%
- Class2 ~45%
- Class3 ~25%

In [None]:
# Report missing values first: create_StackedBar silently leaves out null groups.
print("Total null values in Sex:", train_df["Sex"].isnull().sum())

create_StackedBar(train_df, 'Sex')

#### Analysis of survival based on Passenger Sex
- Females travelling on the Titanic had a higher chance of survival than the male passengers
- Almost 75% of all female passengers survived, compared to only 20% of the male passengers

In [None]:
# Split Passengers into age buckets of 5 
print("Total Rows in data", train_df.shape[0])
print("Total Null values in Age", train_df["Age"].isnull().sum())

train_df.loc[:,"Age_bucket"] = ((train_df["Age"]//5)+1).fillna(20)
create_StackedBar(train_df,"Age_bucket")

#### Analysis of survival based on Age buckets
- We have around 20% missing data in the age column
    - the missing ages can be treated as a separate group
- Children between 0-5 years had the highest survival percentage
- Adults aged 60-70 have a low chance of survival

In [None]:
# Each message names the column it actually reports on (the second one used to
# say "Sibling or Spouse" while counting nulls in Parch).
print("# of nulls in SibSp (siblings / spouses)", train_df["SibSp"].isnull().sum())
print("# of nulls in Parch (parents / children)", train_df["Parch"].isnull().sum())

# Distribution of SibSp
create_StackedBar(train_df,"SibSp")

# Distribution of Parch
create_StackedBar(train_df,"Parch")

#### Analysing the odds of survival if you had a sib/Spo or Par/Chi on the trip with you
- People travelling alone had a low chance of survival
- People who were travelling as big families (5 children) had a lower chance of survival

In [None]:
#train_df["Ticket"] # Not used currently

ranges = [0,10,15,25,50,100,1000]
labels = [1,2,3,4,5,6]

train_df["Fare_buckets"] = pd.cut(train_df["Fare"],bins=ranges,labels=labels).astype(float).fillna(0)
# Most Fares re below 50 
# Checking to see if the Fare has any relationship with survival 

create_StackedBar(train_df, "Fare_buckets")

#### Ticket Fare
- Anyone who paid a fare above 100 is placed in a single bin


- we see a clear relationship between amount paid and survival rate
- Passengers who have paid higher fare had a better chance of surviving

In [None]:
# The value printed is a fraction of rows (0-1), not a count, so label it as such.
print("Fraction of NaNs in Cabin", train_df["Cabin"].isnull().sum()/ train_df.shape[0])

78 % of Cabin info is NA - not including this feature for modelling

In [None]:
# The value printed is a fraction of rows (0-1), not a count.
print("Fraction of NaNs in Embarked", train_df["Embarked"].isnull().sum()/ train_df.shape[0])

# Most frequent port in the training split; kept as a global because
# feature_engineered_columns() reuses it to impute the other splits.
embarked_mode = train_df["Embarked"].mode()[0]
train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)
create_StackedBar(train_df,"Embarked")

# EDA Observations and Inferences:

- Based on the dataset, only about 40% of the passengers survived the disaster.
- Children were the group most likely to be rescued.
- People above age 20 had roughly a 35% chance of surviving the disaster.
- Passengers who paid a high fare or travelled in a higher class (i.e. VIPs) were given priority during the rescue.
- Comparing males and females, almost 75% of the females survived the disaster.
- People travelling alone or with a smaller family (up to 2 children) had high chances of survival.
- Families with fewer than 5 members had a higher chance of survival.
- People travelling alone had approximately a 43% chance of survival.
- Families with 5 or more members had a lower chance of complete survival.
- Adults aged 60-70 had a low chance of survival.
- The passengers' chance of survival followed the order Class1 > Class2 > Class3.

Passengers who embarked at station S have a lower chance of survival

### Feature Selection and Engineering
The columns we have decided to use based on the Data Analysis are

- Pclass - can just be used as is
- Sex - can just be used as is
- Age Buckets - need to be transformed (binning values and imputing missing values)
- SibSp - can just be used as is
- Parch - can just be used as is
- Fare Buckets - need to be transformed (binning values and imputing missing values)
- Embarked - needs to be transformed (imputing missing values and label encoding)

#### All the categorical columns need to be converted to numeric
- There are 2 Categorical columns in the dataset Sex and Embarked - both can be converted using a label encoder

In [None]:
# Fit one LabelEncoder per categorical feature on the training split only, so
# the validation and test data are encoded with exactly the same mapping.
label_enc_dict = {}
for col in ["Sex","Embarked"]:
    # Named `encoder` rather than `lr`, which the notebook later uses for the
    # LogisticRegression model.
    encoder = LabelEncoder()
    # Fit on the non-null labels only, so the result does not depend on an
    # earlier cell having already filled the missing Embarked values.
    encoder.fit(train_df[col].dropna().values)
    label_enc_dict[col] = encoder

In [None]:
def feature_engineered_columns(df, encoders=None, embarked_fill=None):
    """Impute missing values, bucket continuous columns and encode categoricals.

    The input frame is left untouched; a transformed copy is returned.

    Steps
    -----
    * ``Age_bucket``   - ``Age // 5 + 1``; missing ages get the bucket 20.
    * ``Fare_buckets`` - ``Fare`` cut into the right-closed bins
      (0,10], (10,15], (15,25], (25,50], (50,100], (100,1000] labelled 1-6;
      anything outside those bins (missing fares, a fare of 0) gets bucket 0.
    * ``Embarked``     - missing values replaced by ``embarked_fill``.
    * ``Age``, ``Cabin``, ``Fare``, ``Ticket`` and ``Name`` are dropped.
    * ``Sex`` and ``Embarked`` are replaced by ``encoders[col].transform(...)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data containing the columns listed above.
    encoders : mapping, optional
        Maps ``"Sex"`` and ``"Embarked"`` to fitted objects with a
        ``transform`` method. Defaults to the notebook-level ``label_enc_dict``.
    embarked_fill : optional
        Replacement for missing ``Embarked`` values. Defaults to the
        notebook-level ``embarked_mode``.

    Returns
    -------
    pandas.DataFrame
        The transformed copy. The total number of remaining nulls is printed.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if encoders is None:
        encoders = label_enc_dict
    if embarked_fill is None:
        embarked_fill = embarked_mode

    # Work on a copy: the caller's frame (often a slice produced by
    # train_test_split) must not gain columns or have Embarked rewritten.
    df = df.copy()

    # Age column
    df["Age_bucket"] = ((df["Age"] // 5) + 1).fillna(20)

    # Ticket fare column
    ranges = [0,10,15,25,50,100,1000]
    labels = [1,2,3,4,5,6]
    df["Fare_buckets"] = pd.cut(df["Fare"], bins=ranges, labels=labels).astype(float).fillna(0)

    # Filling NA values for Embarked
    df["Embarked"] = df["Embarked"].fillna(embarked_fill)
    df = df.drop(["Age","Cabin","Fare","Ticket","Name"], axis=1)

    # Converting categorical columns to numeric
    for col in ["Sex","Embarked"]:
        df[col] = encoders[col].transform(df[col])

    print("Total Number of NAs in the data", df.isnull().sum().sum())
    return df
 

In [None]:
train_df_transformed = feature_engineered_columns(train_df)
validation_df_transformed = feature_engineered_columns(validation_df)

In [None]:
# Correlate the numeric columns only: calling corr() on a frame that still
# holds text columns (Name, Sex, Ticket, ...) raises on pandas >= 2.0.
# The figure is sized up front so the heatmap is laid out at its final size.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(titanic_df.select_dtypes(include="number").corr(),
            annot=True, cmap='RdYlGn', linewidths=0.2, ax=ax)
plt.show()

Having two highly or perfectly correlated features in the training data causes multicollinearity, so it is better to remove one of them.

#### Modelling
- We fit a basic Logistic Regression model and a Naive Bayes model and compare their performance on the validation dataset

# Logistic Regression:

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict ## Cross Validation

In [None]:
X_train = train_df_transformed.drop(columns=["Survived",'PassengerId']).values
y_train = train_df_transformed["Survived"].values

In [None]:
X_validation = validation_df_transformed.drop(columns=["Survived",'PassengerId']).values
y_validation = validation_df_transformed["Survived"].values

In [None]:
# Declaring the model and training it

lr = LogisticRegression()
lr.fit(X_train, y_train)

### Testing train and validation performance



In [None]:
pred = lr.predict(X_train)
pred_val = lr.predict(X_validation)

# classification_report expects (y_true, y_pred). With the arguments swapped
# the per-class precision and recall figures are exchanged.
print(f"""Training Data Accuracy other metrics 
{classification_report(y_train, pred)}
""")

print(f"""Validation Data Accuracy other metrics 
{classification_report(y_validation, pred_val)}
""")

In [None]:

from sklearn.linear_model import LogisticRegression
model =  LogisticRegression()
model.fit(X_train,y_train)
prediction_lr=model.predict(X_validation)

print('--------Logistic Regression -------')
# sklearn metrics take (y_true, y_pred)
print('The accuracy Logistic Regression is',round(accuracy_score(y_validation,prediction_lr)*100,2))

# Shuffled, seeded 10-fold splitter. It is passed to both cross-validation
# calls below; previously it was created (with 8 splits) but never used.
kfold = KFold(n_splits=10,shuffle=True, random_state=42)

result_lr=cross_val_score(model,X_train,y_train,cv=kfold,scoring='accuracy')

print('The cross validated score for Logistic Regression is:',round(result_lr.mean()*100,2))

# Out-of-fold predictions: every training row is predicted by a model that
# did not see it during fitting.
y_pred = cross_val_predict(model,X_train,y_train,cv=kfold)
sns.heatmap(confusion_matrix(y_train,y_pred),annot=True,fmt='3.0f',cmap="Accent_r")
plt.xlabel('Predicted')  # confusion_matrix columns are the predicted classes
plt.ylabel('Actual')     # confusion_matrix rows are the true classes
plt.title('Confusion Matrix', y=1, size=15);

The train and validation performance are comparable

# Naive Bayes 

In [None]:
# Declaring the model and training it

nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
pred = nb.predict(X_train)
pred_val = nb.predict(X_validation)

# classification_report expects (y_true, y_pred). With the arguments swapped
# the per-class precision and recall figures are exchanged.
print(f"""Training Data Accuracy other metrics 
{classification_report(y_train, pred)}
""")

print(f"""Validation Data Accuracy other metrics 
{classification_report(y_validation, pred_val)}
""")

In [None]:

from sklearn.naive_bayes import GaussianNB
model= GaussianNB()
model.fit(X_train,y_train)
prediction_gnb=model.predict(X_validation)

print('--------GaussianNB Naive Bayes -------')
# sklearn metrics take (y_true, y_pred)
print('The accuracy Gaussian Naive Bayes Classifier is',round(accuracy_score(y_validation,prediction_gnb)*100,2))

# Shuffled, seeded 10-fold splitter. It is passed to both cross-validation
# calls below; previously it was created (with 8 splits) but never used.
kfold = KFold(n_splits=10,shuffle=True,random_state=42)

result_gnb=cross_val_score(model,X_train,y_train,cv=kfold,scoring='accuracy')

print('The cross validated score for Gaussian Naive Bayes classifier is:',round(result_gnb.mean()*100,2))

# Out-of-fold predictions: every training row is predicted by a model that
# did not see it during fitting.
y_pred = cross_val_predict(model,X_train,y_train,cv=kfold)
sns.heatmap(confusion_matrix(y_train,y_pred),annot=True,fmt='3.0f',cmap="Accent_r")
plt.xlabel('Predicted')  # confusion_matrix columns are the predicted classes
plt.ylabel('Actual')     # confusion_matrix rows are the true classes
plt.title('Confusion Matrix', y=1, size=15);


#### The performance of the NB model is slightly worse than that of the Logistic Regression model, hence we will use `lr` for prediction

In [None]:
### importing test data 
titanic_df_test = pd.read_csv('../input/titanic/test.csv')
titanic_df_test

In [None]:
gender_sub_df = pd.read_csv('../input/titanic/gender_submission.csv')
gender_sub_df

In [None]:
df_test_transformed = feature_engineered_columns(titanic_df_test) 

In [None]:
# Build the feature matrix from everything except the identifier. "Survived"
# is dropped as well when present, so re-running this cell does not feed the
# previously written predictions back into the model as an extra feature.
X_test = df_test_transformed.drop(columns=["PassengerId", "Survived"], errors="ignore").values
df_test_transformed["Survived"] = lr.predict(X_test)

In [None]:
df_test_transformed[["PassengerId","Survived"]].to_csv("Titanic_submission.csv",index=False)