# Hey Everyone !

#### This is my first notebook on Kaggle.! 
#### In this notebook, we'll go through the approach I took for solving this problem.
#### So, we can divide the task into 3 phases
#### 1. EDA (Exploratory Data Analysis)
#### 2. Feature Engineering 
#### 3. Model Selection and Hyperparameter Tuning

#### And Finally we'll talk about the approach I took that got me into top 3% (79.186)

### **Lets Get Started !**

In [None]:
# Loading the dataset

import numpy as np
import pandas as pd
import os
# Read the Kaggle Titanic train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

# 1) Exploratory Data Analysis
 
#### Now that we have the dataset loaded into pandas dataframe, lets explore it !

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# See if null values are present in the training set:
# a non-null count below the total row count means the column has missing values.
train_data.info()

In [None]:
# See if null values are present in the test set:
# a non-null count below the total row count means the column has missing values.
test_data.info()

#### In total we have 10 features for training our models.
#### Of these, [Name, Sex, Ticket, Cabin, Embarked] are categorical and the rest are numerical.
#### One more thing to notice is that we have many null values, but don't worry we'll deal with them later

In [None]:
# import libraries to help visualize data
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use("fivethirtyeight")

In [None]:
# Gender frequency: survival counts, one panel per sex.
# FacetGrid.map draws every facet independently, so the category order is
# passed explicitly; without it seaborn warns that a categorical plot may
# place bars inconsistently between facets.
g = sns.FacetGrid(train_data, col="Sex")
g.map(sns.countplot, "Survived", order=[0, 1])

# So, from the plots below we can see that even though there are more men on board,
# fewer males survive compared to females

In [None]:
# Passenger class: age histograms in a Pclass (rows) x Survived (columns) grid.
# `height` is the current name of the facet-size argument; the old `size`
# keyword is no longer accepted by recent seaborn releases.
grid = sns.FacetGrid(train_data, col='Survived', row='Pclass', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

# Passengers in 1st class are more likely to survive

In [None]:
# Age distribution, one histogram panel per survival outcome.
age_grid = sns.FacetGrid(train_data, col="Survived")
age_grid.map(sns.histplot, "Age", bins=25)

In [None]:
# Age of every passenger, coloured by survival; in the plot below,
# younger passengers appear more likely to survive.
sns.scatterplot(data=train_data, x='PassengerId', y='Age', hue='Survived')

In [None]:
# Distribution of ticket fares in the training set.
g = sns.histplot(data=train_data, x='Fare')

## Fare is highly right-skewed: most fares are low, with a long tail of expensive tickets

In [None]:
# Fare against age, coloured by survival.
sns.scatterplot(data=train_data, x='Age', y='Fare', hue='Survived')

# Passengers with higher fare are more likely to survive

In [None]:
# Siblings and spouse: one bar per SibSp value; the bar height aggregates the
# 0/1 Survived column per group (seaborn's default estimator is the mean,
# i.e. the survival rate).
sns.barplot(x="SibSp", y ="Survived", data=train_data)
plt.show()

In [None]:
# Parents and children (Parch): survival rate per Parch value.
sns.barplot(x="Parch", y ="Survived", data=train_data)
plt.show()

In [None]:
# Family Size: SibSp + Parch (relatives aboard, not counting the passenger),
# plotted per passenger and coloured by survival.

sns.scatterplot(x=train_data['PassengerId'],y=train_data['SibSp']+train_data['Parch'],hue=train_data['Survived'])

# People with large family are less likely to survive

In [None]:
# Embarked: survival rate per port of embarkation.

sns.barplot(x="Embarked", y ="Survived", data=train_data)
plt.show()

In [None]:
# Number of training passengers per port of embarkation.
sns.countplot(x="Embarked", data=train_data)
plt.show()

In [None]:
# Pair plot of the numeric columns, coloured by survival.
# `height` replaces the removed `size` keyword and `fill` replaces the
# deprecated `shade` option of the KDE plots on the diagonal.

g = sns.pairplot(data=train_data, hue='Survived',
                 height=1.2, diag_kind='kde', diag_kws=dict(fill=True), plot_kws=dict(s=10))
# Hide the x tick labels to keep the small panels readable.
g.set(xticklabels=[])

# 2) Feature Engineering

#### Now that we have some knowledge of the data we're dealing with, we can get started with feature engineering

In [None]:
# Count the missing values in every column of the training set.

train_data.isna().sum()


In [None]:
# Count the missing values in every column of the test set.
test_data.isna().sum()

### * Embarked

In [None]:
# Embarked: only 2 values are missing in the training data, so they are
# filled with 'S', the most common port.

train_data = train_data.fillna({'Embarked': 'S'})

In [None]:
# Combine train and test data before applying the transformations, so both
# receive identical feature engineering. The helper column `ind` records
# which set each row came from; it is used later to split them apart again.

data = pd.concat([train_data.assign(ind="train"), test_data.assign(ind="test")], ignore_index=True)

#### * Cabin
![](https://upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Olympic_%26_Titanic_cutaway_diagram.png/330px-Olympic_%26_Titanic_cutaway_diagram.png)

In [None]:
# We can extract first letter of the cabin name, then group them accordingly

def extract_cabin_alpha(cabin):
    """Return the deck letter (first character) of a cabin label.

    Parameters
    ----------
    cabin : str or missing value
        Raw ``Cabin`` entry such as ``"C85"``; may be NaN/None when unknown.

    Returns
    -------
    str
        The first character of the cabin label, or ``"M"`` (missing) when the
        value is NaN, None, blank, or the literal text ``"nan"``.
    """
    # Real missing values are detected explicitly instead of relying on
    # str(nan) == 'nan', which let None through as deck 'N'.
    if pd.isna(cabin):
        return "M"
    label = str(cabin).strip()
    # Blank labels would otherwise raise IndexError on label[0]; the literal
    # 'nan' text is still treated as missing, as before.
    if not label or label == 'nan':
        return "M"
    return label[0]

# Derive the Deck feature from the first letter of each cabin label.
data['Deck'] = data['Cabin'].map(extract_cabin_alpha)

In [None]:
# Survival rate per deck. Only the training rows carry a Survived label, so
# x and y are taken from the same training subset of the combined frame
# rather than pairing the full-length Deck column with the shorter
# train_data['Survived'] series.
sns.barplot(x='Deck', y='Survived', data=data[data['ind'] == 'train'])

In [None]:
# Number of passengers (train + test) per deck letter; 'M' marks a missing cabin.
data['Deck'].value_counts()

In [None]:
# Group the deck letters into coarser buckets: A/B/C/T, D/E, F/G, and missing.
def group_deck(deck):
    """Map a single deck letter to its bucket label.

    'A', 'B', 'C', 'T' -> "ABC"; 'D', 'E' -> "DE"; 'F', 'G' -> "FG";
    anything else (including the missing marker 'M') -> "M".
    """
    buckets = {
        'A': "ABC", 'B': "ABC", 'C': "ABC", 'T': "ABC",
        'D': "DE", 'E': "DE",
        'F': "FG", 'G': "FG",
    }
    return buckets.get(deck, "M")

# Replace each deck letter with its bucket label.
data['Deck'] = data['Deck'].map(group_deck)

In [None]:
# Survival rate per deck bucket, using only the labelled training rows so
# that x and y come from the same frame and have the same length.
sns.barplot(x='Deck', y='Survived', data=data[data['ind'] == 'train'])

### * Age

#### The idea is to replace each missing Age with the median Age of passengers with the same Sex and Pclass


In [None]:
# Fill each missing Age with the median Age of its (Sex, Pclass) group.
# `transform` returns a result aligned to the original row index, so the
# assignment is safe; `groupby(...).apply(...)` can return a group-keyed
# index on newer pandas versions, which breaks the column assignment.
data['Age'] = data['Age'].fillna(data.groupby(['Sex', 'Pclass'])['Age'].transform('median'))

In [None]:
# Truncate ages to whole years, then replace them with an ordinal bucket code:
#   0: <= 15, 1: 16-30, 2: 31-45, 3: 46-60, 4: > 60
data['Age'] = pd.cut(
    data['Age'].astype(int),
    bins=[-np.inf, 15, 30, 45, 60, np.inf],
    labels=False,
)

In [None]:
# Inspect the combined frame after the Deck and Age transformations.
data.head()

In [None]:
# Summary statistics of the numeric columns of the combined frame.
data.describe()

### * Fare

In [None]:
# Bucket Fare into four ordinal bins according to its distribution.
# The last edge is open-ended (np.inf): with a hard upper edge of 512 any
# fare above 512 fell outside every bin, became NaN, and was then pushed
# into the *lowest* bin by the fillna below. `include_lowest=True` keeps a
# fare of exactly 0 in bin 0 without relying on that fillna.
data['Fare_bin'] = pd.cut(
    data['Fare'],
    bins=[0.0, 7.9, 14.45, 31, np.inf],
    labels=[0, 1, 2, 3],
    include_lowest=True,
)
# Only genuinely missing fares are still NaN here; as before, they are
# assigned to the lowest bin.
data['Fare_bin'] = data['Fare_bin'].fillna(0)

data['Fare_bin'] = data['Fare_bin'].astype(int)

### * Name

In [None]:
# all unique titles
def disp_title(name):
    """Extract the honorific from a name formatted as 'Surname, Title. Given names'.

    Returns the text between the first comma and the following period,
    with surrounding whitespace removed.
    """
    after_surname = name.split(',')[1]
    honorific = after_surname.split('.')[0]
    return honorific.strip()
# List every distinct title that occurs in the combined data.
data['Name'].map(disp_title).unique()

In [None]:
# grouping titles

def get_title(name):
    """Return the title part of a 'Surname, Title. Given names' string.

    The title is the text between the first comma and the next period,
    stripped of surrounding whitespace.
    """
    surname_removed = name.split(',')[1]
    return surname_removed.split('.')[0].strip()
# Store the extracted title of every passenger in a new Title column.
data['Title'] = data['Name'].map(get_title)

In [None]:
def group_titles(df):
    """Collapse a rare title into one of the common ones for a single row.

    Parameters
    ----------
    df : mapping-like row
        Must provide ``'Title'`` and ``'Sex'`` (despite the name, this is one
        row, as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        'Mr', 'Mrs' or 'Miss' for the known rare titles; professional and
        military titles (Dr, Capt, Rev, Col) are resolved by sex; any other
        title is returned unchanged.
    """
    raw_title = df['Title']

    # Rare titles with a fixed replacement.
    fixed_replacement = {
        'Don': 'Mr', 'Major': 'Mr', 'Jonkheer': 'Mr', 'Sir': 'Mr',
        'the Countess': 'Mrs', 'Mme': 'Mrs', 'Lady': 'Mrs', 'Dona': 'Mrs',
        'Mlle': 'Miss', 'Ms': 'Miss',
    }
    if raw_title in fixed_replacement:
        return fixed_replacement[raw_title]

    # Gender-neutral titles: fall back on the passenger's sex.
    if raw_title in ('Dr', 'Capt', 'Rev', 'Col'):
        return 'Mr' if df['Sex'] == 'male' else 'Mrs'

    return raw_title

# Row-wise apply (axis=1) because group_titles needs both Title and Sex.
data['Title']=data.apply(group_titles, axis=1)

In [None]:
# Number of passengers per grouped title (train + test).
sns.countplot(x=data['Title'])

### * SibSp and Parch : 
#### We can make a new feature FamilySize, which will be sum of the two

In [None]:
# FamilySize = siblings/spouses + parents/children travelling with the passenger.
data['FamilySize'] = data['SibSp'].add(data['Parch'])

In [None]:
# Inspect the combined frame with all engineered features added.
data.head()

In [None]:
# Drop the raw columns that have been replaced by engineered features
# (Fare -> Fare_bin, Name -> Title, Cabin -> Deck) along with Ticket.
final_data = data.drop(['Fare', 'Name', 'Ticket', 'Cabin'], axis=1)

In [None]:
# Split the combined frame back into its train and test parts via the `ind` marker.
test_data_pre = final_data[final_data["ind"] == "test"]
train_data_pre = final_data[final_data["ind"] == "train"]

In [None]:
# Remove identifier/helper columns and the raw SibSp/Parch counts (now in
# FamilySize). The test part also loses Survived, which it does not use.
train_data_pre = train_data_pre.drop(['PassengerId', 'ind', 'SibSp', 'Parch'], axis=1)
test_data_pre = test_data_pre.drop(['Survived', 'PassengerId', 'ind', 'SibSp', 'Parch'], axis=1)

In [None]:
# Splitting the training data into features X and target y.
X=train_data_pre.drop(columns=['Survived'])
# Survived was upcast to float when train and test were concatenated (the
# test rows hold NaN there); cast it back to int so the classifiers are
# trained on integer class labels 0/1.
y=train_data_pre['Survived'].astype(int)

In [None]:
# Correlation matrix of the numeric features.
# X still contains string columns (Sex, Embarked, Title, Deck) at this point;
# they are excluded explicitly because DataFrame.corr() raises on
# non-numeric data in recent pandas versions.
sns.heatmap(X.select_dtypes(include='number').corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':14})
fig=plt.gcf()
fig.set_size_inches(14,10)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Converting categorical columns to numerical (one-hot) columns.

traindf_X = pd.get_dummies(X, columns = ["Sex","Title","Embarked","Fare_bin",'Deck'],
                             prefix=["Sex","Title","Em_type","Fare_type",'Deck'])
testdf = pd.get_dummies(test_data_pre, columns = ["Sex","Title","Embarked","Fare_bin",'Deck'],
                             prefix=["Sex","Title","Em_type","Fare_type",'Deck'])
# The two frames are encoded independently, so a category that occurs in only
# one of them would produce different column sets. Align the test frame to
# the training columns (same names, same order); categories absent from the
# test data become all-zero columns.
testdf = testdf.reindex(columns=traindf_X.columns, fill_value=0)

# 3) Model Selection and Hyperparameter Tuning

#### Phew, finally we can get into training machine learning models !

In [None]:
from sklearn import metrics

# Helper that reports how well a model is doing on a labelled set.
def get_scores(y_preds,y):
    """Compute classification metrics for predictions against true labels.

    Parameters
    ----------
    y_preds : array-like
        Predicted class labels.
    y : array-like
        Ground-truth class labels.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'F1' and 'ROC_AUC'
        (in that order) mapped to the corresponding score.
    """
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and changes the ROC AUC, so the true
    # labels go first here even though this helper receives y_preds first.
    return {
        'Accuracy':metrics.accuracy_score(y,y_preds),
        'Precision':metrics.precision_score(y,y_preds),
        'Recall':metrics.recall_score(y,y_preds),
        'F1':metrics.f1_score(y,y_preds),
        # NOTE: computed from hard class predictions, not probabilities.
        'ROC_AUC': metrics.roc_auc_score(y,y_preds)
    }

In [None]:
# Split data into train and validation sets: 30% of the rows are held out
# for validation, with a fixed random_state so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(traindf_X, y, test_size=0.3, random_state=42)

#### Now, regarding the models: I chose a total of 8 models, which you can see below. The reason I chose so many models was **Majority Voting**. 
#### YES Majority Voting
#### Let's first train these models without any tuning and see how are they doing

In [None]:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [None]:
# lets define a function that trains model for us

def train_model(model):
    """Fit `model` on the training split and score it on the validation split.

    Relies on the notebook-level X_train, y_train, X_val and y_val.
    The estimator passed in is fitted in place.

    Returns the metrics dictionary produced by get_scores.
    """
    model.fit(X_train, y_train)
    return get_scores(model.predict(X_val), y_val)

In [None]:
# Candidate models paired with their display names. Keeping each name next
# to its estimator guarantees that model_names and model_list stay in sync.
_named_models = [
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XG Boost', XGBClassifier(random_state=42)),
    ('Light GBM', LGBMClassifier(random_state=42, is_unbalance=True)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('SVM', svm.SVC(random_state=42)),
    ('CatBoost', CatBoostClassifier(random_state=42,verbose=0)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
]
model_list = [estimator for _, estimator in _named_models]
model_names = [label for label, _ in _named_models]

In [None]:
# Train every untuned model and collect its validation metrics, one row per model.

scores = pd.DataFrame(columns=['Name', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC'])
for row, estimator in enumerate(model_list):
    scores.loc[row] = [model_names[row]] + list(train_model(estimator).values())

# One horizontal bar chart per metric on a 2x3 grid.
figure, axis = plt.subplots(2, 3)
figure.set_figheight(15)
figure.set_figwidth(20)

# Same x-range on every panel so the charts are visually comparable.
for panel in axis.flat:
    panel.set_xlim([.5, .9])

# (grid position, scores column, panel title)
metric_panels = [
    ((0, 0), 'Accuracy', "Accuracy Score"),
    ((0, 1), 'Precision', "Precision"),
    ((1, 0), 'Recall', "Recall"),
    ((1, 2), 'F1', "F1"),
    ((0, 2), 'ROC_AUC', 'ROC_AUC'),
]
for position, column, heading in metric_panels:
    axis[position].barh(scores['Name'], scores[column], height=.5)
    axis[position].set_title(heading)

# Five metrics on six axes: hide the unused middle panel of the bottom row.
axis[1, 1].set_visible(False)

plt.show()

## Hyperparameter Tuning

#### Alright, it's time for hyperparameter tuning! We'll use GridSearchCV to find the best combination of hyperparameters for each model
#### Note: I have reduced the parameter grids, as the full search takes a lot of time

In [None]:
# Let's start by defining the parameters we want to tune for all 8 models.
# param_grids[i] is the search grid for model_list[i], so the order below
# must match the order of model_list.

from sklearn.model_selection import GridSearchCV
try:
    from sklearn.utils.fixes import loguniform
except ImportError:
    # Not available in newer scikit-learn releases; scipy provides the same distribution.
    from scipy.stats import loguniform
param_grids = [

    # Decision Tree
    {
        'max_depth': [5,7,10,20,50],
        'min_samples_leaf': [4, 5, 6, 7],
        'min_samples_split': [8, 10,7],
    },
    # Random Forest
    {
        'max_depth': [5,7, 20],
        'min_samples_leaf': [4, 5, 6, 7],
        'min_samples_split': [6,7,8],
        'n_estimators': [100,500],
        'oob_score' : [True],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
        # rejected by newer scikit-learn releases.
        'max_features' :['sqrt'],
    },
    # XGBoost
    {
        'max_depth': [3, 5, 9], 
        'n_estimators': [5, 50, 100],
        'learning_rate': [0.01]
    },
    # LightGBM
    {
        'learning_rate': [0.1,.2],
        'num_leaves': [100,200,150,300],
        'n_estimators':[5, 20, 50, 100],
        'boosting_type' : ['gbdt'],
        'objective' : ['binary'],
        'max_depth' : [10,25,50,100,150], 
        # Fraction of features sampled per tree, must lie in (0, 1].
        # The previous list [0,3,0.5,0.7] was a typo for 0.3 and put the
        # invalid values 0 and 3 into the grid.
        'colsample_bytree' : [0.3,0.5,0.7],
        'subsample' : [0.3,0.5,0.7],
        'min_split_gain' : [0.01],
    },
    # Logistic Regression (liblinear supports both l1 and l2 penalties)
    {
        'penalty' : ['l1', 'l2'],
        'C' : np.logspace(-4, 4, 20),
        'solver' : ['liblinear']
    },
    # SVM
    {
        'C': [0.1,1, 10, 100], 
        'gamma': [1,0.1,0.01,0.001],
        'kernel': ['rbf', 'poly', 'sigmoid']
    },
    # CatBoost
    {
        'learning_rate': [0.1],
        'depth': [4, 10],
        'l2_leaf_reg': [ 5, 7, 9]
    },
    # AdaBoost
    {
        'n_estimators':[10,50,250,1000],
        'learning_rate':[0.01,0.1]
    }
]

In [None]:
# Now that we have defined the parameters, we can now start the search

# Empty copy of the baseline score table (same columns, no rows) that will
# receive the metrics of the tuned models.
tuned_scores=scores.drop(scores.index)
# Best estimator found for each model, in model_list order.
tuned_models=[]
def grid_search_util(i):
    """Run a 3-fold grid search for the i-th model and return the best estimator.

    Uses model_list[i] with param_grids[i] and fits on the notebook-level
    X_train / y_train, running on all available cores (n_jobs=-1).
    """
    search = GridSearchCV(model_list[i], param_grids[i], cv=3, n_jobs=-1, verbose=1)
    return search.fit(X_train, y_train).best_estimator_

# Tune every model, keep its best estimator and record its validation metrics.
for idx, _ in enumerate(model_list):
    best_estimator = grid_search_util(idx)
    tuned_models.append(best_estimator)
    metrics_row = list(train_model(best_estimator).values())
    tuned_scores.loc[idx] = [model_names[idx]] + metrics_row
    print(model_names[idx]," Done")

In [None]:
# Show every tuned estimator together with its selected hyperparameters.
for tuned_estimator in tuned_models:
    print(tuned_estimator)

In [None]:
# Same 2x3 metric dashboard as for the baseline models, now for the tuned ones.
figure, axis = plt.subplots(2, 3)
figure.set_figheight(15)
figure.set_figwidth(20)

# Same x-range on every panel so the charts are visually comparable.
for panel in axis.flat:
    panel.set_xlim([.5, .9])

# (grid position, tuned_scores column, panel title)
metric_panels = [
    ((0, 0), 'Accuracy', "Accuracy Score"),
    ((0, 1), 'Precision', "Precision"),
    ((1, 0), 'Recall', "Recall"),
    ((1, 2), 'F1', "F1"),
    ((0, 2), 'ROC_AUC', 'ROC_AUC'),
]
for position, column, heading in metric_panels:
    axis[position].barh(tuned_scores['Name'], tuned_scores[column], height=.5)
    axis[position].set_title(heading)

# Five metrics on six axes: hide the unused middle panel of the bottom row.
axis[1, 1].set_visible(False)
plt.show()

#### Time for majority voting. So, this is not exactly "majority" voting. Basically, I predict survival only if all models predict survival. Now the reason for that is debatable. On experimenting I found it works the best. Would love to hear your thoughts on this

In [None]:
# Unanimous vote: a passenger is predicted to survive only if every tuned
# model predicts survival.
n_models = len(tuned_models)
votes = np.zeros(len(testdf))

for tuned_estimator in tuned_models:
    # Refit on the training split (as before) and add this model's 0/1 votes.
    tuned_estimator.fit(X_train, y_train)
    # Coerce to a flat float vector so that estimators returning a column
    # vector or non-float labels still add up element-wise.
    votes += np.asarray(tuned_estimator.predict(testdf), dtype=float).ravel()

# The threshold is derived from the number of models instead of a
# hard-coded 8, so it stays correct if models are added or removed.
maj = (votes == n_models).astype(float)

In [None]:
# Save the results for submission: one row per test passenger with the
# integer 0/1 survival prediction.
predictions = np.asarray(maj).astype(int).tolist()
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
submission.to_csv('submission.csv', index=False)

## Conclusion
#### Though this approach works well, there is a lot of scope for improvement. As you can see I applied only basic feature engineering. Would love to know what all can we further do to perform better.

### Thanks for reading..! Make sure to upvote if you liked the post. 