#  **Welcome to my notebook**

Steps Involved:
*     Data Cleaning & Data Manipulation on the Train & Test Datasets, and Imputing the Missing Values.
*     Using XGBoost Model with HyperParameter Tuning
*     Predicting the Class Survived

In [None]:
#importing libraries
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the Titanic train/test CSVs. `dataset` holds both frames so that
# later cells can apply the same transformation to each of them in a loop.
train, test = map(
    pd.read_csv,
    ['../input/titanic/train.csv', '../input/titanic/test.csv'],
)
dataset = [train, test]
train.info()

# **Data Visualization**

In [None]:
# Missing-value maps: one heatmap per frame, where a highlighted cell marks a null entry.
# Left panel = training data, right panel = testing data.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = zip(axes, (train, test), ('training datset', 'testing datset'))
for ax, frame, title in panels:
    ax.set_title(title)
    sns.heatmap(frame.isnull(), yticklabels=False, cmap='viridis', ax=ax)

plt.show()

> From the above graph, we can see that:
> 1. In the training dataset, the columns **Age**, **Cabin** and **Embarked** have null values.
> 2. In the testing dataset, the columns **Age** and **Cabin** have null values.

> Next, let's look at how survival differs between male and female passengers.

In [None]:
# Passenger counts per sex; the hue splits each bar by survival outcome.
sns.countplot(x='Sex', hue='Survived', data=train)

It shows that a much larger share of women survived compared to men.

> Now let's look at a catplot, which shows the number of men and women in each passenger class, split by whether they survived.

In [None]:
# Passenger counts per class, coloured by sex, with one panel per survival outcome.
sns.catplot(
    data=train,
    kind="count",
    x="Pclass",
    hue="Sex",
    col="Survived",
    height=4,
    aspect=.7,
    palette='PuBu',
);

Let's check the distribution of age and sex with Survival Count

In [None]:
# Age distribution per sex, with separate boxes for each survival outcome.
sns.boxplot(data=train, x='Sex', y='Age', hue='Survived');

 We can see that the median age for both men and women is close to 30.

**Distribution of Embarkation Port**

In [None]:
# Passenger counts per embarkation port; the hue splits each bar by survival outcome.
sns.countplot(x='Embarked', hue='Survived', data=train)

# **Data Cleaning**

> SibSp and Parch are the number of siblings/spouses and the number of parents/children travelling with a passenger, respectively. Adding them together, plus 1 for the passenger themselves, gives the family size.

In [None]:
# Fare bucket edges: bucket i covers the half-open interval (edge[i], edge[i+1]],
# so fares <= 7.91 get code 0 and fares > 250 get code 5.
FARE_BIN_EDGES = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]

# Both frames are modified in place so that `train` and `test` keep pointing
# at the cleaned data in the cells that follow.
for data in dataset:
    # Family size = siblings/spouses + parents/children + the passenger.
    data['Family'] = data['SibSp'] + data['Parch'] + 1

    # SibSp and Parch are now folded into Family; Name, Ticket and Cabin are
    # not used as features for the survival prediction.
    data.drop(columns=['SibSp', 'Parch', 'Name', 'Ticket', 'Cabin'], inplace=True)

    # Replace the raw fare with its bucket code (0-5). Missing fares stay NaN
    # so they can be imputed later; the float cast keeps the column dtype the
    # same whether or not the frame contains a missing fare.
    data['Fare'] = pd.cut(data['Fare'], bins=FARE_BIN_EDGES, labels=False).astype(float)
    


In [None]:
# How many training passengers fall into each family size.
train.Family.value_counts()

In [None]:
# Mean survival per family size (barplot aggregates `Survived` for each `Family` value).
sns.barplot(x="Family", y="Survived", data=train)
# `plt.show` without parentheses only references the function; it must be called.
plt.show()

In [None]:
# Convert categorical values to numeric codes.
SEX_CODES = {'female': 0, 'male': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

for label, data in zip(('train', 'test'), dataset):
    # Show the raw port counts of each frame before they are replaced by codes.
    # (Previously this printed whatever frame a loop in an earlier cell had
    # left in `data`, which depended on hidden kernel state.)
    print(label)
    print(data['Embarked'].value_counts())

    # Values not present in the mapping (including missing ones) become NaN.
    data['Sex'] = data['Sex'].map(SEX_CODES)
    data['Embarked'] = data['Embarked'].map(EMBARKED_CODES)

# **Imputing the missing values**

In [None]:
# Dtypes and non-null counts of the training and testing frames,
# each followed by a separator line.
SEPARATOR = '========================================'
for frame in dataset:
    frame.info()
    print(SEPARATOR)

In [None]:
# Names of the columns that contain at least one missing value, per frame.
train_null_cols = [col for col in train.columns if train[col].isna().any()]
test_null_cols = [col for col in test.columns if test[col].isna().any()]

print('train_null_cols : ', train_null_cols)
print('test_null_cols : ', test_null_cols)

In [None]:
# Impute Age in both frames, each with the mean age of that frame.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the in-place form operates on an intermediate Series and is not guaranteed
# to update the parent frame (it does nothing under pandas copy-on-write).
for data in dataset:
    data['Age'] = data['Age'].fillna(data['Age'].mean())

# Impute Embarked in the training frame with its most frequent value.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Impute Fare in the testing frame with its most frequent value.
test['Fare'] = test['Fare'].fillna(test['Fare'].mode()[0])

# Check after imputation.
# The earlier `data = data.astype(int)` only rebound the loop variable, so
# info() reported integer dtypes that `train`/`test` never actually had.
# Report the real frames instead, and keep the "no NaN left" guarantee that
# the integer cast implicitly enforced.
for data in dataset:
    assert not data.isna().any().any(), 'missing values remain after imputation'
    data.info()
    print('========================================')

# **Applying XGBoost Classifier Model**

In [None]:
from xgboost import XGBClassifier, plot_importance

# Features are every column except the target.
# NOTE(review): this still includes PassengerId, which looks like a row
# identifier rather than a predictive feature - confirm whether it should be
# dropped from both X_train and the test features.
X_train = train.drop(columns='Survived')
Y_train = train['Survived']

xgb = XGBClassifier(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    gamma=5,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    colsample_bytree=1,
)
xgb.fit(X_train, Y_train)

# Select the test columns by name so the features are guaranteed to line up
# with the training matrix (and a missing column fails loudly with a KeyError).
predictions_xgb = xgb.predict(test[X_train.columns])

# Feature importance graph
plot_importance(xgb)
plt.show()

# Build the submission file: one row per test passenger with its predicted class.
# Re-using test's index keeps the concat correct even if test is not 0..n-1 indexed.
Survived = pd.Series(predictions_xgb, index=test.index, name='Survived')
ans = pd.concat([test['PassengerId'], Survived], axis=1)
ans.to_csv('XGB_Ans.csv', index=False)

> We can observe from above feature importance graph that PClass has highest importance.

# **Do upvote if you find it useful!**