## Hi! 

In this notebook you will find:
1. Exploratory Data Analysis
2. Feature Engineering
3. Handling null values using KNN-Imputation
4. Outlier Detection using Z-Score
5. Feature Importance using Random-Forest
6. Modelling using Gradient-Boosting Classifier

Give me an upvote if you like my work and please comment your feedback! 

# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

import warnings
# Suppress every warning category so cell outputs stay uncluttered,
# then switch matplotlib over to seaborn's default theme.
warnings.simplefilter('ignore')
sns.set()

# Importing Data

In [None]:
# Read the Titanic train and test splits from the attached Kaggle dataset.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/d/rahulsah06/titanic/train.csv",
        "../input/d/rahulsah06/titanic/test.csv",
    )
)

# Understanding Data

In [None]:
# DataFrame.info() prints its report and returns None, so evaluating both calls
# as one tuple expression also echoes a meaningless "(None, None)" cell result.
# Call them as separate statements and label each report instead.
print("Train:")
train.info()
print("\nTest:")
test.info()

# TASK: EDA and Data Cleaning

# Analysis: PassengerID

In [None]:
# Plot the row identifier against the survival flag to see whether it carries any pattern.
passenger_id, survived = train['PassengerId'], train['Survived']
plt.scatter(passenger_id, survived)
plt.show()

In [None]:
# Share of distinct PassengerId values relative to the number of rows, in percent.
passenger_ids = train['PassengerId']
unique_pct = passenger_ids.nunique() / len(passenger_ids) * 100
print("Percentage of unique values: {}".format(unique_pct))

Conclusion: Irrelevant Feature

In [None]:
# Remove the identifier column from both splits (in place, so `train`/`test` stay the same objects).
for frame in (train, test):
    frame.drop(columns="PassengerId", inplace=True)

# Analysis: PClass

In [None]:
# Bar height is the mean of the 0/1 Survived flag per class, i.e. the survival rate.
# x/y are passed by keyword: positional data arguments were deprecated in
# seaborn 0.11 and are rejected from 0.12 onwards.
sns.barplot(x='Pclass', y='Survived', data=train)
plt.show()

In [None]:
# Passenger counts per class, split by outcome. Columns are referenced by name
# through `data=` so x and hue are guaranteed to come from the same rows, and
# keywords are used because seaborn >= 0.12 no longer accepts positional vectors.
sns.countplot(x='Pclass', hue='Survived', data=train)
# Hue levels are drawn in sorted order (0, 1), hence the label order below.
plt.legend(("Died","Survived"))
plt.show()

Conclusion: A Person with a first class ticket is more likely to survive

In [None]:
# Percentage of passengers in each ticket class.
train['Pclass'].value_counts().div(len(train['Pclass'])).mul(100)

# Analysis: Name

In [None]:
# Share of distinct passenger names relative to the number of rows, in percent.
names = train['Name']
print("Percentage of unique values: {}".format(names.nunique() / len(names) * 100))

Conclusion: Irrelevant Feature

In [None]:
# Remove the free-text name column from both splits, in place.
for frame in (train, test):
    frame.drop(columns="Name", inplace=True)

# Analysis: Sex

In [None]:
# Passenger counts per sex, split by outcome. Keyword arguments plus `data=`
# keep x and hue on the same rows and work on seaborn >= 0.12, which no longer
# accepts a positional data vector.
sns.countplot(x='Sex', hue='Survived', data=train)
# Hue levels are drawn in sorted order (0, 1), hence the label order below.
plt.legend(("Died","Survived"))
plt.show()

Conclusion: A person who is female is more likely to survive

In [None]:
# Percentage of passengers of each sex.
train['Sex'].value_counts().div(len(train['Sex'])).mul(100)

# Analysis: Age

In [None]:
# Age distribution for each outcome (x = Survived, y = Age, as in the original
# positional call). Passed by keyword because seaborn >= 0.12 rejects positional
# x/y vectors.
sns.boxplot(x='Survived', y='Age', data=train)
plt.show()

# Filling Null Values using KNN Imputer

Task1: Encode

In [None]:
def _label_encode_keep_missing(encoder, column, fit=False):
    """Label-encode the non-null entries of `column`, leaving missing entries as NaN.

    Feeding NaN straight into LabelEncoder either raises (older scikit-learn,
    which cannot sort mixed str/float values) or encodes "missing" as one more
    category, which hides it from the KNN imputer that runs afterwards.

    Parameters
    ----------
    encoder : LabelEncoder
        Encoder to use; it is (re)fitted on the non-null values when `fit` is True.
    column : pandas.Series
        Categorical column to encode.
    fit : bool, default False
        Fit the encoder on this column before transforming it.

    Returns
    -------
    pandas.Series
        Float series on the same index as `column`, holding the integer codes
        with NaN wherever `column` was null.

    Raises
    ------
    ValueError
        Propagated from `encoder.transform` when a non-null value was not seen
        while fitting.
    """
    present = column.notna()
    if fit:
        encoder.fit(column[present])
    encoded = pd.Series(np.nan, index=column.index, dtype=float)
    encoded[present] = encoder.transform(column[present])
    return encoded


# One encoder is reused: each column is fitted on train and applied to test
# before the encoder is refitted for the next column.
encoder = LabelEncoder()
train_temp_Sex = _label_encode_keep_missing(encoder, train.Sex, fit=True)
test_temp_Sex = _label_encode_keep_missing(encoder, test.Sex)

train_temp_Embarked = _label_encode_keep_missing(encoder, train.Embarked, fit=True)
test_temp_Embarked = _label_encode_keep_missing(encoder, test.Embarked)

# Work on copies without the text columns and without the target.
train_temp = train.drop(["Sex","Embarked","Cabin","Ticket","Survived"],axis=1)
test_temp = test.drop(["Sex","Embarked","Cabin","Ticket"],axis=1)

train_temp['Sex'] = train_temp_Sex
test_temp['Sex'] = test_temp_Sex

train_temp['Embarked'] = train_temp_Embarked
test_temp['Embarked'] = test_temp_Embarked

Task2: Scale

In [None]:
# Rescale every column using ranges learned from the training frame only, so
# that no single feature dominates the neighbour distances used for imputation.
scaler = MinMaxScaler()
scaler.fit(train_temp)
train_temp, test_temp = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns)
    for frame in (train_temp, test_temp)
)

Task3: Impute

In [None]:
# Fill missing entries from the 5 nearest rows; neighbours are always drawn from
# the training frame, which is the data the imputer is fitted on.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(train_temp)
train_temp, test_temp = (
    pd.DataFrame(imputer.transform(frame), columns=frame.columns)
    for frame in (train_temp, test_temp)
)

Task4: Insert Back

In [None]:
# Map the imputed, scaled values back to their original units.
train_temp, test_temp = (
    pd.DataFrame(scaler.inverse_transform(scaled), columns=scaled.columns)
    for scaled in (train_temp, test_temp)
)

In [None]:
# Only the imputed Age column is copied back into the working frames
# (assignment aligns on the row index).
train['Age'] = train_temp['Age']
test['Age'] = test_temp['Age']

In [None]:
# Remaining missing ages in (train, test) after the write-back.
train['Age'].isna().sum(), test['Age'].isna().sum()

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per sex.
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(16, 5))
for ax, (sex, title) in zip(axes, (("female", "Female"), ("male", "Male"))):
    is_sex = train.Sex == sex
    # Both conditions are combined into a single mask: chaining two boolean
    # selections (train[a][b]) applies a full-length mask to an already filtered
    # frame, which pandas can only resolve by re-aligning it (with a UserWarning).
    sns.distplot(train.loc[is_sex & (train.Survived == 1), 'Age'], label='Survived', bins=20,
                 kde=False, ax=ax)
    sns.distplot(train.loc[is_sex & (train.Survived == 0), 'Age'], label='Died', bins=40,
                 kde=False, ax=ax)
    ax.legend()
    ax.set_title(title)
plt.show()

In [None]:
# Age histograms of survivors vs. non-survivors, one panel per ticket class.
fig, axes = plt.subplots(nrows=1, ncols=3,figsize=(16, 5))
for ax, pclass in zip(axes, (1, 2, 3)):
    in_class = train.Pclass == pclass
    # A single combined mask replaces the chained train[a][b] selection, which
    # forced pandas to re-align the second mask and emitted a UserWarning.
    sns.distplot(train.loc[in_class & (train.Survived == 1), 'Age'], label='Survived', bins=20,
                 kde=False, ax=ax)
    sns.distplot(train.loc[in_class & (train.Survived == 0), 'Age'], label='Died', bins=40,
                 kde=False, ax=ax)
    ax.legend()
    ax.set_title("Pclass {}".format(pclass))
plt.show()

Conclusion: A person who is younger is more likely to survive

# Analysis: SibSp : # of siblings / spouses aboard the Titanic

# Analysis: Parch : # of parents / children aboard the Titanic

In [None]:
# Collapse the two family-size columns into one total and drop the originals, in place.
for frame in (train, test):
    frame['Relatives'] = frame['SibSp'] + frame['Parch']
    frame.drop(columns=['SibSp', 'Parch'], inplace=True)

In [None]:
# Survival rate (with confidence intervals) by number of relatives aboard.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and later removed;
# kind='point' reproduces factorplot's default plot type.
sns.catplot(x='Relatives', y='Survived', data=train, kind='point', aspect=2)
plt.show()

In [None]:
# Outcome counts for passengers travelling with 1-3 relatives.
plt.figure(figsize=(16,5))
# Filter once and take both x and hue from the same subset. The original passed
# a filtered x together with the unfiltered Survived column and only worked
# through implicit index alignment.
small_family = train[train.Relatives.between(1, 3)]
sns.countplot(x='Relatives', hue='Survived', data=small_family)
plt.legend(("Died","Survived"))
plt.show()

In [None]:
# Outcome counts for passengers travelling with 4 or more relatives.
plt.figure(figsize=(16,5))
# Filter once and take both x and hue from the same subset, rather than pairing
# a filtered x with the unfiltered Survived column.
large_family = train[train.Relatives >= 4]
sns.countplot(x='Relatives', hue='Survived', data=large_family)
plt.legend(("Died","Survived"))
plt.show()

Conclusion: A person with 1-3 relatives is more likely to survive

# Analysis: Ticket

In [None]:
# Share of distinct ticket numbers relative to the number of rows, in percent.
tickets = train['Ticket']
print("Percentage of unique values: {}".format(tickets.nunique() / len(tickets) * 100))

In [None]:
# Number of survivors per ticket, drawn as a line over the (sorted) ticket groups.
survivors_per_ticket = train.groupby("Ticket", as_index=False)["Survived"].sum()
survivors_per_ticket.plot()
plt.xticks([])  # there are far too many tickets to label individually
plt.title("Tickets")
plt.show()

Conclusion: Irrelevant Feature

In [None]:
# Remove the ticket number column from both splits, in place.
for frame in (train, test):
    frame.drop(columns="Ticket", inplace=True)

# Analysis: Fare

In [None]:
# Overlay the fare distributions of non-survivors and survivors on one axes.
plt.figure(figsize=(16,5))
for outcome, label, hist_options in ((0, "Died", {"bins": 60}), (1, "Survived", {})):
    fares = train.loc[train.Survived == outcome, "Fare"]
    sns.distplot(fares, label=label, kde=True, **hist_options)
plt.legend()
plt.show()

In [None]:
# Horizontal box plot of fare for each outcome.
plt.figure(figsize=(16,5))
# x/y passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.boxplot(x='Fare', y='Survived', data=train, orient='h')
plt.show()

# Detecting Outliers using Z-Score

In [None]:
def z_score(value):
    """Return the distance of `value` from the train Fare mean, in standard deviations.

    Reads the module-level `mean` and `std` at call time.
    """
    deviation = value - mean
    return deviation / std


# Mean and population standard deviation (np.std uses ddof=0) of the training fares.
mean, std = np.mean(train.Fare), np.std(train.Fare)

# Score every fare on a copy so the working frame stays untouched.
train_z = train.copy()
train_z['Z'] = train_z['Fare'].map(z_score)

Potential outliers

In [None]:
# Rows whose fare lies more than three standard deviations above the mean.
is_outlier = train_z['Z'] > 3
train_z.loc[is_outlier]

In [None]:
# Survival rate per 50-unit fare band.
# pd.cut builds right-closed intervals, so without include_lowest=True a fare
# of exactly 0 falls outside the first bin and is silently turned into NaN.
train_z['binFare'] = pd.cut(train_z.Fare.astype(int),
                            range(0,600,50),
                            include_lowest=True)
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' reproduces its default plot type.
sns.catplot(x='binFare', y='Survived', data=train_z, kind='point', aspect=2)
plt.show()

Filling Null Values

In [None]:
# Number of missing fares in the test split.
test['Fare'].isna().sum()

In [None]:
# Fill missing test fares with the *training* mean: preprocessing statistics
# should be learned from the training data only, not from the set being scored.
test.Fare = test.Fare.fillna(train.Fare.mean())

Conclusion: A person who pays more as Fare is more likely to survive

In [None]:
# Percentage of training passengers in each fare band.
train_z['binFare'].value_counts().div(len(train_z)).mul(100)

# Analysis: Cabin

In [None]:
# Percentage of training rows with no cabin recorded.
train['Cabin'].isna().sum() / len(train['Cabin']) * 100

In [None]:
# Analysis copy restricted to passengers with a known cabin; the deck is the
# first character of the cabin string.
train_c = train.copy()
train_c['CabinClass'] = train_c.Cabin.str.slice(0,1)
train_c.drop("Cabin", axis=1, inplace=True)
# Drop rows only when the cabin itself is missing. A bare dropna() also removes
# passengers with a known cabin whose value is missing in some other column.
train_c = train_c.dropna(subset=['CabinClass'])

In [None]:
# Outcome counts per deck, followed by the mean fare per deck in a second figure.
plt.figure(figsize=(15,5))
# Keyword arguments: seaborn >= 0.12 no longer accepts a positional data vector.
sns.countplot(x='CabinClass', hue='Survived', data=train_c)
plt.legend(("Died","Survived"))
# `factorplot` was renamed to `catplot` and later removed; kind='point' is its
# default plot type. catplot is figure-level, so it opens its own figure.
sns.catplot(x='CabinClass', y='Fare', data=train_c, kind='point', aspect=2)
plt.show()

In [None]:
# Replace the deck letter with one indicator column per deck (Cabin_A, Cabin_B, ...).
cabin_dummies = pd.get_dummies(train_c['CabinClass'], prefix='Cabin')
train_c = train_c.join(cabin_dummies).drop(columns="CabinClass")

In [None]:
# Correlation of every numeric/indicator column with Pclass, strongest first.
# Text columns are excluded explicitly: pandas >= 2.0 raises when corr() meets
# non-numeric data instead of silently skipping it.
train_c.select_dtypes(include=['number', 'bool']).corr()['Pclass'].sort_values(ascending=False)

In [None]:
# Correlation of every numeric/indicator column with Fare, strongest first.
# Text columns are excluded explicitly: pandas >= 2.0 raises when corr() meets
# non-numeric data instead of silently skipping it.
train_c.select_dtypes(include=['number', 'bool']).corr()['Fare'].sort_values(ascending=False)

Conclusion: A person with a premium cabin (B and C) is more likely to Survive

Filling null values

In [None]:
# Keep only the deck letter of each cabin; passengers without a cabin get the
# placeholder deck "N". The raw Cabin column is removed in place.
for frame in (train, test):
    frame['CabinClass'] = frame['Cabin'].str[:1].fillna("N")
    frame.drop(columns="Cabin", inplace=True)

# Analysis: Embarked

Fill null values

In [None]:
# Number of training rows with no embarkation port recorded.
train['Embarked'].isna().sum()

In [None]:
# Fill missing ports with the most frequent port in the training data.
most_common_port = train['Embarked'].mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)

In [None]:
# Outcome counts per embarkation port, one panel per sex.
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(16, 5))
# A fixed port order keeps the x axis identical in both panels.
port_order = sorted(train.Embarked.dropna().unique())
for ax, (sex, title) in zip(axes, (("male", "Male"), ("female", "Female"))):
    # x and hue come from the same filtered rows; the original paired a filtered
    # x with the unfiltered Survived column and relied on index alignment.
    sns.countplot(x='Embarked', hue='Survived', data=train[train.Sex == sex],
                  order=port_order, ax=ax)
    ax.set_title(title)
    ax.legend(("Died","Survived"))
plt.show()

In [None]:
# Survival rate by class and sex, one facet row per embarkation port.
FacetGrid = sns.FacetGrid(train, row='Embarked', aspect=2.6)
# FacetGrid.map calls pointplot once per facet on that facet's rows only, so
# without explicit orders each facet infers its own category order and the
# shared legend/axis can mislabel them. Pin both orders from the full frame.
FacetGrid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex',
              order=sorted(train['Pclass'].unique()),
              hue_order=sorted(train['Sex'].unique()))
FacetGrid.add_legend()
plt.show()

Conclusion: Women are more likely to survive if they embarked at 'Southampton' or 'Queenstown'

Men are more likely to survive if they embarked at 'Cherbourg'

In [None]:
# Percentage of training passengers per embarkation port.
train['Embarked'].value_counts().div(len(train['Embarked'])).mul(100)

# EDA Summary Points:

1. A Person with a first class ticket is more likely to survive
2. A person who is female is more likely to survive
3. A person who is younger is more likely to survive
4. A person with 1-3 relatives is more likely to survive
5. A person who pays more as Fare is more likely to survive
6. A person with a premium cabin (B and C) is more likely to Survive
7. A female passenger is more likely to survive if she embarked at 'S' or 'Q', and a male passenger if he embarked at 'C'

# TASK: Feature Importance using RandomForestClassifier

In [None]:
# Preview the first five rows of the cleaned training frame.
train.head(5)

In [None]:
# Turn the remaining text columns into integer codes. For each column the
# encoder is fitted on train and then applied to test before moving on.
le = LabelEncoder()
for column in ("Sex", "Embarked", "CabinClass"):
    train[column] = le.fit_transform(train[column])
    test[column] = le.transform(test[column])

In [None]:
# Feature matrix (every column except the target) and target vector as NumPy arrays.
feature_frame = train.drop(columns="Survived")
X, y = feature_frame.values, train["Survived"].values

In [None]:
# Fit a forest on the full training data; it is used only to read feature importances.
forest_options = dict(n_estimators=250, n_jobs=-1, random_state=42)
rf_model = RandomForestClassifier(**forest_options)
rf_model.fit(X, y)

In [None]:
# Horizontal bars of the forest's feature importances, smallest at the bottom.
features = train.columns.drop("Survived")
importances = rf_model.feature_importances_
indices = np.argsort(importances)  # ascending importance
bar_positions = range(len(indices))

plt.figure(figsize=(16,5))
plt.barh(bar_positions, importances[indices])
plt.yticks(bar_positions, list(features[indices]))
plt.xlabel('Relative Importance')
plt.title("Feature Importance")
plt.show()

# Task: Make Predictions!

In [None]:
# Shapes of the feature matrix and target vector before splitting.
X.shape, y.shape

In [None]:
# Hold out 25% of the training data for a quick accuracy check (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

RandomForest Classifier

In [None]:
# Baseline: random forest trained on the 75% split, scored on the 25% hold-out.
rf_model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
print(holdout_accuracy)

GradientBoosting Classifier

In [None]:
# Gradient boosting trained on the 75% split, scored on the 25% hold-out.
gb_model = GradientBoostingClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
print(holdout_accuracy)

# TASK: Hyper-Parameter Tuning

In [None]:
# Randomised search over the gradient-boosting hyper-parameters, scored by
# 10-fold cross-validated accuracy on the full training data.
param_grid = {'max_depth': [1,2,3,4,5],
              'n_estimators': [100,200,300,400,500],
              'max_features': [1,2,3,4,5],
              # max_leaf_nodes must be None or greater than 1; a value of 1 makes
              # every fit of that candidate fail, wasting part of the search budget.
              'max_leaf_nodes': [2,3,4,5]}
# random_state fixes which 20 candidates are sampled, so re-running the
# notebook reproduces the same search.
grid = RandomizedSearchCV(gb_model, param_grid, cv=10, scoring='accuracy', verbose=1,
                          n_iter=20, random_state=42)
grid.fit(X, y)
grid.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best candidate found by the search.
grid.best_score_

In [None]:
# The model was fitted on a bare NumPy array, so it identifies features by
# position only. Select the test columns in the training order explicitly and
# pass an array, rather than relying on the two frames happening to share the
# same column order.
feature_columns = train.drop("Survived", axis=1).columns
predictions = grid.best_estimator_.predict(test[feature_columns].values)

# PassengerId was dropped from `test` earlier; take it from the sample
# submission file (assumed to list the test passengers in the same row order).
submission = pd.read_csv("../input/d/rahulsah06/titanic/gender_submission.csv")
output = {'PassengerId':submission['PassengerId'], 'Survived':predictions}
pd.DataFrame(output).to_csv("Result",index=False)