<h1><center>Titanic EDA + Prediction</center></h1>

Table of Contents: <a id=100></a>

1. [Adding basic libraries & importing the dataset](#1)
2. [Visual EDA and feature engineering](#2)
3. [Imputing missing data and handling categorical variables](#3)
4. [Model Development](#4)

## 1. Adding basic libraries & importing the dataset <a id=1></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the Titanic train and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

In [None]:
# (rows, columns) of the train frame, then of the test frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')

## 2. Visual EDA and feature engineering <a id=2></a>

In [None]:
# Correlation between the numeric features and Survived.
# The frame still contains text columns at this point, and DataFrame.corr()
# raises on non-numeric data from pandas 2.0 onwards, so the numeric columns
# are selected explicitly (this form also works on older pandas versions).
df_train.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix.
# Only numeric columns are correlated: DataFrame.corr() raises on the text
# columns still present in df_train from pandas 2.0 onwards.
sns.heatmap(df_train.select_dtypes(include='number').corr())
plt.show()

In [None]:
# Number of passengers in each Survived class.
# The style, palette and context set here are global seaborn settings and
# stay in effect for later plots until they are changed again.
sns.set_style('dark')
sns.set_palette('RdBu')
sns.set_context('poster')
sns.catplot(data=df_train, x='Survived', kind='count')
plt.show()

In [None]:
# Passenger counts per Pclass, with one bar colour per Survived value.
sns.set_palette(['Red', 'Green'])
sns.catplot(data=df_train, x='Pclass', hue='Survived', kind='count')
plt.show()

In [None]:
# Passenger counts per Sex, with one bar colour per Survived value.
sns.set_palette(['Red', 'Green'])
sns.catplot(data=df_train, x='Sex', hue='Survived', kind='count')
plt.show()

In [None]:
# Box plot of the Age distribution within each Survived class.
# The plotting context is switched to 'notebook' from here on.
sns.set_context('notebook')
sns.catplot(data=df_train, x='Survived', y='Age', kind='box')
plt.show()

In [None]:
# Bar plot of Fare for each Survived class.
sns.catplot(data=df_train, x='Survived', y='Fare', kind='bar')
plt.show()

In [None]:
# Passenger counts per Embarked value, with one bar colour per Survived value.
sns.catplot(data=df_train, x='Embarked', hue='Survived', kind='count')
plt.show()

## 3. Imputing missing data and handling categorical variables <a id=3></a>

In [None]:
# Impute missing values with statistics learned from the training set only and
# reuse them for the test set, so that preprocessing of the test rows does not
# depend on the test-set distribution.
age_mean = df_train['Age'].mean()
fare_mean = df_train['Fare'].mean()

df_train['Age'] = df_train['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)

# Cabin gets a placeholder value so that the dropna() below does not discard
# rows merely because their cabin is unknown.
df_train['Cabin'] = df_train['Cabin'].fillna('Missing')
df_test['Cabin'] = df_test['Cabin'].fillna('Missing')

# Training rows that still contain a missing value are removed.
df_train = df_train.dropna()

# Test rows are kept; a missing Fare is imputed instead.
df_test['Fare'] = df_test['Fare'].fillna(fare_mean)

In [None]:
# Remaining missing values per column in the training frame.
df_train.isna().sum()

In [None]:
# Remaining missing values per column in the test frame.
df_test.isna().sum()

In [None]:
# Remove the Name, Ticket and Cabin columns from both frames.
dropped_columns = ['Name', 'Ticket', 'Cabin']
df_train = df_train.drop(columns=dropped_columns)
df_test = df_test.drop(columns=dropped_columns)

In [None]:
# Encode Sex as an integer: male -> 0, female -> 1.
sex_map = {
    'male': 0,
    'female': 1,
}
# The column is replaced by plain assignment instead of `.loc[:, 'Sex'] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing object-dtype column,
# so the integers would stay stored as `object`, which XGBoost rejects.
# The explicit integer cast also fails loudly if any value was not in sex_map
# (for example when this cell is re-run on already-encoded data) instead of
# silently producing NaN.
df_train['Sex'] = df_train['Sex'].map(sex_map).astype('int64')
df_test['Sex'] = df_test['Sex'].map(sex_map).astype('int64')

In [None]:
# One-hot encode the Embarked column in both frames; the new columns are named
# with the 'Embarked' prefix and '_' as separator.
dummy_options = {'columns': ['Embarked'], 'prefix_sep': '_'}
df_train = pd.get_dummies(df_train, **dummy_options)
df_test = pd.get_dummies(df_test, **dummy_options)

In [None]:
# Preview the training frame after imputation and encoding.
df_train.head(n=5)

In [None]:
# Preview the test frame after imputation and encoding.
df_test.head(n=5)

In [None]:
# (rows, columns) of the processed train frame, then of the test frame.
print(df_train.shape, df_test.shape, sep='\n')

## 4. Model Development <a id=4></a>

In [None]:
# Base Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN

# Ensembling Techniques
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Model selection and metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
# Split the processed training frame into the target vector and feature matrix.
# NOTE(review): no earlier cell removes an identifier column such as
# PassengerId, so it is presumably still among the features here - confirm
# whether that is intended.
y = df_train['Survived'].to_numpy()
X = df_train.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the labelled rows for validation.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts; stratifying on y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Shapes of the four pieces produced by the train/validation split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Base Classifiers
lr = LogisticRegression(max_iter=10000)
knn = KNN()
dt = DecisionTreeClassifier()
classifiers = [('LogisticRegression', lr),
               ('KNeighborsClassifier', knn),
               ('ClassificationTree', dt)]

# Fit each base model on the training part and score it on the held-out part.
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # ROC AUC measures how well the model ranks the positives, so it is
    # computed from the predicted probability of class 1 rather than from the
    # hard 0/1 labels.
    y_score = clf.predict_proba(X_test)[:, 1]
    print(clf_name, 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

In [None]:
# Voting Classifier built from the (name, estimator) pairs in `classifiers`.
vc = VotingClassifier(estimators=classifiers)
y_pred = vc.fit(X_train, y_train).predict(X_test)

# Both scores are computed from the predicted labels on the held-out part.
vc_accuracy = accuracy_score(y_test, y_pred)
vc_roc_auc = roc_auc_score(y_test, y_pred)
print('Voting Classifier', 'Accuracy Score', vc_accuracy, " ", 'ROC AUC Score', vc_roc_auc)

In [None]:
# AdaBoost Classifier
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, whereas the first positional parameter works on both.
adb_clf = AdaBoostClassifier(dt, n_estimators=100)
adb_clf.fit(X_train, y_train)
y_pred = adb_clf.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1, not from the
# hard 0/1 labels.
y_score = adb_clf.predict_proba(X_test)[:, 1]
print('AdaBoostClassifier', 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

In [None]:
# GradientBoosting Classifier
# random_state is fixed so that repeated runs give the same model and scores.
gbt = GradientBoostingClassifier(random_state=21)
gbt.fit(X_train, y_train)
y_pred = gbt.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1, not from the
# hard 0/1 labels.
y_score = gbt.predict_proba(X_test)[:, 1]
print('GradientBoostingClassifier', 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

In [None]:
# Stochastic GradientBoostingClassifier: depth-1 trees, each fitted on an 80%
# row subsample and considering 20% of the features per split.
sgbt = GradientBoostingClassifier(max_depth=1, subsample=0.8, max_features=0.2, n_estimators=300, random_state=21)
sgbt.fit(X_train, y_train)
y_pred = sgbt.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1, not from the
# hard 0/1 labels.
y_score = sgbt.predict_proba(X_test)[:, 1]
print('Stochastic GradientBoostingClassifier', 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

In [None]:
# XGBoost
# `random_state` is the documented seed parameter of the scikit-learn style
# wrapper; it replaces the legacy `seed` alias.
xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                          random_state=123)
xg_cl.fit(X_train, y_train)
y_pred = xg_cl.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1, not from the
# hard 0/1 labels.
y_score = xg_cl.predict_proba(X_test)[:, 1]
print('XGBoost', 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

In [None]:
## Final model: the stochastic GradientBoostingClassifier configuration is the
## one used for the submission.
sgbt = GradientBoostingClassifier(max_depth=1, subsample=0.8, max_features=0.2, n_estimators=300, random_state=21)

# First report the hold-out scores of this configuration.
sgbt.fit(X_train, y_train)
y_pred = sgbt.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1, not from the
# hard 0/1 labels.
y_score = sgbt.predict_proba(X_test)[:, 1]
print('Stochastic GradientBoostingClassifier', 'Accuracy Score', accuracy_score(y_test, y_pred), " ", 'ROC AUC Score', roc_auc_score(y_test, y_score))

# Then refit on every labelled row, so the model that predicts the unlabelled
# test set is not trained on only 80% of the available data.
sgbt.fit(X, y)

In [None]:
# Predict on the test frame using exactly the training feature columns, in
# training order. Selecting by name keeps this cell re-runnable after a
# 'Survived' column has been added to df_test, and raises a clear KeyError if
# the test frame lacks a feature the model was trained on.
y_pred = sgbt.predict(df_test[X.columns])

In [None]:
# Store the predicted labels in a 'Survived' column of the test frame.
df_test = df_test.assign(Survived=y_pred)

In [None]:
# Show a short preview instead of the whole frame to keep the output compact.
df_test.head()

In [None]:
# Keep only the identifier and the prediction. Selecting the two wanted
# columns by name is robust to changes in the feature set, unlike listing
# every feature column to drop.
df_submission = df_test[['PassengerId', 'Survived']].copy()

In [None]:
# Preview the first five rows of the submission frame.
df_submission.head(n=5)

In [None]:
# Load the gender_submission.csv file shipped with the dataset and preview it.
gender_submission = pd.read_csv(
    '/kaggle/input/titanic/gender_submission.csv'
)
gender_submission.head(n=5)

In [None]:
# Write the submission frame to results.csv without the index column.
df_submission.to_csv(path_or_buf='results.csv', index=False)

### If you like it, please drop an upvote.
Check out my other notebooks:
1. https://www.kaggle.com/namanmanchanda/cat-vs-dog-classifier-10-lines-of-code-fast-ai
2. https://www.kaggle.com/namanmanchanda/star-wars-classifier

[back to top](#100)