# Introduction

This notebook is my submission to the Kaggle Titanic competition. It is my first notebook for a Kaggle competition, so it may contain mistakes; please feel free to comment and give me advice.

## Summary
- Check whether the features contain any null values. If so, fill the missing values with the median of the feature.
- Review the distribution of the variables.
- Use the CatBoost encoder to convert the categorical features into a numerical representation based on their relationship with the target.
- Apply oversampling (SMOTE) to equalize the number of true and false cases.
- Use GridSearchCV to tune the hyperparameters of the Random Forest.
- Get the optimal hyperparameters for cross-validation and data submission.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Additional packages
import category_encoders as encoders
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_numeric_dtype

from imblearn.over_sampling import SMOTE
from sklearn import model_selection, metrics, naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC 

In [None]:
# Load the Titanic training set from the Kaggle input directory into df_train
df_train = pd.read_csv('../input/titanic/train.csv')

# Exploratory Data Analysis

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df_train.info()
print('Size of Train data set = {}'.format(df_train.shape))

In [None]:
# Drop the columns that are not used as model features: the row identifier
# (PassengerId) and the Name, Ticket and Cabin columns.
col_lst = ['PassengerId', 'Name', 'Ticket', 'Cabin']
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError for columns
# that have already been removed.
df_train = df_train.drop(columns=col_lst, errors='ignore')
# info() prints its own report and returns None, so it is not wrapped in print().
df_train.info()

## Check if there are any missing values

In [None]:
# Count the missing values in each column of the training data
df_train.isnull().sum()

In [None]:
# Number of rows for each distinct value of the Sex column
df_train['Sex'].value_counts()

## Fill the null values with the median

In [None]:
# Boolean masks computed once: rows with a known, strictly positive Age, and
# rows where Age is missing.
age_known = df_train['Age'].notna() & (df_train['Age'] > 0)
age_missing = df_train['Age'].isnull()

# Median of the known ages; age_median is reused later to impute the test set.
age_median = df_train.loc[age_known, 'Age'].median()
print('Median = {}'.format(age_median))
print('No. of records with non-null Age = {}'.format(age_known.sum()))
print('===={} of Median {} with {} Null Records===='.format('Age', age_median, age_missing.sum()))

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df_train['Age']: the in-place form acts on an intermediate object, which
# triggers a FutureWarning in recent pandas and leaves df_train unchanged once
# Copy-on-Write is enabled.
df_train['Age'] = df_train['Age'].fillna(age_median)

## Distribution of numerical variables

In [None]:
numerical = ['Age', 'SibSp', 'Parch', 'Fare']
%matplotlib inline
import matplotlib.pyplot as plt

for col in numerical:
    if is_numeric_dtype(df_train[col]) == True:
        df_train[df_train[col]>0][col].plot.hist(bins=50, grid=True, legend=None)
        plt.title(col)
        plt.show()

# Data encoding

In [None]:
CATBoostENCODE = encoders.CatBoostEncoder()
categorical = ['Pclass', 'Sex', 'Embarked']

# Cast Pclass from integer to string so that we can apply the categorical encoding later
df_train['Pclass'] = df_train['Pclass'].astype(str)

# Keep the target numeric. CatBoost encoding is a target encoder, i.e. it
# derives each category's value from statistics of the target, so passing the
# 0/1 labels as strings is at best a redundant round-trip and at worst
# depends on the library converting them back.
df_target = df_train['Survived']

# Use CatBoost to encode the categorical values
# NOTE(review): the encoder is fitted on the full training frame, before the
# train/validation split made later in the notebook, so rows that end up in the
# validation split were encoded using their own targets — confirm whether this
# leakage is acceptable for the reported validation metrics.
encoder_cat = CATBoostENCODE.fit_transform(df_train[categorical], df_target)
encoded_cat = pd.DataFrame(encoder_cat)
print(encoded_cat.head(10))

# Training and validation data

## Prepare data for modelling

In [None]:
# Modelling frame: the training data with the raw categorical columns replaced
# by their encoded counterparts (appended as the last columns). drop() returns a
# new frame, so df_train itself is left untouched.
df_model_data = pd.concat(
    [df_train.drop(columns=categorical), encoded_cat],
    axis=1,
)
df_model_data.info()

# Oversampling

In [None]:
def get_oversample(training, testing, test_size=0.3, random_state=None):
    """Split the data into train/validation parts and oversample the train part with SMOTE.

    Parameters
    ----------
    training : DataFrame or array-like
        Feature matrix. Despite the name, this is the full feature set (X),
        which is split inside this function.
    testing : DataFrame, Series or array-like
        Binary 0/1 target aligned with ``training``. Despite the name, this is
        the full target (y).
    test_size : float, default 0.3
        Fraction of the rows held out as the validation split.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and ``SMOTE``. Pass an
        integer to make the split and the synthetic samples reproducible;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_smote, Y_smote, X_train, X_test, Y_train, Y_test)`` where the
        ``*_smote`` pair is the oversampled version of the training split.
        Only the training split is oversampled; the validation split is
        returned as produced by ``train_test_split``.
    """

    def _share(target, label):
        # Fraction of entries equal to `label`. np.ravel makes this work for a
        # one-column DataFrame, a Series or a plain array, whatever the column
        # is called.
        values = np.ravel(target)
        return float((values == label).sum()) / len(values)

    smote = SMOTE(random_state=random_state)

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        training, testing, test_size=test_size, random_state=random_state)
    X_smote, Y_smote = smote.fit_resample(X_train, Y_train)

    # The "original data" proportions below are measured on the training split
    # (before oversampling), not on the full input.
    print("length of original data is ",len(training))
    print("Proportion of True data in original data is ",_share(Y_train, 1))
    print("Proportion of False data in original data is ",_share(Y_train, 0))
    print("length of oversampled data is ",len(X_smote))
    print("Proportion of True data in oversampled data is ",_share(Y_smote, 1))
    print("Proportion of False data in oversampled data is ",_share(Y_smote, 0))

    return X_smote, Y_smote, X_train, X_test, Y_train, Y_test

In [None]:
# Select the target and the features by column name rather than by position, so
# the split stays correct even if the column order of df_model_data changes.
# Y stays a one-column DataFrame (not a Series).
Y = df_model_data[['Survived']]
X = df_model_data.drop(columns=['Survived'])
X_smote, Y_smote, X_train, X_test, Y_train, Y_test = get_oversample(X, Y)

# Model - Random Forest

In [None]:
# Settings shared by the hyperparameter search
p_cv = 5  # number of folds given to StratifiedKFold
p_score = 'accuracy'  # scoring metric given to GridSearchCV

In [None]:
# Candidate values for the Random Forest grid search.
# Maximum depth of each tree:
max_depth = [7,8,9,10]
# Minimum number of samples required at each leaf node:
min_samples_leaf = [10,12,15]
# Minimum number of samples required to split a node:
min_samples_split = [10,12,15]
# Number of estimators (trees) in the forest:
estimators = [50, 100, 150]

In [None]:
# Seed the forest so that the grid search and the selected model are
# reproducible across runs; the CV splitter below was already seeded, but an
# unseeded forest still made every run produce different scores.
clf = RandomForestClassifier(random_state=5463)

# Hyperparameter grid: every combination of these values is evaluated.
forest_params_grid = {
    'n_estimators': estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# Shuffled, stratified folds with a fixed seed.
cv = model_selection.StratifiedKFold(n_splits=p_cv, random_state=5463, shuffle=True)

# n_jobs=-1 runs the search on all available cores.
model = model_selection.GridSearchCV(clf, forest_params_grid, cv=cv, scoring=p_score, n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the oversampled training data; the target is flattened
# with .values.ravel() before being passed to fit().
# NOTE(review): the data was oversampled before cross-validation, so synthetic
# rows can land in a different fold than the rows they were generated from and
# the cross-validated scores may be optimistic — confirm whether that is acceptable.
model.fit(X_smote, Y_smote.values.ravel())
# Show the winning hyperparameter combination and the refitted estimator.
print(model.best_params_)
print(model.best_estimator_)

In [None]:
def print_classification_metrics(title, y_true, y_pred, y_score):
    """Print the scoring metrics and plot the confusion matrix for one data split.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    y_true : DataFrame, Series or array-like
        True 0/1 labels.
    y_pred : DataFrame, Series or array-like
        Predicted 0/1 labels.
    y_score : array-like
        Predicted probability of the positive class (label 1). ROC AUC is
        computed from these scores: computing it from hard 0/1 predictions
        collapses the ROC curve to a single operating point.
    """
    # Flatten one-column frames to 1-D arrays, the shape the metric functions expect.
    y_true = np.ravel(y_true)
    y_pred = np.ravel(y_pred)

    print('=============================================')
    print(title)
    print('=============================================')
    print('Balanced Accuracy Score = {}'.format(metrics.balanced_accuracy_score(y_true, y_pred)))
    print('Accuracy Score = {}'.format(metrics.accuracy_score(y_true, y_pred)))
    # The target holds the integers 0/1, so no string `labels` are passed; the
    # default binary averaging scores the positive class (label 1).
    print('Precision Score = {}'.format(metrics.precision_score(y_true, y_pred)))
    print('F1 Score = {}'.format(metrics.f1_score(y_true, y_pred)))
    print('Recall Score = {}'.format(metrics.recall_score(y_true, y_pred)))
    print('ROC AUC Score = {}'.format(metrics.roc_auc_score(y_true, y_score)))
    print('Confusion Matrix')
    print('==================')
    matrix = metrics.confusion_matrix(y_true, y_pred)
    print(matrix)
    print('==================')
    # target_names only sets the row captions of the report.
    print(metrics.classification_report(y_true, y_pred, target_names=['0','1']))
    metrics.ConfusionMatrixDisplay(matrix).plot()


# Hard class predictions for the validation and (non-oversampled) training splits.
predicted_test = pd.DataFrame(model.predict(X_test))
predicted_train = pd.DataFrame(model.predict(X_train))

# Column 1 of predict_proba is the probability of the positive class (label 1).
print_classification_metrics(
    'Scoring Metrics for Random Forest (Validation)',
    Y_test, predicted_test, model.predict_proba(X_test)[:, 1])

print_classification_metrics(
    'Scoring Metrics for Random Forest (Training)',
    Y_train, predicted_train, model.predict_proba(X_train)[:, 1])

# Data Submission

In [None]:
# Read the data
df_test = pd.read_csv('../input/titanic/test.csv')
# Keep the passenger ids; they are joined to the predictions in the submission file.
df_id = df_test[['PassengerId']]

# Drop the same columns that were removed from the training data
col_lst = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df_test = df_test.drop(columns=col_lst)

# Impute missing values with medians taken from the training data. A missing
# Fare was previously replaced by 0, which contradicts the median imputation
# described in the summary and used for Age.
# The filled columns are assigned back instead of using fillna(..., inplace=True)
# on a column selection, which does not update the frame under pandas
# Copy-on-Write.
df_test['Age'] = df_test['Age'].fillna(age_median)
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].median())

# Convert the data type of Pclass from integer to string, as was done for the training data
df_test['Pclass'] = df_test['Pclass'].astype(str)

# Categorical variable encoding with the encoder fitted on the training data.
# The result is stored under its own name so that the training encodings held
# in `encoded_cat` are not overwritten.
encoded_cat_test = pd.DataFrame(CATBoostENCODE.transform(df_test[categorical]), columns=categorical)

# Prepare the dataset: the remaining columns followed by the encoded
# categorical columns, the same layout as the training features.
df_test = pd.concat([df_test.drop(columns=categorical), encoded_cat_test], axis=1)


In [None]:
# Predict survival for the test passengers with the tuned model
p_model = model.predict(df_test)

# Submission frame: the passenger ids plus a 'Survived' column with the predictions
df_rst = df_id.assign(Survived=p_model)
df_rst.to_csv("submission.csv", index=False)
print('Done!')