# Intro
Welcome to the [Tabular Playground Series - Apr 2021](https://www.kaggle.com/c/tabular-playground-series-apr-2021) competition.

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26478/logos/header.png)

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import numpy as np
import pandas as pd
import scipy.special
import matplotlib.pyplot as plt
import os
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

import warnings
warnings.filterwarnings("ignore")

# Functions
We define some helper functions for visualizations.

In [None]:
def plot_bar_survivor(data, feature, rot=False):
    """Compare the distribution of a feature between survived and not survived.

    Draws two bar charts side by side with a shared y axis: the left one for
    rows where ``data['Survived'] == 1``, the right one for rows where
    ``data['Survived'] == 0``. Bar heights are percentages of the rows in the
    respective group.

    Args:
        data: DataFrame containing a 'Survived' column and the column `feature`.
        feature: Name of the column whose value distribution is plotted.
        rot: If truthy, rotate the x tick labels by 45 degrees.
    """
    # Compute both distributions before any figure is created.
    panels = []
    for title, flag, color in (('Survived', 1, 'yellowgreen'),
                               ('Not Survived', 0, 'sandybrown')):
        group = data[data['Survived'] == flag]
        counts = group[feature].value_counts().sort_index()
        # Percentages are relative to the number of rows in the group.
        panels.append((title, 100 * counts / len(group.index), color))

    fig, axs = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
    for ax, (title, percent, color) in zip(axs, panels):
        ax.bar(percent.index.tolist(), percent.tolist(), color=color)
        ax.grid()
        ax.set_title(title)
        if rot:
            # Rotate the existing tick labels instead of overwriting them:
            # set_xticklabels() without fixed tick positions can attach the
            # labels to the wrong ticks when the feature is numeric.
            ax.tick_params(axis='x', labelrotation=45)
    axs[0].set_ylabel('%')
    plt.show()
    
def plot_bar_compare(train, test, name, rot=False):
    """Compare the distribution of a column between train and test data.

    Draws two bar charts side by side with a shared y axis: the left one for
    `train`, the right one for `test`. Bar heights are percentages of the rows
    in the respective DataFrame.

    Args:
        train: DataFrame containing the column `name`.
        test: DataFrame containing the column `name`.
        name: Name of the column whose value distribution is plotted.
        rot: If truthy, rotate the x tick labels by 45 degrees.
    """
    # Compute both distributions before any figure is created.
    panels = []
    for title, frame, color in (('Train data', train, 'yellowgreen'),
                                ('Test data', test, 'sandybrown')):
        counts = frame[name].value_counts().sort_index()
        # Percentages are relative to the number of rows in the frame.
        panels.append((title, 100 * counts / len(frame.index), color))

    fig, axs = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
    for ax, (title, percent, color) in zip(axs, panels):
        ax.bar(percent.index.tolist(), percent.tolist(), color=color)
        ax.grid()
        ax.set_title(title)
        if rot:
            # Rotate the existing tick labels instead of overwriting them:
            # set_xticklabels() without fixed tick positions can attach the
            # labels to the wrong ticks when the column is numeric.
            ax.tick_params(axis='x', labelrotation=45)
    axs[0].set_ylabel('%')
    plt.show()

# Path

In [None]:
# Directory that holds the competition's input files.
path = '/kaggle/input/tabular-playground-series-apr-2021/'
# List the files available in that directory.
os.listdir(path)

# Load Data

In [None]:
# Read the train, test and sample-submission CSV files from the input directory.
train_data, test_data, samp_subm = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Overview

In [None]:
# Row counts of the two data sets.
print('Number train samples:', train_data.shape[0])
print('Number test samples:', test_data.shape[0])

In [None]:
# Preview the first rows of the training data.
train_data.head()

Number of missing values per column:

In [None]:
# Count the missing entries in every column of the training data.
train_data.isna().sum()

# Handle Missing Values

In [None]:
# Names of the columns that contain at least one missing value.
cols_with_missing_train = train_data.columns[train_data.isnull().any().to_numpy()].tolist()
cols_with_missing_test = test_data.columns[test_data.isnull().any().to_numpy()].tolist()

In [None]:
# Show which columns contain missing values in each data set.
print('train columns with missing data:', cols_with_missing_train)
print('test columns with missing data:', cols_with_missing_test)

## Feature Age

In [None]:
# Mean and standard deviation of the known training ages, truncated to integers.
known_ages = train_data['Age'].dropna()
age_mean = int(known_ages.mean())
age_std = int(known_ages.std())
age_mean, age_std

In [None]:
def fill_age(s):
    """Return `s` unchanged unless it is NaN; otherwise draw a replacement.

    The replacement is a random integer taken from
    ``range(age_mean - age_std, age_mean + age_std)``, where ``age_mean`` and
    ``age_std`` are read from the module-level variables of the same name.
    """
    if np.isnan(s):
        lower, upper = age_mean - age_std, age_mean + age_std
        return random.randrange(lower, upper)
    return s

In [None]:
# Replace missing ages in both data sets (train is processed first, then test).
train_data['Age'], test_data['Age'] = (
    train_data['Age'].apply(fill_age),
    test_data['Age'].apply(fill_age),
)

## Feature Cabin

In [None]:
# Mark missing cabins with the placeholder label 'Unknown'.
train_data['Cabin'] = train_data['Cabin'].fillna(value='Unknown')
test_data['Cabin'] = test_data['Cabin'].fillna(value='Unknown')

## Feature Embarked

In [None]:
# Mark missing embarkation values with the placeholder label 'Unknown'.
train_data['Embarked'] = train_data['Embarked'].fillna(value='Unknown')
test_data['Embarked'] = test_data['Embarked'].fillna(value='Unknown')

## Feature Fare

In [None]:
# Fill missing fares in both data sets with the mean fare of the training data.
mean = train_data['Fare'].mean()
train_data['Fare'] = train_data['Fare'].fillna(value=mean)
test_data['Fare'] = test_data['Fare'].fillna(value=mean)

# Exploratory Data Analysis

Age:

In [None]:
# Age distribution of survivors vs. non-survivors in the training data.
plot_bar_survivor(data=train_data, feature='Age')

Sex

In [None]:
# Sex distribution of survivors vs. non-survivors in the training data.
plot_bar_survivor(data=train_data, feature='Sex')

Fare

In [None]:
# Fare distribution of survivors vs. non-survivors in the training data.
plot_bar_survivor(data=train_data, feature='Fare')

SibSp

In [None]:
# SibSp distribution of survivors vs. non-survivors in the training data.
plot_bar_survivor(data=train_data, feature='SibSp')

# Feature Engineering

In [None]:
# Reduce every cabin value to its first character.
train_data['Cabin'] = train_data['Cabin'].str.get(0)
test_data['Cabin'] = test_data['Cabin'].str.get(0)

# Encode Data

Categorical Features

In [None]:
# Integer-encode the categorical columns of both data sets.
features_cat = ['Sex', 'Cabin', 'Embarked']
le = LabelEncoder()
for col in features_cat:
    # Fit on the labels of both data sets: transform() raises ValueError for
    # labels it has not seen during fit, which would happen for any category
    # that occurs only in the test data. When the test data contains no such
    # category, the resulting codes are the same as when fitting on train only.
    le.fit(pd.concat([train_data[col], test_data[col]], ignore_index=True))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

# Correlation Matrix

In [None]:
# Pairwise correlation of the numeric/boolean columns. The selection is explicit
# because recent pandas versions no longer drop non-numeric columns in corr()
# and raise on string columns instead.
corr = train_data.select_dtypes(include=['number', 'bool']).corr()
# Styler.set_precision() is no longer available in recent pandas versions;
# an explicit format string gives the same two-decimal display.
corr.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')

# Select Features

In [None]:
# Columns that are excluded when the feature matrices are built.
no_features = [
    'Survived',
    'Name',
    'Ticket',
    'PassengerId',
]

# Define Train And Test Data

In [None]:
# Target: the 'Survived' column of the training data.
y_train = train_data['Survived']
# Features: every column not listed in `no_features`, taken as shallow copies.
X_train = train_data.loc[:, train_data.columns.difference(no_features)].copy(deep=False)
X_test = test_data.loc[:, test_data.columns.difference(no_features)].copy(deep=False)

# Scale Data

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
min_max = MinMaxScaler()
min_max.fit(X_train)
X_train_scaled = min_max.transform(X_train)
X_test_scaled = min_max.transform(X_test)

# Define Model

In [None]:
# Create an XGBClassifier with its default parameters and fit it on the
# scaled training features.
model = XGBClassifier()
model.fit(X_train_scaled, y_train)

# Analyse Results

Feature Importance

In [None]:
# Horizontal bar chart of the fitted model's feature importances, in percent.
fig = plt.figure(figsize=(10, 6))
importance = model.feature_importances_
x = X_train.columns.values
plt.barh(y=x, width=importance * 100)
plt.xlabel('Percentage')
plt.title('Feature Importance', loc='left')
plt.grid()
plt.show()

Confusion Matrix Of The Train Data:

In [None]:
# Predict on the training features and compare with the training labels.
y_train_pred = model.predict(X_train_scaled)
conf_mat = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

# Plot the matrix with normalized values only (absolute counts hidden).
fig, ax = plot_confusion_matrix(
    conf_mat=conf_mat,
    show_absolute=False,
    show_normed=True,
    figsize=(6, 6),
)
fig.show()

# Predict Test Data

In [None]:
# Model predictions for the scaled test features. Despite its name, `y_test`
# holds predicted values, not ground-truth labels.
y_test = model.predict(X_test_scaled)

# Export Results

In [None]:
# Pair the sample submission's passenger ids with the predictions and write
# the result to 'submission.csv' without the index column.
output = samp_subm[['PassengerId']].assign(Survived=y_test)
output.to_csv('submission.csv', index=False)

In [None]:
# Number of rows per predicted value in the submission.
output['Survived'].value_counts()