# Table of Contents

+ [Preparation](#Preparation)
  + [Load packages](#Load-packages)
  + [Load dataset](#Load-dataset)
+ [Data pre-processing](#Data-pre-processing)
  + [Missing values](#Handle-missing-values)
  + [Categorical features](#Handle-categorical-features)
    + [`Cabin` column](#Cabin-column)
    + [`Name` column](#Name-column)
    + [`Ticket` column](#Ticket-column)
    + [Ordinal features](#Ordinal-features)
    + [Category features](#Category-features)
  + [Numerical features](#Numerical-features)
    + [Binning](#Binning)
+ [Feature engineering](#Feature-engineering)
+ [Prepare dataset](#Prepare-dataset)
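+ [Model building](#Model-building)
  + [Logistic regression](#Logistic-Regression)
  + [RandomForest classifier](#RandomForest-classifier)
  + [XGBoost](#XGBoost-classifier)
  + [CatBoost](#Catboost-classifier)
  + [LGBM](#LightGbm-classifier)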
+ Final submission

# Preparation

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load packages

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
import xgboost
xgboost.__version__

## Load dataset

In [None]:
# Both competition CSVs sit in the same Kaggle input folder.
_data_dir = '/kaggle/input/tabular-playground-series-apr-2021'
train_filepath = f'{_data_dir}/train.csv'
test_filepath = f'{_data_dir}/test.csv'

In [None]:
# Load both splits with the PassengerId column as the row index.
read_opts = {'index_col': 'PassengerId'}
train = pd.read_csv(train_filepath, **read_opts)
test = pd.read_csv(test_filepath, **read_opts)

In [None]:
# Report the shape of each split.
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label} dataset shape: {frame.shape}')

In [None]:
# View a summary of each column in the train dataset
train.info()

# Data pre-processing

## Handle missing values

In [None]:
# Check for missing values in train dataset
train.isnull().sum()

In [None]:
# Check for missing values in test dataset
test.isnull().sum()

To handle missing values, we follow these imputation rules:
+ Age, Fare  
  Impute missing values with the column mean
+ Ticket, Cabin, Embarked  
  Impute missing values with the placeholder string 'None'

Every transformation applied to the train dataset is also applied to the test dataset.

In [None]:
feature = 'Age'
# Compute the fill value from the training data only and reuse it for the test
# data, so both splits receive exactly the same imputation.
fill_value = train[feature].mean()
train[feature] = train[feature].fillna(fill_value)
test[feature] = test[feature].fillna(fill_value)

In [None]:
feature = 'Fare'
# Compute the fill value from the training data only and reuse it for the test
# data, so both splits receive exactly the same imputation.
fill_value = train[feature].mean()
train[feature] = train[feature].fillna(fill_value)
test[feature] = test[feature].fillna(fill_value)

In [None]:
feature = 'Ticket'
# Missing values get the placeholder string 'None' in both splits.
for frame in (train, test):
    frame[feature] = frame[feature].fillna('None')

In [None]:
feature = 'Cabin'
# Missing values get the placeholder string 'None' in both splits.
for frame in (train, test):
    frame[feature] = frame[feature].fillna('None')

In [None]:
feature = 'Embarked'
# Missing values get the placeholder string 'None' in both splits.
for frame in (train, test):
    frame[feature] = frame[feature].fillna('None')

In [None]:
train.info()

In [None]:
test.info()

## Handle categorical features

### `Cabin` column

In [None]:
train['Cabin'].value_counts()

In [None]:
train['Cabin'].str[0].value_counts()

In [None]:
# Reduce every Cabin value to its leading character.
for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].str.get(0)

### `Name` column

In [None]:
train['Name'].value_counts()

In [None]:
# Split LastName, FirstName => LastName
def split_name(name):
    """Return the text before the first comma in ``name``, with surrounding whitespace removed."""
    last_name, _, _ = name.partition(',')
    return last_name.strip()

split_name('Johnson, John')

In [None]:
# Build the LastName column by running split_name over every Name value.
for frame in (train, test):
    frame['LastName'] = frame['Name'].map(split_name)

In [None]:
train['LastName'].value_counts()

In [None]:
# Save only first letter of LastName
train['LastName'].str[0].value_counts()

In [None]:
# Keep only the leading character of LastName.
for frame in (train, test):
    frame['LastName'] = frame['LastName'].str.get(0)

### `Ticket` column

In [None]:
train['Ticket'].value_counts()

In [None]:
train['Ticket'].str[0].value_counts()

In [None]:
# Keep only the leading character of Ticket.
for frame in (train, test):
    frame['Ticket'] = frame['Ticket'].str.get(0)

In [None]:
train.head()

### Ordinal features

In [None]:
ordinal_cols = ['Pclass']

# There is nothing to do as Pclass is already numerical

### Category features

In [None]:
# Remove the 'Name' column from both splits, in place.
for frame in (train, test):
    frame.drop(columns=['Name'], inplace=True)

In [None]:
train.head()

In [None]:
# Convert categorical features into numerical using pd.get_dummies.
# Train defines the feature layout: the first level of each categorical column
# is dropped there as the baseline. Test is encoded in full and then reindexed
# to the train feature columns, so a level that is missing from (or only
# present in) test cannot change which baseline is dropped or leave the two
# frames with different columns. Columns absent from test are filled with 0.
train = pd.get_dummies(train, drop_first=True)
test = pd.get_dummies(test).reindex(
    columns=train.columns.drop('Survived'), fill_value=0
)

## Numerical features

### Binning

In [None]:
# Age column
train['Age'].describe()

In [None]:
age_bins = np.linspace(0, 100, 6)  # [0, 20, 40, .. 100]

# pd.cut bins are right-closed by default, which would turn an Age of exactly 0
# into NaN; include_lowest=True closes the first bin on the left as well.
train['Age_binned'] = pd.cut(train['Age'], bins=age_bins, labels=False, include_lowest=True)
test['Age_binned'] = pd.cut(test['Age'], bins=age_bins, labels=False, include_lowest=True)

In [None]:
# Fare column
train['Fare'].describe()

In [None]:
# Fare bin edges; passed as `bins` to pd.cut when building Fare_binned
fare_bins = [0, 10, 20, 30, 50, 100, 500, 1000]
fare_bins

In [None]:
# Fare column
# pd.cut bins are right-closed by default, which would turn a Fare of exactly 0
# into NaN; include_lowest=True closes the first bin on the left as well.
train['Fare_binned'] = pd.cut(train['Fare'], bins=fare_bins, labels=False, include_lowest=True)
test['Fare_binned'] = pd.cut(test['Fare'], bins=fare_bins, labels=False, include_lowest=True)

In [None]:
train['Fare_binned'].value_counts()

In [None]:
# Drop the raw Age and Fare columns from both splits, in place.
raw_numeric = ['Age', 'Fare']
for frame in (train, test):
    frame.drop(columns=raw_numeric, inplace=True)

In [None]:
train.head()

In [None]:
# Check for any NaN values
train.isnull().sum()

In [None]:
# Report the shape of each split at this point in the notebook.
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label} data shape: {frame.shape}')

## Feature engineering

Create a new column 'WithFamily' that is 1 when SibSp + Parch > 0 and 0 otherwise.

In [None]:
# True wherever SibSp + Parch is positive.
for frame in (train, test):
    frame['WithFamily'] = frame['SibSp'].add(frame['Parch']).gt(0)

In [None]:
# Convert the WithFamily flag to integers.
for frame in (train, test):
    frame['WithFamily'] = frame['WithFamily'].astype('int64')

In [None]:
train['WithFamily'].value_counts()

In [None]:
train.head()

In [None]:
# Drop the SibSp and Parch columns from both splits, in place.
family_cols = ['SibSp', 'Parch']
for frame in (train, test):
    frame.drop(columns=family_cols, inplace=True)

In [None]:
train.head()

## Prepare dataset
+ Training data
+ Validation data
+ Testing data

In [None]:
# Separate the 'Survived' target from the features; test is used as-is.
y_train_full = train['Survived']
X_train_full = train.drop(columns=['Survived'])
X_test = test

In [None]:
# Hold out 30% of the rows for validation, with a fixed seed.
split_parts = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.3,
    random_state=41,
)
X_train, X_val, y_train, y_val = split_parts

for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

## Model building

### Logistic Regression

In [None]:
model_lr = LogisticRegression()

In [None]:
model_lr.fit(X_train, y_train)

In [None]:
y_val_preds = model_lr.predict(X_val)

In [None]:
acc_lr = accuracy_score(y_val, y_val_preds)

print(f'Logistic Regression: {acc_lr:.4f}')

### RandomForest classifier

In [None]:
model_rf = RandomForestClassifier(random_state=41)

In [None]:
model_rf.fit(X_train, y_train)

In [None]:
y_val_preds = model_rf.predict(X_val)

In [None]:
acc_rf = accuracy_score(y_val, y_val_preds)

print(f'RandomForest: {acc_rf:.4f}')

### XGBoost classifier

In [None]:
# XGBoost classifier with a fixed seed and log-loss as the evaluation metric.
# NOTE(review): use_label_encoder may be deprecated or ignored by newer XGBoost
# releases — confirm against the version printed in the "Load packages" section.
model_xgb = XGBClassifier(random_state = 41,
                         use_label_encoder=False,
                         eval_metric='logloss')

In [None]:
model_xgb = model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

In [None]:
y_val_preds = model_xgb.predict(X_val)

In [None]:
acc_xgb = accuracy_score(y_val, y_val_preds)

print(f'XGBoost: {acc_xgb:.4f}')

In [None]:
def prepare_submission(model, X=None, filename='submission.csv'):
    """Predict with ``model`` and write a two-column submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X : pandas.DataFrame, optional
        Rows to predict on; its index supplies the ``PassengerId`` column.
        Defaults to the notebook-level ``X_test``.
    filename : str, optional
        Destination path. Pass a distinct name per model to keep every
        submission instead of overwriting ``submission.csv``.

    Returns
    -------
    None
        The CSV file is the only output.
    """
    features = X_test if X is None else X
    # Flatten so an (n, 1) prediction array still maps onto a single column.
    y_preds = np.asarray(model.predict(features)).ravel()
    out = pd.DataFrame({'PassengerId': features.index, 'Survived': y_preds})
    out.to_csv(filename, index=False)

In [None]:
prepare_submission(model_xgb) # Kaggle score: 0.77948

### Catboost classifier

In [None]:
# CatBoost classifier: verbose=0, 'Accuracy' as the eval metric, fixed seed.
catb_params = {
    'verbose': 0,
    'eval_metric': 'Accuracy',
    'random_state': 41,
}
model_catb = CatBoostClassifier(**catb_params)

In [None]:
model_catb = model_catb.fit(X_train, y_train)

In [None]:
y_val_preds = model_catb.predict(X_val)

In [None]:
acc_catb = accuracy_score(y_val, y_val_preds)

print(f'Catboost: {acc_catb:.4f}')

In [None]:
prepare_submission(model_catb) # Kaggle score: 0.78089

### LightGbm classifier

In [None]:
# LightGBM classifier with a fixed seed and row-wise histogram building.
# LightGBM has no built-in metric called 'Accuracy'; 'binary_error'
# (1 - accuracy) is the documented equivalent for binary classification.
model_lgbm = LGBMClassifier(
    verbose=0,
    random_state=41,
    metric='binary_error',
    force_row_wise=True
)

In [None]:
model_lgbm.fit(X_train, y_train)

In [None]:
y_val_preds = model_lgbm.predict(X_val)

In [None]:
acc_lgbm = accuracy_score(y_val, y_val_preds)

print(f'LGBM: {acc_lgbm:.4f}')

In [None]:
prepare_submission(model_lgbm)