# TPS April 2021 - Visualization + Optuna + LGBM 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
import optuna
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from the plotting/ML
# libraries; consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
# Shared plot theme: axes background colour, a two-colour palette (passed as
# `palette=` to the seaborn plots below), and the figure resolution.
primary_bgcolor = "#f4f0ea"
primary_palette = ['#ed4f37', '#40aff5']
plt.rcParams.update({
    'axes.facecolor': primary_bgcolor,
    'figure.dpi': 120,
})

In [None]:
# Single place to change if the competition data lives somewhere else.
DATA_DIR = '../input/tabular-playground-series-apr-2021'

# Raw competition files: labelled training rows, unlabelled test rows, and the
# submission template whose 'Survived' column is filled in at the end.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# 1. EDA & Visualization

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Report (rows, columns) for both raw frames.
print(f'Train shape:  {train.shape}')
print(f'Test shape:  {test.shape}')

### Features with null elements

Which features have null elements, and what proportion of each is missing?

In [None]:
# Percentage of missing values per column, highest first, keeping only the
# columns that actually contain nulls.
null_count_df = train.isnull().sum().reset_index(name='count')
null_count_df['count'] = (null_count_df['count'] / train.shape[0]) * 100
null_count_df = null_count_df.sort_values(by='count', ascending=False)
null_count_df = null_count_df[null_count_df['count'] != 0]

fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title('Features with null elements')

# Pale full-height bars act as a 100% backdrop; one per plotted feature, so the
# length follows the data instead of being hard-coded.
sns.barplot(x=null_count_df['index'], y=[100] * len(null_count_df), color='#e9f1f2', ax=ax)
n_backdrop = len(ax.patches)
sns.barplot(data=null_count_df, x='index', y='count', palette='Set2', ax=ax)

# Label only the foreground bars; the backdrop patches come first on the axes
# and would otherwise each get a spurious "100.00%" label.
for p in ax.patches[n_backdrop:]:
    val = '{:.2f}%'.format((p.get_height()))
    ax.annotate(val, (p.get_x()+0.1, p.get_height()+1.8), va='center')

"Cabin" has the most null elements, having a null proportion of 67.87%.

### Number of people survived

How many people survived, and how many did not? Let's find out.

In [None]:
# Use the axes-level countplot so the bars are drawn on the 5x5 figure created
# here. The figure-level catplot builds its own figure and left this one empty.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=train, x='Survived', palette=primary_palette, ax=ax)
ax.set_title('Number of people survived')
plt.show()

### Numerical / Categorical features distribution - who survived?

To find out what kind of passengers did or didn't survive, let's take a look at the distribution.


In [None]:
# Columns to visualise: everything except identifiers, free-text fields and the
# target. The two continuous columns are listed by hand; the rest are treated
# as categorical.
num_cols = ['Age', 'Fare']
ft_cols = train.columns.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'])
cat_cols = ft_cols.drop(num_cols)

#### Numerical features

In [None]:
def kde_plot_num_cols(cols, train):
    """Draw one filled KDE per numerical column, split by the 'Survived' hue.

    Parameters
    ----------
    cols : sequence of str
        Column names in ``train`` to plot, laid out two per row.
    train : pandas.DataFrame
        Frame containing the columns in ``cols`` and a 'Survived' column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``cols`` is empty.
    """
    n_plots = len(cols)
    if n_plots == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    ncol = 2
    nrow = int(np.ceil(n_plots / ncol))

    # Height grows with the number of rows so panels keep the same size;
    # squeeze=False always yields a 2-D axes array, even for a single row.
    fig, axes = plt.subplots(nrow, ncol, figsize=(24, 6 * nrow), squeeze=False)
    fig.suptitle('Numerical features distribution')
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, cols):
        # `fill=` is the current name of the deprecated `shade=` argument.
        sns.kdeplot(data=train, x=col, fill=True, hue='Survived', palette=primary_palette, ax=ax)

    # Hide the trailing empty panel left over by an odd number of columns.
    for ax in flat_axes[n_plots:]:
        ax.set_visible(False)

    plt.show()

In [None]:
# KDE of each numerical feature, coloured by survival.
kde_plot_num_cols(cols=num_cols, train=train)

#### Categorical features

In [None]:
def count_plot_cat_cols(cols, train):
    """Draw one count plot per categorical column, split by the 'Survived' hue.

    Parameters
    ----------
    cols : sequence of str
        Column names in ``train`` to plot, laid out two per row.
    train : pandas.DataFrame
        Frame containing the columns in ``cols`` and a 'Survived' column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``cols`` is empty.
    """
    n_plots = len(cols)
    if n_plots == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    ncol = 2
    nrow = int(np.ceil(n_plots / ncol))

    # squeeze=False always yields a 2-D axes array, even for a single row.
    fig, axes = plt.subplots(nrow, ncol, figsize=(15, 15), squeeze=False)
    fig.suptitle('Categorial features distribution')
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, cols):
        sns.countplot(data=train, x=col, hue='Survived', alpha=0.7, palette=primary_palette, ax=ax)
        ax.legend()

    # Hide only the unused trailing panels. Slicing from n_plots is a no-op
    # when the grid is full, unlike indexing with -0, which would hide the
    # first panel.
    for ax in flat_axes[n_plots:]:
        ax.set_visible(False)

    plt.show()

In [None]:
# Count plot of each categorical feature, coloured by survival.
count_plot_cat_cols(cols=cat_cols, train=train)

# 2. Modeling

## Preprocessing

For categorical features, I will apply label encoding.

In [None]:
# Stack train and test so each column's encoder is fitted on the values that
# occur in either set.
data_combined = pd.concat([train, test], axis=0)

# Every non-float feature column gets label-encoded. The dtype mask is built
# from the same Series it indexes, so the two are always aligned.
feature_dtypes = train.drop(['PassengerId', 'Survived'], axis=1).dtypes
cat_cols = feature_dtypes[feature_dtypes != 'float64'].index.tolist()

le = LabelEncoder()

for col in cat_cols:
    # fit_transform refits the encoder on this column before encoding it.
    data_combined[col] = le.fit_transform(data_combined[col])

# Split back by position (train rows come first). Copy so that later column
# edits on either frame never write through to data_combined.
train_df = data_combined.iloc[:len(train)].copy()
test_df = data_combined.iloc[len(train):].copy()

In [None]:
# Report (rows, columns) of the encoded frames.
print(f'Train size:  {train_df.shape}')
print(f'Test size:  {test_df.shape}')

## Hyperparameter tuning (Optuna)

Let's find the optimal hyperparameters using Optuna in order to get the best score.

In [None]:
# Feature matrix and target used for hyperparameter tuning. PassengerId is a
# row identifier, and the final cross-validated model excludes it, so it is
# dropped here too; tuning and training then see the same feature set.
X = train_df.drop(['PassengerId', 'Survived'], axis=1)
y = train_df['Survived']

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: hold-out accuracy of an LGBMClassifier for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    data : pandas.DataFrame, optional
        Feature matrix to split into train / hold-out parts. Defaults to the
        notebook-level ``X`` as it was when this function was defined.
    target : pandas.Series, optional
        Labels aligned with ``data``. Defaults to the notebook-level ``y``.

    Returns
    -------
    float
        Accuracy on the 30% hold-out split (the study maximises this).
    """
    # Split the arguments rather than the globals, so passing other data works.
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=0)

    params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.02,0.05,0.005,0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        # Fixed (not sampled) settings; these do not appear in trial.params.
        'random_state': 42,
        'boosting_type': 'gbdt',
    }

    model = LGBMClassifier(**params)
    # The hold-out split doubles as the early-stopping evaluation set. With a
    # patience of 200 rounds and at most 300 estimators, stopping can only
    # trigger on the longest runs.
    # NOTE(review): newer LightGBM releases reportedly expect early stopping and
    # log verbosity to be passed as callbacks instead of these fit() keywords;
    # confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=200, verbose=False)

    return accuracy_score(y_test, model.predict(X_test))

In [None]:
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across "Restart & Run All".
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)

# Only the sampled hyperparameters; values fixed inside `objective` are absent.
best_params = study.best_trial.params
print('Best model by Optuna: ', best_params)

After a long iteration, I finally got the best parameters! Now let's move on to the prediction part.

## Prediction (LGBM)

Let's make predictions using the best parameters I got from Optuna.
I used the LightGBM classifier to make the predictions.

In [None]:
cols = [col for col in train_df.columns if col not in ['PassengerId', 'Survived']]

# best_params holds only the values Optuna sampled. Re-apply the settings that
# were fixed during tuning so the final models match the tuned configuration.
final_params = {**best_params, 'random_state': 42, 'boosting_type': 'gbdt'}

kfold = StratifiedKFold(5, shuffle=True, random_state=0)
scores = []  # validation accuracy per fold
preds = []   # test-set class predictions per fold

for fold, (train_index, valid_index) in enumerate(kfold.split(train_df[cols], train_df['Survived']), 1):
    fold_train, fold_valid = train_df.iloc[train_index], train_df.iloc[valid_index]

    lgbm_clf = LGBMClassifier(**final_params)
    lgbm_clf.fit(fold_train[cols], fold_train['Survived'])

    # Score on the held-out fold, then predict the real test set with this model.
    fold_pred = lgbm_clf.predict(fold_valid[cols])
    acc_score = accuracy_score(fold_valid['Survived'], fold_pred)
    preds.append(lgbm_clf.predict(test_df[cols]))
    scores.append(acc_score)

    print(f'Fold {fold}, Accuracy score: {acc_score:.5f}')

print(f'Average accuracy score: {np.mean(scores)}')

# Majority vote across folds: the mean of the 0/1 predictions rounded to the
# nearest class (five folds, so the mean is never exactly 0.5).
pred = np.array(preds).mean(axis=0).round()

At last, I got my final prediction! It shows an average accuracy score of 0.78431.

## Make submission

In [None]:
# Store the voted predictions as integer labels and write the submission file.
sample_submission['Survived'] = np.rint(pred).astype('int64')
sample_submission.to_csv('lgbm_submission.csv', index=False)