<h2 style="background-color:#f7e572;font-family:Rockwell;font-size:240%;text-align:center;border-radius: 20px 50px;">Tabular Playground Series: April Edition</h2>

### Notes
- There is missing data.
- There are various data types:
    - String.
    - Ordinal.
    - Ratio.
- The distributions of the train and test sets seem to be different.

### Data Types
<center><img src="https://lh5.googleusercontent.com/7jyxzQ2ObysJGLFcGB6Zc25AHAswexk68SbOh_KYa4if2P9yRe7lIC8NDUgZEcSGspqpRIGQcMx_qCmrG6sjHegFHy9Sqhp_1z3PFido6d19TKYFq0pMTHDs4OV9l6pP-MTNmeKu"></center>

<h2 style="background-color:#f7e572;font-family:Rockwell;font-size:240%;text-align:center;border-radius: 20px 50px;">Setup</h2>

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Libraries and Utilities</h3>

In [None]:
!pip install mljar-supervised

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# H2O AutoML
import h2o
from h2o.automl import H2OAutoML

# LGBM
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# MLJAR
from supervised.automl import AutoML

In [None]:
# Colors used to tell the train and test splits apart in every plot
train_color, test_color = 'blueviolet', 'lightsalmon'

# Seaborn theme for all figures
sns.set_style('white')

In [None]:
# Other colors
# #a6a65d
# #4f6446
# #f7e572

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Data Loading</h3>

In [None]:
# Locations of the competition CSV files
train_path = '../input/tabular-playground-series-apr-2021/train.csv'
test_path = '../input/tabular-playground-series-apr-2021/test.csv'

# Load both splits with pandas, using PassengerId as the row index
train = pd.read_csv(train_path, index_col='PassengerId')
test = pd.read_csv(test_path, index_col='PassengerId')

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Data Types</h3>

In [None]:
# Update data types based on the competition information

# Column roles: target, free-text, categorical and numeric columns
target = 'Survived'
str_cols = ['Name', 'Ticket']
cat_cols = ['Pclass', 'Sex']
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']

# Give both splits the same dtypes: float32 for numeric columns,
# pandas 'category' for categorical columns
for frame in (train, test):
    frame[num_cols] = frame[num_cols].astype(np.float32)
    frame[cat_cols] = frame[cat_cols].astype('category')

# Cast the target column of the training frame to boolean
train[target] = train[target].astype(bool)

<h2 style="background-color:#f7e572;font-family:Rockwell;font-size:240%;text-align:center;border-radius: 20px 50px;">Exploratory Data Analysis</h2>

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Missing Values</h3>

In [None]:
# Count every missing cell (summed over all columns) in each dataset
train_missing = train.isna().sum().sum()
test_missing = test.isna().sum().sum()

print(f"Train Dataset | Missing values: {train_missing}")
print(f"Test Dataset | Missing values: {test_missing}")

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Univariate Analysis: Continuous Features</h3>

In [None]:
# Plot overlaid train/test histograms (with KDE) for every numeric feature

# Grid layout. Rows are computed with ceiling division: round() uses banker's
# rounding (round(2.5) == 2), which would leave an odd number of features
# without enough subplot slots.
n_col = 2
n_row = max(1, -(-len(num_cols) // n_col))
size = (n_col * 10, n_row * 4.5)

# Create the figure; squeeze=False keeps `axes` 2-D even with a single row
fig, axes = plt.subplots(n_row, n_col, figsize=size, squeeze=False)
flat_axes = axes.ravel()

# One panel per feature, drawn on an explicit Axes instead of pyplot's current axes
# NOTE(review): distplot is deprecated in newer seaborn releases; histplot(..., kde=True)
# is the suggested replacement - confirm the installed version before switching.
for ax, col in zip(flat_axes, num_cols):
    sns.distplot(train[col], color=train_color, label='train', kde=True, bins=100, ax=ax)
    sns.distplot(test[col], color=test_color, label='test', kde=True, bins=100, ax=ax)
    ax.set_xlabel(col, fontsize=14)
    ax.legend()

# Hide grid cells left empty when the feature count is not a multiple of n_col
for ax in flat_axes[len(num_cols):]:
    ax.set_visible(False)

# Show image
plt.show()

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Univariate Analysis: Categorical Features</h3>

In [None]:
# Plot overlaid train/test count plots for every categorical feature

# Grid layout. Rows are computed with ceiling division: round() uses banker's
# rounding (round(0.5) == 0, round(2.5) == 2), which can leave too few subplot slots.
n_col = 2
n_row = max(1, -(-len(cat_cols) // n_col))
size = (n_col * 10, n_row * 4.5)

# Create the figure; squeeze=False keeps `axes` 2-D even with a single row
fig, axes = plt.subplots(n_row, n_col, figsize=size, squeeze=False)
flat_axes = axes.ravel()

# One panel per feature. The series is passed as `x=` explicitly: newer seaborn
# releases treat the first positional argument as `data`, not `x`.
for ax, col in zip(flat_axes, cat_cols):
    sns.countplot(x=train[col], color=train_color, label='train', ax=ax)
    sns.countplot(x=test[col], color=test_color, label='test', ax=ax)
    ax.set_xlabel(col, fontsize=14)
    ax.legend()

# Hide grid cells left empty when the feature count is not a multiple of n_col
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)

# Show image
plt.show()

<h2 style="background-color:#f7e572;font-family:Rockwell;font-size:240%;text-align:center;border-radius: 20px 50px;">Feature Engineering</h2>

In [None]:
# Features: the models use the numeric and categorical columns only
# (the columns in str_cols are not included). `target` is reused as defined
# earlier; the former `target = target` self-assignment was a no-op and is dropped.
predictors = num_cols + cat_cols

# Data: predictor matrices for both splits and the training labels
X_TRAIN = train[predictors]
Y_TRAIN = train[target]
X_TEST = test[predictors]

<h2 style="background-color:#f7e572;font-family:Rockwell;font-size:240%;text-align:center;border-radius: 20px 50px;">Starter Models</h2>

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">H2O AutoML</h3>

#### Auto ML with h2o.

In [None]:
# Run H2O, passing the memory setting for the cluster
h2o_max_mem = '16g'
h2o.init(max_mem_size=h2o_max_mem)

In [None]:
# Load data: H2O parses the CSVs itself, so the pandas dtype changes made on
# `train` / `test` do not carry over to these frames
train_h2o = h2o.import_file('../input/tabular-playground-series-apr-2021/train.csv')
test_h2o = h2o.import_file('../input/tabular-playground-series-apr-2021/test.csv')

# For binary classification, response should be a factor
train_h2o[target] = train_h2o[target].asfactor()

# Keep H2O consistent with the pandas frames: every column listed in cat_cols
# (e.g. the integer-coded Pclass) is cast to a factor in both splits so it is
# modelled as categorical rather than numeric
for col in cat_cols:
    train_h2o[col] = train_h2o[col].asfactor()
    test_h2o[col] = test_h2o[col].asfactor()

In [None]:
# H2O AutoML settings: 10 CV folds, AUC as both leaderboard sort metric and
# stopping metric, a 3600-second runtime budget and a fixed seed
automl_settings = dict(
    nfolds=10,
    sort_metric='auc',
    stopping_metric='auc',
    max_runtime_secs=3600,
    seed=0,
)
aml = H2OAutoML(**automl_settings)

# Train on the H2O training frame using the selected predictors and target
aml.train(training_frame=train_h2o, x=predictors, y=target)

In [None]:
# Show the full AutoML leaderboard (every row, not just the default head)
lb = aml.leaderboard
n_models = lb.nrows
lb.head(rows=n_models)

In [None]:
# Score the test frame with the AutoML leader model
output = aml.leader.predict(test_h2o)

# Pull the 'p1' column out of H2O (presumably the positive-class probability -
# verify against the H2O output) and threshold it at 0.5 to get flat 0/1 labels
p1_frame = h2o.as_list(output['p1'])
h2o_predictions = np.where(p1_frame.values >= 0.5, 1, 0).flatten()

In [None]:
# Store the H2O labels in the test frame's target column, then write that column
# (with the frame's index) to the submission file
test[target] = h2o_predictions
h2o_submission = test[target]
h2o_submission.to_csv('h2o_submission.csv')

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">MLJAR</h3>

#### Auto ML with MLJAR.
#### Based on [Piotr's work](https://www.kaggle.com/mt77pp/mljar-automl-tps-apr-21).

In [None]:
# Init AutoML: results go to 'mljar_result', the time budget is 7 hours,
# and models are evaluated on accuracy for a binary classification task
mljar_settings = dict(
    results_path='mljar_result',
    total_time_limit=7 * 3600,
    mode='Compete',
    ml_task='binary_classification',
    eval_metric='accuracy',
)
automl = AutoML(**mljar_settings)

In [None]:
# Train
# Fit the MLJAR AutoML object on the training predictors (X_TRAIN) and target (Y_TRAIN).
automl.fit(X_TRAIN, Y_TRAIN)

In [None]:
# Predict

# MLJAR predict doesn't work with 'category' dtypes, so work on a copy of the
# test predictors in which those columns are cast to plain objects
object_cols = ['Pclass', 'Sex']
_X_TEST = X_TEST.copy()
_X_TEST[object_cols] = _X_TEST[object_cols].astype(object)

# Threshold the model output at 0.5 to obtain 0/1 labels
mljar_output = automl.predict(_X_TEST)
mljar_predictions = np.where(mljar_output >= 0.5, 1, 0)

In [None]:
# Store the MLJAR labels in the test frame's target column, then write that column
# (with the frame's index) to the submission file
test[target] = mljar_predictions
mljar_submission = test[target]
mljar_submission.to_csv('mljar_submission.csv')

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">LightGBM</h3>

In [None]:
# Cross-validation constants: number of folds, the stratified splitter built
# from it, and the early-stopping patience
K = 10
EARLY_STOP = 10
KF = StratifiedKFold(n_splits=K)

# LightGBM training parameters
PARAMS = dict(
    objective='binary',
    num_iterations=1000,
    learning_rate=0.05,
    metric='binary_error',
)

In [None]:
# Wrap the training predictors and labels in a LightGBM Dataset
lgb_train = lgb.Dataset(data=X_TRAIN, label=Y_TRAIN)

In [None]:
# Cross-validated training on the stratified folds, with early stopping and a
# progress line every 10 rounds. return_cvbooster=True is requested because the
# prediction step reads the 'cvbooster' entry of the result.
cv_options = dict(
    folds=KF.split(X_TRAIN, Y_TRAIN),
    early_stopping_rounds=EARLY_STOP,
    verbose_eval=10,
    return_cvbooster=True,
)
booster = lgb.cv(PARAMS, lgb_train, **cv_options)

In [None]:
# Predict
cv_booster = booster['cvbooster']

# Pass the early-stopped round count explicitly: the individual fold boosters may
# not carry best_iteration themselves, in which case they would predict with all
# trained rounds. A value <= 0 (no early stop) means "use all iterations".
kfold_predictions = cv_booster.predict(X_TEST, num_iteration=cv_booster.best_iteration)

# Average the per-fold outputs, then threshold at 0.5 to obtain 0/1 labels
lgb_predictions = np.array(kfold_predictions).mean(axis=0)
lgb_predictions = np.where(lgb_predictions >= 0.5, 1, 0)

In [None]:
# Store the LightGBM labels in the test frame's target column, then write that
# column (with the frame's index) to the submission file
test[target] = lgb_predictions
lgb_submission = test[target]
lgb_submission.to_csv('lgb_submission.csv')

<h3 style="background-color:#f7e572;font-family:Rockwell;font-size:200%;text-align:center;border-radius: 20px 50px;">Ensemble</h3>

#### Vote between all models.

In [None]:
# Concat predicted labels, one column per model
predictions = pd.DataFrame({
    'h2o': h2o_predictions,
    'mljar': mljar_predictions,
    'lgb': lgb_predictions
})

# Vote: a row is labelled 1 when more than half of the models predicted 1.
# Summing the 0/1 labels is vectorised and yields a 1-D integer array, whereas
# DataFrame.mode(axis=1) works row by row and returns a 2-D result (with an
# extra column whenever a row has a tie).
n_models = predictions.shape[1]
vote_predictions = (predictions.sum(axis=1) * 2 > n_models).astype(int).values

In [None]:
# Store the voted labels in the test frame's target column, then write that
# column (with the frame's index) to the submission file
test[target] = vote_predictions
vote_submission = test[target]
vote_submission.to_csv('vote_submission.csv')