# Tabular Playground Series - Apr-2021
https://www.kaggle.com/c/tabular-playground-series-apr-2021

# Libraries

In [None]:
import numpy as np
import pandas as pd
# Raise the pandas display limits to 500 columns and 500 rows.
_DISPLAY_LIMIT = 500
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, _DISPLAY_LIMIT)

import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

import h2o

# Report the versions of the plotting and dataframe libraries in use.
for _label, _module in (("seaborn", sns), ("pandas", pd)):
    print(_label, _module.__version__)

from pathlib import Path

# Parent directory of the current working directory.
PATH = Path.cwd().parent
# Directory holding the competition CSVs. Kept as a str with a trailing slash
# because file names are appended to it with `+` when the data is loaded.
DATA_PATH = "../input/tabular-playground-series-apr-2021/"
# Random seed passed to H2OAutoML.
SEED = 42

# Load the Data

In [None]:
# Initialise the H2O session used by every h2o call below, passing a
# min_mem_size of "15G".
# NOTE(review): confirm the host actually has that much memory available.
h2o.init(min_mem_size="15G")

In [None]:
# Read train.csv from DATA_PATH into an H2O frame and report its shape.
train = h2o.import_file(DATA_PATH + "train.csv")
print(train.shape)
# Last expression of the cell: its result is what the notebook displays.
train.head()

# Basic Stats

In [None]:
# Summarise the raw training frame, before any preprocessing is applied.
train.describe()

# Features

In [None]:
# Column to predict.
target = "Survived"

# Role of every other column: ignored, numeric, free text, categorical.
remove = ['PassengerId', 'Ticket', 'Cabin']
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
text_cols = ['Name']
cat_cols = ['Sex', 'Embarked', 'Pclass']
features = [*num_cols, *cat_cols, *text_cols]

# Cross-check the declared columns against the columns actually in `train`:
# first the frame columns that have no declared role, then the declared
# columns that the frame does not contain.
_declared_cols = [*features, *remove, target]
_frame_cols = train.columns
print("missing", [name for name in _frame_cols if name not in _declared_cols])
print("not founded", [name for name in _declared_cols if name not in _frame_cols])
print("features", len(features), features)

# Check Nulls

In [None]:
# Print the NA count of every feature column, one frame at a time.
dfs = {"train": train}
for frame_name, frame in dfs.items():
    print("--------", frame_name, frame.shape, "--------")
    for col in features:
        na_count = frame[col].isna().sum()
        print(col, na_count)

# Preprocessing

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

def mk_discrete_values(hdf, col, n_bins=10, sufix="_disc"):
    """Add an ordinal-binned copy of ``col`` to the frame.

    The column is exported with ``as_data_frame()``, binned with
    scikit-learn's ``KBinsDiscretizer`` (ordinal encoding), and the resulting
    bin indices are assigned back to ``hdf`` under the name ``col + sufix``.

    Args:
        hdf: frame to extend; the new column is assigned onto this object.
        col: name of the column to discretize.
        n_bins: number of bins requested from the discretizer.
        sufix: suffix appended to ``col`` to name the new column.

    Returns:
        The same ``hdf`` object, now carrying the extra column.
    """
    raw_values = hdf[col].as_data_frame().values
    binner = KBinsDiscretizer(n_bins=n_bins, encode="ordinal")
    # Flatten the (n_rows, 1) result so it is uploaded as a single column.
    bin_ids = binner.fit_transform(raw_values).ravel()
    hdf[col + sufix] = h2o.H2OFrame(bin_ids)
    return hdf

def mk_imputation(hdf, col, by, method="median"):
    """Impute ``col`` on ``hdf``, grouped by the ``by`` columns.

    Delegates to ``hdf.impute(col, method=method, by=by)``. The value returned
    by ``impute`` is discarded and ``hdf`` itself is handed back, so this
    relies on ``impute`` modifying the frame in place.
    NOTE(review): the default method is "median"; confirm that is appropriate
    when ``col`` is a categorical column.

    Args:
        hdf: frame whose column is imputed.
        col: name of the column to impute.
        by: list of column names to group on.
        method: imputation method forwarded to ``hdf.impute``.

    Returns:
        The ``hdf`` object that was passed in.
    """
    hdf.impute(col, by=by, method=method)
    return hdf

def get_names(hdf):
    """Split the "Name" column on "," and append the pieces to the frame.

    The split result is renamed to ``Last_Name`` and ``First_Name``, so this
    assumes the split yields exactly two columns.

    Args:
        hdf: frame containing a "Name" column.

    Returns:
        The result of ``hdf.cbind(...)`` with the two name columns added.
    """
    name_parts = hdf["Name"].strsplit(",")
    name_parts.columns = ["Last_Name", "First_Name"]
    return hdf.cbind(name_parts)

def preprocess(hdf, as_trainning=True):
    """Run the full preprocessing pipeline on a frame.

    Steps, in order: convert the categorical columns (and, when training,
    the target) to factors; impute "Age" and "Fare" by group; add a binned
    "Fare_disc" column; impute "Embarked" by "Pclass" and "Fare_disc"; split
    "Name" into "Last_Name" and "First_Name".

    Args:
        hdf: frame to preprocess.
        as_trainning: when True the target column is also converted to a
            factor; pass False for frames where that conversion must be
            skipped.

    Returns:
        The preprocessed frame.
    """
    # Categorical predictors (and the target, when training) become factors.
    hdf[cat_cols] = hdf[cat_cols].asfactor()
    if as_trainning:
        hdf[target] = hdf[target].asfactor()

    # Numeric gaps are filled per group: column -> grouping columns.
    group_keys_by_column = {"Age": ["Pclass", "Sex"], "Fare": ["Pclass"]}
    for column, group_keys in group_keys_by_column.items():
        hdf = mk_imputation(hdf, column, group_keys)

    # "Embarked" goes last: its groups use "Fare_disc", which only exists once
    # "Fare" has been imputed and discretized (default "_disc" suffix).
    # NOTE(review): mk_imputation defaults to method="median" while "Embarked"
    # is a factor here; confirm that behaves as intended.
    hdf = mk_discrete_values(hdf, "Fare")
    hdf = mk_imputation(hdf, "Embarked", ["Pclass", "Fare_disc"])

    return get_names(hdf)

# Preprocess the training frame; as_trainning keeps its default of True, so
# the target column is converted to a factor as well.
train = preprocess(train)

In [None]:
# Summarise the feature columns after preprocessing; at this point `features`
# still lists the raw "Name" column.
train[features].describe()

# Model

In [None]:
from h2o.automl import H2OAutoML

# Model on the surname instead of the raw name: swap "Name" for the
# "Last_Name" column produced during preprocessing. Both steps are guarded so
# the cell can be re-run safely: an unconditional remove() raises ValueError
# the second time, and an unconditional append() could list "Last_Name" twice.
if "Name" in features:
    features.remove("Name")
if "Last_Name" not in features:
    features.append("Last_Name")

# The AutoML run is bounded both by wall-clock time (6 hours) and by model
# count (100); logloss is used for early stopping and to sort the leaderboard.
aml = H2OAutoML(
    max_runtime_secs=6 * 3600,
    seed=SEED,
    stopping_metric='logloss',
    sort_metric='logloss',
    max_models=100,
)

aml.train(x=features, y=target, training_frame=train)

# Leaderboard of the trained models; head() is the cell's displayed output.
lb = aml.leaderboard
lb.head()

# Submission

In [None]:
# Read test.csv from DATA_PATH into an H2O frame and report its shape.
test = h2o.import_file(DATA_PATH + "test.csv")
print(test.shape)
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not display its result.
test.head()

# Same pipeline as for train, but with as_trainning=False so the target
# column is not converted.
test = preprocess(test, False)

In [None]:
# Score the preprocessed test frame with the AutoML leader model.
predictions = aml.leader.predict(test)
# Last expression of the cell: displays the prediction frame.
predictions

In [None]:
# Pair each test PassengerId with its predicted label, side by side.
_passenger_ids = test.as_data_frame()["PassengerId"]
_predicted_labels = predictions.as_data_frame()["predict"]
submission = pd.concat([_passenger_ids, _predicted_labels], axis=1)
# Rename the prediction column to the target name.
submission.columns = ["PassengerId", target]
submission.head()

In [None]:
# Write the submission file without the pandas index column.
submission.to_csv("submission.csv", index=False)