In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install python-vivid

In [None]:
import matplotlib.pyplot as plt

def preprocess(input_df: pd.DataFrame):
    """Return a copy of ``input_df`` with the ``Cabin`` column split into parts.

    ``Cabin`` values are split on ``"/"`` and the pieces are appended as new
    columns named ``Cabin_0``, ``Cabin_1``, ... Missing cabins are replaced by
    ``"//"`` before splitting, so they produce empty strings (not NaN) in the
    part columns.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame containing a ``Cabin`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        All original columns followed by the ``Cabin_<i>`` columns, with the
        same index and row order as ``input_df``.
    """
    output_df = input_df.copy()

    # expand=True keeps the original index, so the concat below aligns row by
    # row even when ``input_df`` does not carry a default RangeIndex
    # (e.g. after filtering or sampling).
    cabin_parts = (
        input_df["Cabin"]
        .fillna("//")
        .str.split("/", expand=True)
        .add_prefix("Cabin_")
    )

    return pd.concat([output_df, cabin_parts], axis=1)

In [None]:
# Read-only Kaggle dataset directory and the writable output directory.
INPUT_DIR = "/kaggle/input/spaceship-titanic/"
OUTPUT_DIR = "/kaggle/working/"

# Load each split from INPUT_DIR and run it through preprocess().
train_df, test_df = (
    preprocess(pd.read_csv(os.path.join(INPUT_DIR, csv_name)))
    for csv_name in ("train.csv", "test.csv")
)

### Simple EDA 

A quick look at how the unique values of each column overlap between the train and test sets (one Venn diagram per column).

In [None]:
from matplotlib_venn import venn2
from typing import List

def get_uniques(input_df: pd.DataFrame, column):
    """Collect the distinct non-missing values of ``column`` as a ``set``."""
    non_missing = input_df[column].dropna()
    return set(non_missing.unique())

def plot_intersection(
    left: pd.DataFrame,
    right: pd.DataFrame,
    target_column: str,
    ax: plt.Axes = None,
    set_labels: List[str]=None
):
    """Draw a two-set Venn diagram of one column's unique values.

    The sets compared are the distinct non-missing values of
    ``target_column`` in ``left`` and in ``right`` (see ``get_uniques``).

    Parameters
    ----------
    left, right : pd.DataFrame
        Frames that both contain ``target_column``.
    target_column : str
        Column whose unique values are compared; also used as the axes title.
    ax : plt.Axes, optional
        Axes to draw on. Defaults to the current axes.
    set_labels : list of str, optional
        Labels for the left and right sets. Defaults to ``("Train", "Test")``.
    """
    # The signature allows ax=None, but the title is set on ``ax`` below, so
    # resolve the default to the current axes instead of failing on None.
    if ax is None:
        ax = plt.gca()

    venn2(
        subsets=(get_uniques(left, target_column), get_uniques(right, target_column)),
        set_labels=set_labels or ("Train", "Test"),
        ax=ax,
    )
    ax.set_title(target_column)

In [None]:
# One Venn diagram per column of the test frame, laid out on a grid.
target_columns = test_df.columns.tolist()
n_cols = 5
n_rows = - (- len(target_columns) // n_cols)  # ceiling division

fig, axes = plt.subplots(figsize=(4 * n_cols, 3 * n_rows), ncols=n_cols, nrows=n_rows)

for c, ax in zip(target_columns, np.ravel(axes)):
    plot_intersection(train_df, test_df, target_column=c, ax=ax)

# The grid can hold more cells than there are columns; hide the unused axes
# so they do not render as empty frames.
for ax in np.ravel(axes)[len(target_columns):]:
    ax.set_axis_off()

### Define Features

* Filter (use the selected columns as they are; nothing to do)
* CountEncoding (replace each value with the number of times it appears)
* OneHotEncoding (one 0/1 indicator column per distinct value)

In [None]:
from vivid.features.base import FilterBlock, CountEncodingBlock, OneHotEncodingBlock

# Feature blocks shared by every estimator.
base_features = [
    # Columns used as they are.
    FilterBlock(
        "Filter",
        column=[
            "CryoSleep", "Age", "VIP",
            "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
        ],
    ),
    # Count encoding; the id, the raw cabin string and the target are excluded.
    CountEncodingBlock(
        "CE",
        excludes=["PassengerId", "Cabin", "Transported"],
    ),
    # One-hot encoding, including the Cabin parts added by preprocess().
    OneHotEncodingBlock(
        "OH",
        column=["HomePlanet", "Destination", "Cabin_0", "Cabin_1", "Cabin_2"],
    ),
]

### Estimator

* LightGBM Classifier
* XGBoost Classifier

In [None]:
from vivid.estimators.boosting import LGBMClassifierBlock
from vivid.estimators.boosting import XGBClassifierBlock
from vivid.estimators.boosting.block import create_boosting_seed_blocks

# One create_boosting_seed_blocks() result per boosting library, each built
# on the shared feature blocks.
estimators = [
    create_boosting_seed_blocks(feature_class=block_class, parent=base_features, prefix=prefix)
    for block_class, prefix in (
        (LGBMClassifierBlock, "lgbm_"),
        (XGBClassifierBlock, "xgb_"),
    )
]

In [None]:
from vivid.estimators.linear import TunedLogisticBlock

In [None]:
# Second stage: a tuned logistic block whose parents are the boosting estimators.
stacked_models = [TunedLogisticBlock(name="logistic", parent=estimators)]

### Runner

The runner executes the blocks: it fits every block on the training data and later uses the fitted blocks for inference.

In [None]:
from vivid.backends.experiments import LocalExperimentBackend
from vivid.runner import create_runner


# Build the runner for the stacked model, with a local experiment backend
# pointed at OUTPUT_DIR.
runner = create_runner(
    blocks=stacked_models,
    experiment=LocalExperimentBackend(to=OUTPUT_DIR),
)

# Target values taken from the training frame.
y = train_df["Transported"].values

# Run the training phase, ignoring logs from previous runs.
oof_outputs = runner.fit(train_df, y=y, ignore_past_log=True)

### Predict

Run inference on the test set with the fitted runner.

In [None]:
# Apply the fitted runner to the preprocessed test frame.
predicts = runner.predict(test_df)

### Check Prediction and Out-of-Fold Prediction Distributions

In [None]:
# For each pair of outputs, overlay the out-of-fold predictions and the
# test-set predictions on one figure titled with the block name.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases;
# consider histplot/kdeplot when upgrading.
for oof, pred in zip(oof_outputs, predicts):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.distplot(oof.out_df.values, label="OutOfFold", ax=ax)
    sns.distplot(pred.out_df.values, label="Predict", ax=ax)
    ax.set_title(oof.block.name)
    # The labels passed above are only shown once a legend is drawn.
    ax.legend()

### Make Submission

In [None]:
# Write one submission CSV per prediction result, named after its block.
for pred in predicts:
    # First output column thresholded at 0.5 gives the boolean label.
    y_label = pred.out_df.values[:, 0] > .5

    sub_df = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Transported": y_label,
    })

    submission_path = os.path.join(OUTPUT_DIR, f"{pred.block.name}_submission.csv")
    sub_df.to_csv(submission_path, index=False)

### TODO

* [ ] More Feature Engineering
* [ ] Additional estimators and ensembles 