In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition train/test CSVs; the 'id' column becomes the index of each frame.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv', index_col='id')

In [None]:
# Show the column labels of the training frame.
train.columns

## Check Missing Values

In [None]:
# Fraction of missing values per column in the training frame (0.0 means no missing values).
train.isna().mean()

In [None]:
# Fraction of missing values per column in the test frame (0.0 means no missing values).
test.isna().mean()

## Train-test Discrepancy
Check the discrepancy between train and test using the [KL divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) of each predictor variable. To compute the KL divergence:
* Treat categorical variables as multinomially distributed
* Treat continuous variables as normally distributed

In [None]:
from torch.distributions import Categorical, Normal, kl_divergence
from torch import from_numpy
def kl_train_test(train, test):
    """Compute KL(train || test) for every predictor column of ``test``.

    Columns whose name starts with ``'cat'`` are modelled as categorical
    distributions over the union of the categories seen in either frame;
    columns whose name starts with ``'cont'`` are modelled as normal
    distributions parameterised by the sample mean and standard deviation.
    Any other column is ignored.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to compare. Every ``cat*``/``cont*`` column of ``test`` must
        also exist in ``train``.

    Returns
    -------
    dict
        Maps column name to the KL divergence as a Python float. The value is
        ``inf`` for a categorical column when ``train`` contains a category
        that never occurs in ``test``.
    """
    result = {}
    for c in test.columns:
        if c.startswith('cat'):
            train_freq = train[c].value_counts() / len(train)
            test_freq = test[c].value_counts() / len(test)
            # value_counts() orders each Series by its own frequencies, so the two
            # probability vectors must be re-aligned on a shared category index
            # before comparing them; categories missing on one side get probability 0.
            categories = train_freq.index.union(test_freq.index)
            train_p = train_freq.reindex(categories, fill_value=0.0)
            test_p = test_freq.reindex(categories, fill_value=0.0)
            # copy=True guarantees a writable buffer for from_numpy.
            train_d = Categorical(from_numpy(train_p.to_numpy(dtype='float64', copy=True)))
            test_d = Categorical(from_numpy(test_p.to_numpy(dtype='float64', copy=True)))
            result[c] = kl_divergence(train_d, test_d).item()
        elif c.startswith('cont'):
            # Plain Python floats keep torch from depending on NumPy scalar handling.
            train_d = Normal(float(train[c].mean()), float(train[c].std()))
            test_d = Normal(float(test[c].mean()), float(test[c].std()))
            result[c] = kl_divergence(train_d, test_d).item()
    return result

In [None]:
# Compute the train/test KL divergence per predictor; the returned dict is shown as the cell output.
kl_train_test(train, test)

Conclusion: remove `cat10`, because its KL divergence is infinite.

## Train overview with pandas profiling

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report over the training frame.
p = ProfileReport(train)

# Save the report as an HTML file (path is relative to the current working directory).
p.to_file('train_report.html')
# Last expression of the cell, so the report object is rendered in the notebook output.
p

# Preprocessing fn
A single function is applied to both train and test, to keep the preprocessing of the two sets as consistent as possible.

In [None]:
def preprocessing(df):
    """Return a preprocessed copy of ``df``; the input frame is left untouched.

    Every column whose name starts with ``'cat'`` is converted to pandas
    ``category`` dtype and replaced by its integer category codes. The
    ``'cat10'`` column is then dropped (its train/test KL divergence is
    infinite). Calling the function on an already-preprocessed frame is safe:
    a missing ``'cat10'`` column is ignored instead of raising ``KeyError``.

    NOTE(review): the codes are derived from the categories present in ``df``
    alone, so the same label can receive different codes in two frames whose
    category sets differ — confirm train and test share the same categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        New frame with integer-coded ``cat*`` columns and without ``'cat10'``.
    """
    # Work on a copy so the caller's frame (and anything displayed from it) stays valid.
    df = df.copy()
    catcols = df.columns[df.columns.str.startswith('cat')]
    for c in catcols:
        df[c] = df[c].astype('category').cat.codes
    # errors='ignore' keeps the cell re-runnable after 'cat10' has already been removed.
    return df.drop(columns='cat10', errors='ignore')  # infinite kl divergence

In [None]:
# Apply the same preprocessing to both frames. The names are rebound, so the raw
# frames are no longer reachable through `train`/`test` after this cell.
train = preprocessing(train)
test = preprocessing(test)

In [None]:
# Preview the first rows of the preprocessed training frame.
train.head()

In [None]:
# Preview the first rows of the preprocessed test frame.
test.head()

* Numerical features are already scaled; some are approximately normally distributed, others are not
* There are some high cardinality categorical columns

# RF Training

In [None]:
# Set to True to hold out a validation split and plot ROC curves in the cells below;
# with False the model is trained on all of the training data.
val_mode = False

In [None]:
# Separate the predictor columns from the 'target' label column.
X_train, y_train = train.drop(columns='target'), train['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
if val_mode:
    # Re-run guard: only split while X_train still has 300000 rows (presumably the
    # full training-set size — confirm), so executing this cell twice does not
    # split an already-split frame again.
    if len(X_train) == 300000:
        # Fixed random_state makes the 70/30 split, and thus the validation
        # results, reproducible across kernel restarts.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.3, random_state=42
        )
    else:
        print('skipped multiple split')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# random_state pins the bootstrap/feature sampling so the fitted forest and the
# submission are reproducible; n_jobs=-1 trains the trees on all available cores.
rf = RandomForestClassifier(max_depth=30, n_jobs=-1, random_state=42)

In [None]:
# Fit the random forest on the training predictors and labels.
rf.fit(X_train, y_train)

In [None]:
if val_mode:
    # plot_roc_curve was never imported in this notebook and has been removed from
    # recent scikit-learn releases. Prefer RocCurveDisplay.from_estimator and fall
    # back to the legacy helper only on installations that predate it.
    try:
        from sklearn.metrics import RocCurveDisplay
        plot_roc = RocCurveDisplay.from_estimator
    except (ImportError, AttributeError):
        from sklearn.metrics import plot_roc_curve as plot_roc

    # Hard class predictions on both splits, kept as notebook variables for inspection.
    y_pred_train = rf.predict(X_train)
    y_pred_val = rf.predict(X_val)
    # One ROC figure per split, to compare training fit against held-out performance.
    plot_roc(rf, X_train, y_train, name='RF (train)')
    plot_roc(rf, X_val, y_val, name='RF (validation)')

# Test Prediction

In [None]:
# Predicted class probabilities for the test frame; column 1 is used for the submission below.
y_prob_test = rf.predict_proba(test)

In [None]:
# Submission frame: test ids alongside the second probability column
# (presumably the positive class — confirm against rf.classes_).
output = pd.DataFrame({'id': test.index, 'target': y_prob_test[:, 1]})
# index=False: the ids are already stored in the 'id' column.
output.to_csv('rf_prediction.csv', index=False)

In [None]:
# Preview the first rows of the random-forest submission.
output.head()

# Extra: try TabNet (NN with transformer architecture)
* Paper link: https://arxiv.org/pdf/1908.07442.pdf
* Implementation link: https://github.com/dreamquark-ai/tabnet

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pytorch-tabnet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Positional indices (within X_train's columns) of the categorical 'cat*' features.
cat_indexes = [i for i, col in enumerate(X_train.columns) if col.startswith('cat')]

In [None]:
# TabNet only builds categorical embeddings when cat_dims (the number of distinct
# codes per categorical feature) is supplied together with cat_idxs; without it the
# embeddings are skipped or a ValueError is raised, depending on the library version.
# Codes start at 0, so the size is max code + 1, taken over train and test so that
# every code seen at prediction time has an embedding row.
cat_dims = [
    int(max(train[X_train.columns[i]].max(), test[X_train.columns[i]].max())) + 1
    for i in cat_indexes
]
tabnet = TabNetClassifier(cat_idxs=cat_indexes, cat_dims=cat_dims)

In [None]:
# Train TabNet on the underlying NumPy arrays for up to 100 epochs.
tabnet.fit(X_train.values, y_train.values, max_epochs=100)

In [None]:
# Predicted class probabilities from TabNet; this rebinds y_prob_test, replacing the
# random-forest probabilities computed earlier.
y_prob_test = tabnet.predict_proba(test.values)

In [None]:
# Submission frame for TabNet: test ids alongside the last probability column
# (presumably the positive class — confirm). Rebinds `output` from the RF section.
output = pd.DataFrame({'id': test.index, 'target': y_prob_test[:, -1]})
# index=False: the ids are already stored in the 'id' column.
output.to_csv('tabnet_prediction.csv', index=False)