In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and echo the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <center><b style='color:#009933;'>About TPS-May 2022</b></center>

TPS-May 2022 is the May edition of the Tabular Playground Series,
in which we need to perform binary classification. We are given simulated manufacturing control data and have to predict whether the machine is in state `0` or state `1`. <br/>

# <center><b style='color:#009933;'>Data</b></center>

The data consists of the following features:
- **id**: The id column
- **f_00** to **f_30**: The manufacturing control data, with **f_27** being the only categorical feature.
- **target**: The target column having values **0** or **1**. <br/>

# <center><b style='color:#009933;'>Evaluation Metric</b></center>
Submissions are evaluated on <a href='http://en.wikipedia.org/wiki/Receiver_operating_characteristic'>area under the ROC curve</a> between the predicted and the observed target.


# <center><b style='color:#009933;'>Libraries used</b></center>

In [None]:
# pip install -U tensorflow-addons

In [None]:
# !pip3 install tensorflow_decision_forests 

In [None]:
# Plotting stack and the global figure defaults used by every chart below.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')  # seaborn "whitegrid" theme for all subsequent plots
plt.rcParams['figure.figsize'] = (15, 7)  # default figure size as (width, height)
import math

import warnings
# NOTE(review): this filter hides every warning category, including
# deprecation notices emitted by the libraries imported below.
warnings.filterwarnings('ignore')

from tensorflow.keras.layers import StringLookup
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# The two pip-install cells above are commented out, so both packages must
# already be present in the environment for these imports to succeed.
import tensorflow_addons
import tensorflow_decision_forests as tfdf

In [None]:
# Read the competition's train, test and sample-submission CSVs in one pass.
train, test, subs = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
        '../input/tabular-playground-series-may-2022/sample_submission.csv',
    )
)

# <center><b style='color:#009933;'>Data Exploration</b></center>

In [None]:
# Summary statistics for the training feature columns, shaded with the plasma colormap.
(
    train
    .loc[:, 'f_00':'f_30']
    .describe()
    .style
    .background_gradient(cmap='plasma')
)

In [None]:
# Same summary as the training set, computed on the test feature columns.
(
    test
    .loc[:, 'f_00':'f_30']
    .describe()
    .style
    .background_gradient(cmap='plasma')
)

In [None]:
# Total number of missing cells in each frame, shown side by side in a one-row table.
null_counts = {
    'Train null values': train.isna().sum().sum(),
    'Test null values': test.isna().sum().sum(),
}
pd.DataFrame(null_counts, index=['count']).style.background_gradient(cmap='plasma')

# <center><b style='color:#009933;'>What do we observe?</b></center>

- There are 900000 (900k) rows in the training dataset and 700000 (700k) rows in the testing dataset.

- There are **33** columns in total. There is only one categorical feature, **f_27**; **f_29** and **f_30** are binary; the rest are continuous.

- There are no null values in either training or testing dataset. And that's a plus point for us!

# <center><b style='color:#009933;'>Exploratory Data Analysis</b></center>

## <b style='color:#009933;'>Target Distribution</b>

In [None]:
# Filled density curve of the target column.
sns.kdeplot(
    train['target'],
    color='#0099ff',
    fill=True,
)

In [None]:
# Bar chart of how many rows fall in each target class.
# The column is passed as `x=` explicitly: seaborn's first positional parameter
# is `data` in recent releases, so a bare positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x=train['target'])

So, the `target` column is almost balanced. That's again a plus point for us!

## <b style='color:#009933;'>Features Distribution</b>

In [None]:
# Every column except the string feature, the row id and the label.
# `features` is reused by the OLS cell further down.
features = [col for col in train.columns if col not in ('f_27', 'id', 'target')]

# Five panels per row; the row count follows the number of features, so the
# grid still fits if this cell is run on a frame with more columns.
n_cols = 5
n_rows = math.ceil(len(features) / n_cols)

fig = plt.figure()
for position, feature in enumerate(features, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    sns.kdeplot(data=train, x=feature, ax=ax)

# Lay the panels out once, after all of them exist, instead of on every iteration.
fig.tight_layout()

# <center><b style='color:#009933;'>P-values check</b></center>

In [None]:
import warnings

from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

# Keep FutureWarning noise out of the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Regress the target on every column in `features` plus a constant column;
# the fitted result's p-values are tabulated in the next cell.
x = add_constant(train[features])
model = OLS(endog=train["target"], exog=x).fit()

In [None]:
# Turn the fitted model's p-values into a two-column (feature, pvalue) table.
pvalues = (
    pd.DataFrame(model.pvalues)
    .reset_index()
    .rename(columns={"index": "feature", 0: "pvalue"})
)
pvalues.style.background_gradient(cmap='plasma')

As f_03, f_04 and f_06 have pvalue > 0.5, they can be removed from the data.

# <center><b style='color:#009933;'>Feature Engg. on <em>f_27</em></b></center>

In [None]:
# Add the derived columns to both frames in place. This cell must run before
# the cell that drops f_27 and f_05, because it reads both of them.
for df in (train, test):
    # One integer column per character position of the f_27 string
    # ('A' -> 0, 'B' -> 1, ...).
    # assumes every f_27 value has at least 10 characters — TODO confirm
    # Plain `range` is used here: `tqdm` is never imported in this notebook, so
    # wrapping the loop in it raised NameError on a fresh kernel.
    for i in range(10):
        df[f'f_27_{i}'] = df.f_27.str.get(i).apply(ord) - ord('A')

    # Number of distinct characters in the f_27 string.
    df["unique_characters"] = df.f_27.apply(lambda s: len(set(s)))

    # Three-level indicators: +1 when the feature sum is above the upper
    # cut-off, -1 when it is below the lower cut-off, 0 otherwise.
    # NOTE(review): the cut-off values are hard-coded; their origin is not
    # recorded in this notebook.
    df['i_02_21'] = (df.f_21 + df.f_02 > 5.2).astype(int) - (df.f_21 + df.f_02 < -5.3).astype(int)
    df['i_05_22'] = (df.f_22 + df.f_05 > 5.1).astype(int) - (df.f_22 + df.f_05 < -5.4).astype(int)
    i_00_01_26 = df.f_00 + df.f_01 + df.f_26
    df['i_00_01_26'] = (i_00_01_26 > 5.0).astype(int) - (i_00_01_26 < -5.0).astype(int)

In [None]:
# Columns removed before modelling: three numeric features plus the raw f_27
# string, from which the f_27_* and unique_characters columns were derived.
# NOTE(review): the markdown after the p-value table names f_03, f_04 and f_06,
# but f_05 is what gets dropped here — confirm which one is intended.
dropped_columns = ['f_03', 'f_04', 'f_05', 'f_27']

# Reassign instead of using `inplace=True`, and tolerate columns that are
# already gone, so re-running this cell does not raise KeyError.
train = train.drop(columns=dropped_columns, errors='ignore')
test = test.drop(columns=dropped_columns, errors='ignore')

# <center><b style='color:#009933;'>Tensorflow Decision Forests</b></center>

<center>
    <figure><img src='https://1.bp.blogspot.com/-Ax59WK4DE8w/YK6o9bt_9jI/AAAAAAAAEQA/9KbBf9cdL6kOFkJnU39aUn4m8ydThPenwCLcBGAsYHQ/s0/Random%2BForest%2B03.gif'>
        </img>
    <figcaption>Random Forests are a popular type of decision forest model. Here, you can see a forest of trees classifying an example by voting on the outcome.</figcaption>
    </figure>
</center>

<br/><br/>

Decision forests are a family of machine learning algorithms with quality and speed competitive with (and often favorable to) neural networks, especially when you’re working with tabular data. They’re built from many decision trees, which makes them easy to use and understand - and you can take advantage of a plethora of interpretability tools and techniques that already exist today.

Source: <a href='https://blog.tensorflow.org/2021/05/introducing-tensorflow-decision-forests.html'><em>Link</em></a>

First, we'll convert the dataset into a <em>Tensorflow dataset</em>.

In [None]:
# Convert both frames to TensorFlow datasets for TF-DF.
# `id` is the row identifier, not a control measurement; the EDA cell already
# leaves it out of `features`, so it is excluded from the model inputs as well.
# The frames themselves are left untouched (drop returns a copy).
non_feature_columns = ['id']
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train.drop(columns=non_feature_columns), label="target"
)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test.drop(columns=non_feature_columns)
)

Then, we'll train the Random Forest model.

In [None]:
# Random Forest built with the constructor's default arguments.
# Note that this rebinds `model`, which previously held the OLS fit.
model = tfdf.keras.RandomForestModel()

model.fit(x=train_ds, verbose=1)

In [None]:
# Print the summary report of the fitted forest.
model.summary()

# <center><b style='color:#009933;'>Interpreting the Model</b></center>

In [None]:
# Render the tree at index 0 of the fitted forest inline.
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

We see that the first split is based on the `i_02_21` feature.

# <center><b style='color:#009933;'>Predicting the values</b></center>

In [None]:
# Score the test dataset with the fitted forest; the submission cell writes
# `pred` into the `target` column.
pred = model.predict(test_ds)

# <center><b style='color:#009933;'>Creating submission file</b></center>

In [None]:
# Fill the sample submission's target column with the predictions and save it
# without the pandas index.
subs['target'] = pred
subs.to_csv(path_or_buf='submission.csv', index=False)

# <center><b style='color:#009933;'>References</b></center>

- https://www.kaggle.com/code/usharengaraju/tensorflow-decision-forests-w-b

- https://github.com/keras-team/keras-io/blob/master/examples/structured_data/deep_neural_decision_forests.py