![Multi-head attention visualization](https://data-science-blog.com/wp-content/uploads/2022/01/mha_visualization-930x1030.png)

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# My take on [Tabular Playground Series - Dec 2021](https://www.kaggle.com/c/tabular-playground-series-dec-2021)
- Memory reduction and preprocessing inspired by: https://www.kaggle.com/junhyeok99/multi-head-attention
- I improved the models step by step (current best score: 0.95705, Dec 9th)
    1. Public score: 0.95332 <-- `model1`, early stopping (Version: 3)
    1. Public score: 0.95400 <-- `model2`, 20 epochs (Version: 4)
    1. Public score: 0.95394 <-- `model2`, no Hillshade correction, 10 epochs (Version: 6)
    1. Public score: 0.95438 <-- `model2`, adding more engineered features (Version: 8)
    1. Public score: 0.95269 <-- `model3`, SELU, 10 epochs (Version: 10)
    1. Public score: 0.95343 <-- `model3`, SELU, 20 epochs (Version: 11)

In [None]:
# Load the competition's train and test sets from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
# Bare last expression: the notebook renders the training frame as the cell output.
train

In [None]:
# Summary statistics of the training set, transposed so that each column becomes a row.
train.describe().T

In [None]:
# Same summary for the test set, to compare its value ranges with the training set.
test.describe().T

In [None]:
# Number of training samples per target class, ordered by class label.
display(train['Cover_Type'].value_counts().sort_index())

Note that:
1. there are no missing values (`count` is the same for all)
2. there are columns where all values are zero -> to be removed
3. `Cover_Type`=5 occurs only once: we had better remove this sample

# Preprocessing

In [None]:
# Drop the row identifier and the never-seen Soil types.
# `DataFrame.drop` returns a NEW frame, so the result must be bound back to `train`/`test`
# explicitly: rebinding a loop variable (`for df in [...]: df = df.drop(...)`) leaves both
# frames untouched and the columns silently stay in the data.
UNUSED_COLUMNS = ['Id', 'Soil_Type7', 'Soil_Type15']
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
train = train.drop(columns=UNUSED_COLUMNS, errors='ignore')
test = test.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [None]:
# cos/sin split allows the NN to understand that 360=0
for df in [train, test]:
    aspect_rad = np.radians(df["Aspect"])  # converted once, used for both components
    df["Aspect_cos"] = np.cos(aspect_rad)
    df["Aspect_sin"] = np.sin(aspect_rad)

# The raw angle is replaced by its cos/sin components.
# `DataFrame.drop` returns a new frame, so it has to be assigned back to `train`/`test`;
# doing `df = df.drop(...)` inside the loop only rebinds the loop variable and drops nothing.
train = train.drop(columns=["Aspect"])
test = test.drop(columns=["Aspect"])

In [None]:
# Hillshade correction discussed for this competition: force the three Hillshade
# columns into the [0, 255] range.
# Author's remarks: it is debatable whether to apply it, since the data were generated
# this way; removing it gave a very similar score, because
#   1) the generated values are unphysical anyway, so constraining them to physical
#      values has little point
#   2) the Robust Scaler already takes care of extreme values
# NOTE(review): the original remark said the correction is "kept removed", yet this cell
# still applies the clipping -- confirm which behaviour is intended.
HILLSHADE_COLUMNS = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
for frame in (train, test):
    frame[HILLSHADE_COLUMNS] = frame[HILLSHADE_COLUMNS].clip(lower=0, upper=255)

In [None]:
# Sum and difference of the absolute horizontal and vertical distances to hydrology.
for frame in (train, test):
    horizontal_abs = frame['Horizontal_Distance_To_Hydrology'].abs()
    vertical_abs = frame['Vertical_Distance_To_Hydrology'].abs()
    frame['Sum_Hydrology'] = horizontal_abs + vertical_abs
    frame['Sub_Hydrology'] = horizontal_abs - vertical_abs

In [None]:
# More feature engineering suggested in https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293612
# Every new column is added in place to both frames, so train and test keep the same layout.
for df in [train, test]:
    # Products of a distance with the elevation
    df['EHiElv'] = df['Horizontal_Distance_To_Roadways'] * df['Elevation']
    df['EViElv'] = df['Vertical_Distance_To_Hydrology'] * df['Elevation']
    # df['Aspect2'] = df.Aspect.map(r) --> I'm already using sin/cos
    # Binary flag: 1 where the vertical distance to hydrology is negative
    df['Highwater'] = (df.Vertical_Distance_To_Hydrology < 0).astype(int)
    # Elevation offset by the vertical / (scaled) horizontal distance to hydrology
    df['EVDtH'] = df.Elevation - df.Vertical_Distance_To_Hydrology
    df['EHDtH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * 0.2
    df['Euclidean_Distance_to_Hydrolody'] = (df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)**0.5 # A bit redundant with Sum/Sub_Hydrology, but I keep it
    # NOTE(review): no abs() is taken here, so this is a signed sum rather than a true
    # Manhattan distance (|h| + |v| is already stored in Sum_Hydrology) -- confirm it is intended.
    df['Manhattan_Distance_to_Hydrolody'] = df['Horizontal_Distance_To_Hydrology'] + df['Vertical_Distance_To_Hydrology']              # A bit redundant with Sum/Sub_Hydrology, but I keep it
    # Pairwise sums (_1) and absolute differences (_2) of the horizontal distances.
    # Hydro_Fire_1 is a plain sum, while the other *_1 features take abs() of the sum.
    df['Hydro_Fire_1'] = df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Fire_Points']
    df['Hydro_Fire_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Fire_Points'])
    df['Hydro_Road_1'] = abs(df['Horizontal_Distance_To_Hydrology'] + df['Horizontal_Distance_To_Roadways'])
    df['Hydro_Road_2'] = abs(df['Horizontal_Distance_To_Hydrology'] - df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_1'] = abs(df['Horizontal_Distance_To_Fire_Points'] + df['Horizontal_Distance_To_Roadways'])
    df['Fire_Road_2'] = abs(df['Horizontal_Distance_To_Fire_Points'] - df['Horizontal_Distance_To_Roadways'])
    # Binary flag: 1 where Hillshade_3pm is exactly zero
    df['Hillshade_3pm_is_zero'] = (df.Hillshade_3pm == 0).astype(int)

In [None]:
# Manage targets: class 5 has only one sample, so that row is removed and the index is
# rebuilt to stay contiguous.
is_rare_class = train['Cover_Type'] == 5
train = train.loc[~is_rare_class].reset_index(drop=True)
display(train['Cover_Type'].value_counts())

In [None]:
# Convert categoricals to integers (encoder) and then apply one-hot encoding (to_categorical)
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# `le` is kept at notebook level because the submission cell needs it to map the
# predicted class indices back to the original Cover_Type labels.
le = LabelEncoder()
target = le.fit_transform(train['Cover_Type']) # REMEMBER: need to run `le.inverse_transform(test_pred)` at the end
target = to_categorical(target)                # REMEMBER: need to run `np.argmax(test_pred, axis = 1)` at the end

# Remove the target from the training set, so that `train` only holds features from here on
train = train.drop(columns = ['Cover_Type'])

# Trigger a garbage collection after replacing the frame
gc.collect()

In [None]:
# Scaling (note: I'm doing this after managing targets because I need first to remove some rows and cols)
from sklearn.preprocessing import RobustScaler
rb = RobustScaler()

# All the columns still present in `train` are scaled; `test` must contain the same columns.
cols = train.columns

# The scaler is fitted on the training set only and then re-used, unchanged, on the test set.
train[cols] = rb.fit_transform(train[cols].values) # note: df[cols] is a trick to keep the df as a DataFrame for later (instead of an array)
test[cols] = rb.transform(test[cols].values)       # note: df[cols] is a trick to keep the df as a DataFrame for later (instead of an array)

In [None]:
# Reduce memory and transform to array
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to lighter dtypes to save RAM.

    Integer columns (int16/int32/int64) are converted to the narrowest of
    int8/int16/int32/int64 whose range contains the column's minimum and maximum
    (bounds included, so e.g. a column with max 127 is stored as int8).
    Floating-point columns (float16/float32/float64) are converted to float32 when
    their range fits and to float64 otherwise. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (no copy is made, which would defeat
        the purpose of saving memory).
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the dtype conversions.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Pick the first (narrowest) integer type that can represent every value.
            # Inclusive comparisons: the type's own min/max are representable too.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extrema fail the comparison and fall through to float64.
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero starting footprint to avoid a ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast both frames and replace them with their underlying arrays:
# from here on `train` and `test` are arrays, no longer DataFrames.
train = reduce_mem_usage(train).values
test = reduce_mem_usage(test).values

# Trigger a garbage collection now that the DataFrames are no longer referenced by these names
gc.collect()

# Modeling

In [None]:
# Dimensions used to build the networks: number of rows and columns of the training
# array, and number of columns of the one-hot encoded target.
shapes = dict(
    nsamples=train.shape[0],
    nfeatures=train.shape[1],
    ncategories=target.shape[1],
)
print(shapes)

In [None]:
# Configuring TPU (https://www.kaggle.com/docs/tpu)
# If the notebook does not have Accelerator=TPU, fall back to TensorFlow's default
# distribution strategy instead of failing: the `strategy.scope()` blocks and the
# `strategy.num_replicas_in_sync` batch-size scaling used below keep working unchanged.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:  # raised when no TPU can be located
    tpu = None

if tpu is not None:
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.TPUStrategy` replaces the deprecated `tf.distribute.experimental.TPUStrategy`
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    print('Device: no TPU detected, using the default distribution strategy')
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Selects which of the models defined below (1, 2 or 3) is built, trained and used for
# the submission; the cells of the other models are skipped by their `if` guard.
CURRENT_MODEL = 3

## Model1: simple sequential model of 3 dense layers of 200 units each

In [None]:
# Build and compile model1 (only when it is the selected model):
# three ReLU dense layers of 200 units followed by a softmax output, one unit per class.
if CURRENT_MODEL==1:
    with strategy.scope(): # necessary for using the TPU
        model1 = keras.models.Sequential([
            keras.layers.Input((shapes['nfeatures'],)),
            keras.layers.Dense(200, activation='relu'),
            keras.layers.Dense(200, activation='relu'),
            keras.layers.Dense(200, activation='relu'),
            keras.layers.Dense(shapes['ncategories'], activation='softmax')
        ])
        # Render the architecture diagram as the cell output
        display(keras.utils.plot_model(model1, show_shapes=True, show_dtype=True))
        # categorical_crossentropy matches the one-hot encoded `target`
        model1.compile(
            loss='categorical_crossentropy',
            optimizer='Adam', 
            metrics=['accuracy'] # same as the competition: "Submissions are evaluated on multi-class classification accuracy."
        )
    

In [None]:
# Train model1 with early stopping on a 20% validation split.
if CURRENT_MODEL==1:
    # Stop after 3 epochs without improvement and roll back to the best weights
    # (no `monitor` is given, so the Keras default metric is used).
    earlystop = EarlyStopping(patience=3, restore_best_weights=True)
    model1.fit(
        x=train,
        y=target,
        epochs=20,
        batch_size=128 * strategy.num_replicas_in_sync,# https://www.kaggle.com/docs/tpu
        validation_split=0.2,
        callbacks=[earlystop]
    )

In [None]:
# Plot every metric recorded during model1's training, one subplot per metric with a shared x axis.
if CURRENT_MODEL==1:
    pd.DataFrame(model1.history.history).plot(subplots=True, sharex=True, figsize=[15,8], grid=True)
    plt.show()

# Model2: using He-normal initializer, Batch normalization, and 10% dropout
REMEMBER: since dropout is active only during training, you cannot directly compare the training loss/accuracy: it will be worse than in `model1` and worse than the validation scores.

In [None]:
# Build and compile model2 (only when it is the selected model):
# three blocks of Dense(200, He-normal init, no bias) -> BatchNormalization -> ReLU,
# with dropout after the first two blocks, followed by a softmax output.
if CURRENT_MODEL==2:
    dropout_rate = 0.1
    with strategy.scope(): # necessary for using the TPU
        model2 = keras.models.Sequential([
            keras.layers.Input((shapes['nfeatures'],)),
            # use_bias=False on the Dense layers that are directly followed by BatchNormalization
            keras.layers.Dense(200, kernel_initializer="he_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("relu"),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(200, kernel_initializer="he_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("relu"),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(200, kernel_initializer="he_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("relu"),
            # No dropout between the last hidden block and the output layer
            keras.layers.Dense(shapes['ncategories'], activation='softmax')
        ])
        # Render the architecture diagram as the cell output
        display(keras.utils.plot_model(model2, show_shapes=True, show_dtype=True))
        model2.compile(
            loss='categorical_crossentropy',
            optimizer='Adam', 
            metrics=['accuracy'] # same as the competition: "Submissions are evaluated on multi-class classification accuracy."
        )

In [None]:
# Train model2 for a fixed number of epochs on the whole training set.
if CURRENT_MODEL==2:
    # NOTE: `earlystop` is created but not used, because the `callbacks` argument below is commented out.
    earlystop = EarlyStopping(patience=3, restore_best_weights=True) 
    model2.fit(
        x=train,
        y=target,
        epochs=10,
        batch_size=128 * strategy.num_replicas_in_sync,# https://www.kaggle.com/docs/tpu
        validation_split=0.00, # no validation split: I want to use all the samples for training (I don't need validation anymore for EarlyStopping)
        #callbacks=[earlystop] # excluded: dropout makes the learning much more fuzzy! Moreover, dropout is already avoiding overfit
    )

> Note: I was using `validation_split=0.05`, but after I noticed that there is no problem of overfitting (the accuracy on the validation set stays approximately constant after saturating),
> I prefer to use no validation since it is misleading anyhow: there is a high chance of an improperly balanced sampling of `Cover_Type` 3, 4, 6, 7, which are underrepresented.

In [None]:
# Plot every metric recorded during model2's training, one subplot per metric with a shared x axis.
if CURRENT_MODEL==2:
    pd.DataFrame(model2.history.history).plot(subplots=True, sharex=True, figsize=[15,8], grid=True)
    plt.show()

# Model3: using SELU activation w/ LeCun initializer, Batch normalization, and 10% dropout
- See p. 334 of the Hands-on Machine Learning book, which suggests LeCun initialization with SELU.
- Remember that SELU (Scaled Exponential Linear Unit) has a non-zero negative tail (unlike ReLU, which is exactly zero for negative inputs); its scale and alpha are fixed constants, not parameters tuned during training.

In [None]:
# Build and compile model3 (only when it is the selected model):
# three blocks of Dense(200, LeCun-normal init, no bias) -> BatchNormalization -> SELU,
# with dropout after the first two blocks, followed by a softmax output.
if CURRENT_MODEL==3:
    dropout_rate = 0.1
    with strategy.scope(): # necessary for using the TPU
        model3 = keras.models.Sequential([
            keras.layers.Input((shapes['nfeatures'],)),
            # use_bias=False on the Dense layers that are directly followed by BatchNormalization
            keras.layers.Dense(200, kernel_initializer="lecun_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("selu"),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(200, kernel_initializer="lecun_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("selu"),
            keras.layers.Dropout(rate=dropout_rate),
            keras.layers.Dense(200, kernel_initializer="lecun_normal", use_bias=False),
            keras.layers.BatchNormalization(),
            keras.layers.Activation("selu"),
            # No dropout between the last hidden block and the output layer
            keras.layers.Dense(shapes['ncategories'], activation='softmax')
        ])
        # Plot model3 itself: `model2` is not defined when CURRENT_MODEL==3, so
        # referencing it here raised a NameError before the model could be compiled.
        display(keras.utils.plot_model(model3, show_shapes=True, show_dtype=True))
        model3.compile(
            loss='categorical_crossentropy',
            optimizer='Adam', 
            metrics=['accuracy'] # same as the competition: "Submissions are evaluated on multi-class classification accuracy."
        )

In [None]:
# Train model3 for a fixed number of epochs on the whole training set
# (no validation split and no callbacks are passed).
if CURRENT_MODEL==3:
    model3.fit(
        x=train,
        y=target,
        epochs=20,
        batch_size=128 * strategy.num_replicas_in_sync,# https://www.kaggle.com/docs/tpu
    )

# Submission

In [None]:
# Predict the class probabilities of the test set with the model selected by CURRENT_MODEL.
# The batch size follows the same per-replica scaling used for training.
predict_batch_size = 128 * strategy.num_replicas_in_sync
if CURRENT_MODEL == 1:
    test_pred = model1.predict(test, batch_size=predict_batch_size, verbose=1)
elif CURRENT_MODEL == 2:
    test_pred = model2.predict(test, batch_size=predict_batch_size, verbose=1)
elif CURRENT_MODEL == 3:
    test_pred = model3.predict(test, batch_size=predict_batch_size, verbose=1)

In [None]:
# Build the submission from the sample file, which provides the expected rows and columns.
sub = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
# argmax turns the per-class probabilities into a class index, which the label encoder
# then maps back to the original Cover_Type labels.
sub['Cover_Type'] = le.inverse_transform(np.argmax(test_pred, axis = 1))
sub.to_csv('submission.csv', index=False)
display(sub)