In this version of the notebook we verify the best model indicated by GridSearchCV, which was built in my other notebook:
https://www.kaggle.com/christoforum/tps-dec-2021-neural-network-with-gridsearchcv 

## Import necessary libraries and datasets

In [None]:
import math
import numpy as np

import pandas as pd
pd.set_option('display.max_columns', 60)

import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import plot_model

from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, accuracy_score
from scipy import stats

In [None]:
# Smallest and largest representable values of the narrower NumPy dtypes.
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def memory_usage(data, detail = 1):
    """Print and return the total memory footprint of ``data`` in megabytes.

    Parameters
    ----------
    data : object exposing ``memory_usage()`` (a pandas DataFrame in this notebook)
    detail : if truthy, the per-column usage is shown with ``display`` first

    Returns
    -------
    The summed usage divided by 1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024
    if detail:
        display(data.memory_usage())
    total_mb = data.memory_usage().sum() / bytes_per_mb
    print(f"Memory usage : {total_mb:.2f}MB")
    return total_mb


def compress_dataset(data):
    """Downcast the ``float64`` and ``int64`` columns of ``data`` to narrower dtypes.

    Every non-object column is reported. A ``float64`` column becomes
    ``float16`` or ``float32``, and an ``int64`` column becomes ``int8``,
    ``int16`` or ``int32``, when its minimum and maximum lie strictly inside
    the range checked for that dtype. Columns of any other dtype are left
    untouched. The running compression rate is printed after each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress; its columns are reassigned on the object itself.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object after downcasting.
    """
    memory_before_compress = memory_usage(data, 0)
    print()
    print('=' * 50)
    for col in data.columns:
        col_dtype = data[col].dtype

        if col_dtype == 'object':
            continue

        print("Name: {0:24s} Type: {1}".format(col, col_dtype))
        col_min = data[col].min()
        col_max = data[col].max()

        if col_dtype == 'float64':
            print(" variable min: {0:15s} max: {1:15s}".format(str(np.round(col_min, 4)), str(np.round(col_max, 4))))
            # NOTE: float16 holds far fewer significant digits than float64, so
            # this branch trades precision for memory.
            if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
                data[col] = data[col].astype(np.float16)
                print("  float16 min: {0:15s} max: {1:15s}".format(str(FLOAT16_MIN), str(FLOAT16_MAX)))
                print("compress float64 --> float16")
            elif (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                data[col] = data[col].astype(np.float32)
                print("  float32 min: {0:15s} max: {1:15s}".format(str(FLOAT32_MIN), str(FLOAT32_MAX)))
                print("compress float64 --> float32")
            memory_after_compress = memory_usage(data, 0)
            print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))
            print('=' * 50)

        elif col_dtype == 'int64':
            print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
            # Name of the dtype actually chosen; None means the column stays int64.
            target = None
            # NOTE(review): int8 is only chosen inside half of its range, unlike
            # the int16/int32 checks below; kept as-is, confirm the margin is intended.
            if (col_min > INT8_MIN / 2) and (col_max < INT8_MAX / 2):
                target = 'int8'
                data[col] = data[col].astype(np.int8)
                print("     int8 min: {0:15s} max: {1:15s}".format(str(INT8_MIN), str(INT8_MAX)))
            elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
                target = 'int16'
                data[col] = data[col].astype(np.int16)
                print("    int16 min: {0:15s} max: {1:15s}".format(str(INT16_MIN), str(INT16_MAX)))
            elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
                target = 'int32'
                data[col] = data[col].astype(np.int32)
                print("    int32 min: {0:15s} max: {1:15s}".format(str(INT32_MIN), str(INT32_MAX)))
            memory_after_compress = memory_usage(data, 0)
            print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))
            # Report the dtype that was really applied. The earlier flag logic
            # labelled int32 downcasts and untouched columns as int8.
            if target is None:
                print("no compression (int64 kept)")
            else:
                print("compress (int64) ==> ({0})".format(target))
            print('=' * 50)

    print()
    memory_after_compress = memory_usage(data, 0)
    print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))

    return data

In [None]:
# Load the competition's train and test tables.
df_train, df_test = map(pd.read_csv, (
    '../input/tabular-playground-series-dec-2021/train.csv',
    '../input/tabular-playground-series-dec-2021/test.csv',
))

## Train set summary

Let's see what a train set looks like

In [None]:
# Preview the first rows of the training frame.
df_train.head()

`Id` column **is redundant**. Let's remove it.

In [None]:
# The row identifier carries no predictive information.
df_train.drop(columns = 'Id', inplace = True)

Let's check how big our data is.

In [None]:
# shape is the (rows, columns) pair of the training frame.
print(f'Train set shape:   {df_train.shape}')

Our train set has **4 000 000 rows** and **55 columns**.

Let's find out something more about data. 

In [None]:
# Print pandas' structural summary of the training frame.
df_train.info()

All columns consist of **integers** and the set is huge - it uses a lot of memory.

Let's see a distribution of each column.

In [None]:
# Summary statistics for each training column.
df_train.describe()

Columns `Soil_Type7` and `Soil_Type15` contain only **one value - 0**. They don't introduce any variability, so we can remove them. All remaining `Soil_Type` columns have **two values - 0 and 1**.

In [None]:
# Constant columns (always 0) add no information.
df_train.drop(columns = ['Soil_Type7', 'Soil_Type15'], inplace = True)

Let's check whether the dataset has any missing values.

In [None]:
# True when at least one column contains a missing value.
missing_per_column = df_train.isnull().sum()
missing_per_column.max() != 0

There are **no missing values** in train dataset.

Finally, let's check whether the dataset has duplicated rows.

In [None]:
# Distinct values of the per-row duplicate flag; a lone False means no duplicated rows.
df_train.duplicated().unique()

There are **no duplicated rows** in train dataset.

## Test set summary

We'll carry out exactly the same steps as above.

Let's see what a test set looks like

In [None]:
# Preview the first rows of the test frame.
df_test.head()

`Id` column **is redundant**. Let's remove it.

In [None]:
# The row identifier carries no predictive information.
df_test.drop(columns = 'Id', inplace = True)

Let's check how big our data is.

In [None]:
# shape is the (rows, columns) pair of the test frame.
print(f'Test set shape:   {df_test.shape}')

Our test set has **1 000 000 rows** and **54 columns**.

Let's find out something more about data. 

In [None]:
# Print pandas' structural summary of the test frame.
df_test.info()

All columns consist of **integers** and the set is huge - it uses a lot of memory. Just like in the train set.

Let's see a distribution of each column.

In [None]:
# Summary statistics for each test column.
df_test.describe()

Columns `Soil_Type7` and `Soil_Type15` contain only **one value - 0**. They don't introduce any variability, so we can remove them. All remaining `Soil_Type` columns have **two values - 0 and 1**. Just like in the train set.

In [None]:
# Constant columns (always 0) add no information.
df_test.drop(columns = ['Soil_Type7', 'Soil_Type15'], inplace = True)

Let's check whether the dataset has any missing values.

In [None]:
# True when at least one column contains a missing value.
missing_per_column = df_test.isnull().sum()
missing_per_column.max() != 0

There are **no missing values** in test dataset.

Finally, let's check whether the dataset has duplicated rows.

In [None]:
# Distinct values of the per-row duplicate flag; a lone False means no duplicated rows.
df_test.duplicated().unique()

There are **no duplicated rows** in test dataset.

## Target summary

Let's check a number of classes in target column.

In [None]:
# Distinct labels present in the target column.
df_train['Cover_Type'].unique()

We have **7** classes. We need to check whether the classes are balanced.

In [None]:
# Bar chart of the number of samples per target class, with grid lines.
sns.countplot(x = df_train['Cover_Type'])
plt.grid()

In [None]:
# Number of samples for each target class.
df_train['Cover_Type'].value_counts()

Unfortunately, **the classes are imbalanced**. Class no. 5 appears only once. We'll remove it.

In [None]:
# Remove the rows labelled with class 5.
class_5_rows = df_train.loc[df_train['Cover_Type'] == 5].index
df_train.drop(index = class_5_rows, inplace = True)

## Correlation

In [None]:
# Correlation heatmap restricted to the first ten columns of the frame.
non_binary_columns = df_train.columns[:10].tolist()
sns.heatmap(df_train[non_binary_columns].corr())

There is **no correlation between the non-binary features**.

## Feature engineering

Let's look at the `Aspect` column. It is the compass direction that the terrain faces, expressed in degrees. The values lie in the range [-33, 407] (train set) and [-33, 400] (test set), so we add 360 to all values less than 0 and subtract 360 from all values greater than or equal to 360.

In [None]:
def aspect(x):
    """Normalise a compass angle in degrees into the range [0, 360).

    Parameters
    ----------
    x : int or float
        Angle in degrees; may be negative or exceed one full turn.

    Returns
    -------
    The equivalent angle ``x % 360``.
    """
    # Modulo wraps any number of full turns in either direction. The earlier
    # single +/-360 correction was only right for inputs in [-360, 720).
    return x % 360

In [None]:
# Normalise the Aspect angle with aspect() separately in each frame.
df_train['Aspect'] = df_train['Aspect'].map(aspect)
# The test column must be derived from the test data. It was previously built
# from df_train['Aspect'], which overwrote the test feature with training values.
df_test['Aspect'] = df_test['Aspect'].map(aspect)

Now let's analyse the `Hillshade` columns. Hillshading computes surface illumination as values from 0 to 255 based on a given compass direction to the sun (azimuth) and a certain altitude above the horizon (altitude). All our hillshade values are between -53 and 301 in the train set and between -51 and 296 in the test set. Therefore we will replace all negative numbers with 0 and all numbers greater than 255 with 255.

In [None]:
def hillshade(x):
    """Clamp a hillshade value: negatives become 0, values of 256 or more become 255.

    Values in [0, 256) are returned unchanged; anything that satisfies
    neither comparison falls through to 255.
    """
    if x < 0:
        return 0
    return x if 0 <= x < 256 else 255

In [None]:
# Apply hillshade() to the three hillshade features of both frames.
for frame in (df_train, df_test):
    for shade_col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
        frame[shade_col] = frame[shade_col].map(hillshade)

We create new columns based on the `Horizontal_Distance_To_Hydrology` and `Vertical_Distance_To_Hydrology` columns. The first column will contain the L1 distance and the second column will contain the L2 distance.

In [None]:
# L1 (sum of absolute values) and L2 (Euclidean) distance to hydrology, per frame.
for frame in (df_train, df_test):
    horizontal = frame['Horizontal_Distance_To_Hydrology']
    vertical = frame['Vertical_Distance_To_Hydrology']
    frame['L1_distance'] = horizontal.abs() + vertical.abs()
    frame['L2_distance'] = np.sqrt(horizontal ** 2 + vertical ** 2)

Finally, we create one column with sum of `Soil_Type` columns and the other one with sum of `Wilderness_Area` columns.

In [None]:
# Column groups are taken from the training frame and reused for the test frame.
soil_type_cols = [name for name in df_train.columns if 'Soil' in name]
wilderness_area_cols = [name for name in df_train.columns if 'Wilderness' in name]

# Row-wise totals of the two indicator groups.
for frame in (df_train, df_test):
    frame['Soil_Type_sum'] = frame[soil_type_cols].sum(axis = 1)
    frame['Wilderness_Area_sum'] = frame[wilderness_area_cols].sum(axis = 1)

## Scaler

Let's scale and match our datasets. We use the RobustScaler algorithm.

In [None]:
# Features to scale: the continuous measurements plus the engineered columns.
scale_cols = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'L1_distance',
    'L2_distance',
    'Soil_Type_sum',
    'Wilderness_Area_sum',
]

# Fit on the training data only, then apply the same scaling to both frames.
scaler = RobustScaler().fit(df_train[scale_cols])

for frame in (df_train, df_test):
    frame[scale_cols] = scaler.transform(frame[scale_cols])

## Memory releasing

Datasets are very large and use huge quantity of memory, so we need to convert type of columns to ones using less memory.

In [None]:
# Downcast both frames (train first, then test).
df_train, df_test = compress_dataset(df_train), compress_dataset(df_test)

## CNN

We need to modify target by encoding labels in following way:
* 1 -> 0
* 2 -> 1
* 3 -> 2
* 4 -> 3
* 6 -> 4
* 7 -> 5

In [None]:
# Learn the label mapping from the target, then replace the target with its codes.
le = LabelEncoder()
le.fit(df_train['Cover_Type'])
df_train['Cover_Type'] = le.transform(df_train['Cover_Type'])

Let's define our features and target.

In [None]:
# Target vector, then every column whose name does not contain 'Cover_Type' as a feature.
y = df_train['Cover_Type']
feats = [name for name in df_train.columns if 'Cover_Type' not in name]
X = df_train.loc[:, feats]

Let's build self-normalizing neural network model.

In [None]:
def model_cnn():
    """Build and compile the dense classifier.

    Five hidden Dense layers (128, 64, 32, 16, 8 units, relu, 'uniform'
    initializer), each followed by Dropout(0.3) and BatchNormalization, feed
    a 6-unit softmax output. The input width is read from the global ``X``.

    Returns
    -------
    The compiled ``Sequential`` model (adam optimizer,
    sparse categorical cross-entropy loss, accuracy metric).
    """
    hidden_units = (128, 64, 32, 16, 8)

    net = Sequential()
    for depth, units in enumerate(hidden_units):
        # Only the first layer declares the input shape.
        first_layer_args = {'input_shape': [X.shape[1]]} if depth == 0 else {}
        net.add(Dense(units, activation = 'relu', kernel_initializer = 'uniform', **first_layer_args))
        net.add(Dropout(0.3))
        net.add(BatchNormalization())
    net.add(Dense(6, activation = 'softmax'))

    net.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

    return net

# Stop training when validation accuracy has not improved for 10 epochs and
# restore the best weights seen.
early_stop = EarlyStopping(monitor = 'val_accuracy', patience = 10, 
                           verbose = 1, mode = 'max', restore_best_weights = True)
# Halve the learning rate when validation loss has not improved for 5 epochs.
red_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 5, verbose = 1)

In [None]:
# Build a throwaway model just to print its layer summary.
model_cnn().summary()

In [None]:
# Scikit-learn style wrapper that builds the network with model_cnn; the
# remaining arguments are the training settings handed to the wrapper.
model = KerasClassifier(build_fn = model_cnn, epochs = 100, batch_size = 1024, 
                        verbose = 1, callbacks = [early_stop, red_lr])

In [None]:
# Draw the architecture of a freshly built model, with layer shapes and names.
plot_model(model_cnn(), show_shapes = True, show_layer_names = True)

In [None]:
def _plot_fold_curves(history, fold, metric, title, legend, loc):
    """Plot one training metric and its validation counterpart for a fold."""
    plt.figure(figsize = (15, 5))

    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])

    plt.title(f'Fold {fold + 1} - {title}', size = 16)
    plt.xlabel('Epoch')
    plt.legend(legend, loc = loc)
    plt.grid()
    plt.show()


# Per-fold predictions on the test set and per-fold validation accuracy.
preds = []
accuracy = []

cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 12)

# Select the feature columns explicitly so the test matrix has exactly the
# columns, in the same order, that the model is trained on.
test_features = df_test[feats]

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # NOTE(review): the held-out fold is used both for early stopping and for
    # the reported accuracy, so the score may be optimistic.
    history = model.fit(X_train, y_train, validation_data = (X_test, y_test))
    y_preds = model.predict(X_test)
    # Map encoded classes back to the original Cover_Type labels.
    y_preds = le.inverse_transform(y_preds)
    y_true = le.inverse_transform(y_test)
    acc = accuracy_score(y_true, y_preds)
    accuracy.append(acc)

    df_test_preds = model.predict(test_features)
    df_test_preds = le.inverse_transform(df_test_preds)
    preds.append(df_test_preds)

    print('*' * 20)
    print(f'*****    Summary of Fold {fold + 1}    *****')
    print('*' * 20)
    print(f'Acuuracy: {acc}')

    print('*' * 20)
    print(classification_report(y_true, y_preds, zero_division = 0))

    print('*' * 20)
    # Use the encoder's class array for both the matrix and its axis labels.
    # A set has no guaranteed order and could mislabel the rows and columns.
    cm = confusion_matrix(y_true, y_preds, labels = le.classes_)
    fig, ax = plt.subplots(figsize = (10,10))
    cmd = ConfusionMatrixDisplay(cm, display_labels = le.classes_)
    cmd.plot(cmap = plt.cm.Blues, ax = ax)
    plt.show()

    print('*' * 20)
    _plot_fold_curves(history, fold, 'accuracy', 'Model Accuracy',
                      ['Train accuracy', 'Test accuracy'], 4)
    _plot_fold_curves(history, fold, 'loss', 'Model loss',
                      ['Train loss', 'Test loss'], 7)

print('*' * 20)
print(f' Mean accuracy: {np.mean(accuracy)}')

## Submission

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
# Majority vote over the per-fold test predictions (one row of `preds` per fold).
# The mode array is flattened explicitly because scipy releases differ in
# whether the reduced axis is kept, so the column does not depend on that shape.
sub['Cover_Type'] = np.ravel(stats.mode(np.asarray(preds), axis = 0)[0])
sub.head()

In [None]:
# Write the submission; the file name embeds the mean cross-validation accuracy.
sub.to_csv(f'cnn_{np.mean(accuracy):0.7}.csv', index = False)