In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import LearningRateMonitor
import time
import gc
import torchmetrics


# Widen pandas' display limits (rows, columns, cell width) and render floats
# with six fixed decimal places.
for option_name, option_value in (
    ('display.max_rows', 150),
    ('display.max_columns', 500),
    ('display.max_colwidth', None),
    ('display.float_format', lambda x: '%.6f' % x),
):
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data import**

In [None]:
# Read the full train and test tables (pass nrows=... to read_csv for a quick smoke run).
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-dec-2021/train.csv",
        "/kaggle/input/tabular-playground-series-dec-2021/test.csv",
    )
)

In [None]:
# Column dtypes, non-null counts and the exact ("deep") memory footprint of the training frame.
train.info(memory_usage="deep")

In [None]:
# Column dtypes, non-null counts and the exact ("deep") memory footprint of the test frame.
test.info(memory_usage="deep")

# **EDA**

In [None]:
# Palette of named matplotlib colors shared by the bar charts below
# (15 entries; "sandybrown" appears twice).
colors = (
    "lightcoral sandybrown darkorange mediumseagreen "
    "lightseagreen cornflowerblue mediumpurple palevioletred "
    "lightskyblue sandybrown yellowgreen indianred "
    "lightsteelblue mediumorchid deepskyblue"
).split()

In [None]:
# First rows of the training frame, for a quick look at the raw columns.
train.head()

In [None]:
# Name of the label column.
target = "Cover_Type"

# Feature names are the training columns at positions 1..54 (position 0 is skipped —
# presumably the row identifier; verify against the CSV header).
features = train.columns[1:55].tolist()

In [None]:
# Number of training rows per target class, most frequent first.
train[target].value_counts()

In [None]:
# Pie chart comparing the number of rows in the train and test frames.
dataset_sizes = {"Train dataset": len(train), "Test dataset": len(test)}

fig, ax = plt.subplots(figsize=(5, 6))
pie = ax.pie(
    list(dataset_sizes.values()),
    labels=list(dataset_sizes.keys()),
    colors=["salmon", "teal"],
    textprops=dict(fontsize=15),
    autopct='%1.1f%%',
)
ax.set_title("Dataset length comparison", fontsize=18)
# Equal aspect ratio so the pie is drawn as a circle
ax.axis("equal")
fig.set_facecolor('white')
plt.show();

In [None]:
# Bar chart of the training target's class frequencies, annotated with counts and shares.
class_counts = train[target].value_counts().sort_index()
class_shares = class_counts.values / len(train)

fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.bar(class_counts.index, class_counts.values,
              color=colors, edgecolor="black")

ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_xlabel("Target label", fontsize=14, labelpad=20)
ax.set_ylabel("Count", fontsize=14, labelpad=15)
ax.tick_params(axis="x", pad=20)

# Absolute count above each bar, share of all training rows just inside its top
ax.bar_label(bars, class_counts.values, padding=3, fontsize=12)
ax.bar_label(bars, [f"{share*100:2.1f}%" for share in class_shares],
             padding=-20, fontsize=12)

ax.margins(0.025, 0.06)
ax.grid(axis="y")

plt.show();

In [None]:
# Summary statistics of every feature column in the training frame.
train[features].describe()

In [None]:
# Summary statistics of every feature column in the test frame.
test[features].describe()

In [None]:
# Stack train and test features so cardinality is measured over both datasets.
df = pd.concat([train[features], test[features]], axis=0).reset_index(drop=True)

# A feature with fewer than 10 distinct values is treated as categorical,
# every other feature as continuous.
distinct_counts = df[features].nunique()
cat_features = distinct_counts.index[distinct_counts < 10].tolist()
num_features = distinct_counts.index[distinct_counts >= 10].tolist()

print(f"There are {len(cat_features)} categorical features: {cat_features}")
print(f"\nThere are {len(num_features)} continuous features: {num_features}")

In [None]:
# Total number of missing cells in the train and test frames, shown as a pair.
train.isna().sum().sum(), test.isna().sum().sum()

There are no missing values in either dataset.

Let's check the distribution of feature values in both datasets.

In [None]:
# Overlaid train/test histograms for every continuous feature, one panel per feature.
df = pd.concat([train[num_features], test[num_features]], axis=0)
columns = df.columns.values

cols = 3
# Ceiling division: exactly as many rows as the features need, so a feature count
# that is a multiple of `cols` does not produce a fully blank trailing row.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,20), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the order of `columns`.
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Unused panel at the end of the last row
        ax.set_visible(False)
        continue

    col = columns[i]
    # Shared bin range over both datasets so the two histograms are comparable
    value_range = (df[col].min(), df[col].max())
    for frame, color, label in ((train, "deepskyblue", "Train Dataset"),
                                (test, "palevioletred", "Test Dataset")):
        ax.hist(frame[col].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)

    ax.set_title(col, fontsize=12, pad=5)
    # Pin the tick positions first, then relabel them in thousands ("k")
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    # One legend on the first panel is enough for the whole grid
    if i == 0:
        ax.legend(fontsize=10)

#plt.suptitle("Numerical feature values distribution in both datasets", y=0.99)
plt.show();

In [None]:
# Overlaid train/test histograms for every categorical feature, one panel per feature.
df = pd.concat([train[cat_features], test[cat_features]], axis=0)
columns = df.columns.values

cols = 4
# Ceiling division: exactly as many rows as the features need, so a feature count
# that is a multiple of `cols` does not produce a fully blank trailing row.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,40), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the order of `columns`.
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Unused panel at the end of the last row
        ax.set_visible(False)
        continue

    col = columns[i]
    # Shared bin range over both datasets so the two histograms are comparable
    value_range = (df[col].min(), df[col].max())
    for frame, color, label in ((train, "deepskyblue", "Train Dataset"),
                                (test, "palevioletred", "Test Dataset")):
        ax.hist(frame[col].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)

    ax.set_title(col, fontsize=12, pad=5)
    # Pin the tick positions first, then relabel them in thousands ("k")
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    # One legend on the first panel is enough for the whole grid
    if i == 0:
        ax.legend(fontsize=10)

#plt.suptitle("Categorical feature values distribution in both datasets", y=0.99)
plt.show();

It looks like soil types 7 and 15 do not have any examples. Let's check. If so, they can be dropped from the datasets.

In [None]:
# Count, over train and test together, the rows flagged with soil type 7 and soil type 15.
for soil_id in (7, 15):
    soil_col = f"Soil_Type{soil_id}"
    flagged_rows = (train[soil_col] == 1).sum() + (test[soil_col] == 1).sum()
    print(f"Rows with soil type {soil_id}: {flagged_rows}")

In [None]:
# Remove the two soil-type columns from both frames and from the feature bookkeeping lists.
empty_soil_cols = ["Soil_Type7", "Soil_Type15"]

for frame in (train, test):
    frame.drop(empty_soil_cols, axis=1, inplace=True)

for empty_col in empty_soil_cols:
    features.remove(empty_col)
    cat_features.remove(empty_col)

In [None]:
# The five continuous features with the fewest distinct values in the training frame.
print("Numerical features with the least amount of unique values:")
train[num_features].nunique().sort_values().head(5)

Some samples can have several wilderness area and soil types, as you can see below.

In [None]:
# For each indicator group, show how many rows have 0, 1, 2, ... indicators set.
wilderness_cols = [f"Wilderness_Area{k}" for k in range(1, 5)]
soil_cols = [x for x in train.columns if "Soil_Type" in x]

for type_cols in (wilderness_cols, soil_cols):
    display(train[type_cols].sum(axis=1).value_counts().sort_index())

Let's check how the target distribution differs for samples with different numbers of said types.

In [None]:
# Percentage of each target class among rows grouped by how many wilderness-area flags are set.
print("Target distribution per amount of wildernes area types")
df = train[[f"Wilderness_Area{k}" for k in range(1, 5)]].sum(axis=1)
flag_counts = df.value_counts()

# Rows: target classes; columns: one per observed number of set flags; cells start at 0
df_2 = pd.DataFrame(columns=[f"{n} wild_types" for n in flag_counts.sort_index().index],
                    index=list(train[target].value_counts().sort_index().index))
df_2.fillna(0, inplace=True)

for i in flag_counts.index:
    group_targets = train.loc[df == i, target]
    total_samples = len(group_targets)
    samples_per_class = group_targets.value_counts().sort_index()
    for sample_index, class_count in samples_per_class.items():
        df_2.loc[sample_index, f"{i} wild_types"] = round((class_count * 100 / total_samples), 4)
df_2

In [None]:
# Percentage of each target class among rows grouped by how many soil-type flags are set.
print("Target distribution per amount of soil types")
df = train[[x for x in train.columns if "Soil_Type" in x]].sum(axis=1)
flag_counts = df.value_counts()

# Rows: target classes; columns: one per observed number of set flags; cells start at 0
df_2 = pd.DataFrame(columns=[f"{n} soil_types" for n in flag_counts.sort_index().index],
                    index=list(train[target].value_counts().sort_index().index))
df_2.fillna(0, inplace=True)

for i in flag_counts.index:
    group_targets = train.loc[df == i, target]
    total_samples = len(group_targets)
    samples_per_class = group_targets.value_counts().sort_index()
    for sample_index, class_count in samples_per_class.items():
        df_2.loc[sample_index, f"{i} soil_types"] = round((class_count * 100 / total_samples), 4)
df_2

As you can see, the proportion of some classes varies with the number of wilderness area and soil types. It's a good idea to add two new features showing the number of said types per sample.

To be continued...

In [None]:
# %%time
# from sklearn.decomposition import PCA, SparsePCA, KernelPCA
# pca = PCA(n_components=2)
# data_2D = pca.fit_transform(MinMaxScaler().fit_transform(train[cat_features]))
# pca = SparsePCA(n_components=2)
# data_2D = pca.fit_transform(MinMaxScaler().fit_transform(train[cat_features]))
# pca = KernelPCA(n_components=2, eigen_solver='randomized', kernel="poly")
# data_2D = pca.fit_transform(MinMaxScaler().fit_transform(train[cat_features]).astype("float32"))

In [None]:
# fig, ax = plt.subplots(figsize=(16,16))
# # fig_colors = train[target].copy()
# # fig_colors = fig_colors.map({1:"lightcoral", 2:"sandybrown", 3:"darkorange", 4:"mediumseagreen", 5:"cornflowerblue", 6:"mediumpurple"})

# # scatter = ax.scatter(data_2D[:10000, 0], data_2D[:10000, 1],
# #                  c=train.loc[:9999,target], cmap="hsv",
# #                  s=4)

# scatter = ax.scatter(data_2D[:, 0], data_2D[:, 1],
#                  c=train.loc[:,target], cmap="hsv",
#                  s=4)

# legend = ax.legend(*scatter.legend_elements(),
#                     loc="upper right", title="target", fontsize="large")
# ax.add_artist(legend)

# **Data preprocessing**

In [None]:
# Remove every row labelled as class 5 (per the original note, a single example)
# and renumber the index so it is contiguous again.
rare_class_rows = train.index[train[target] == 5]
train.drop(rare_class_rows, axis=0, inplace=True)
train.reset_index(drop=True, inplace=True)

# Number of classes left after the removal, and the encoder fitted later on the target
NUM_CLASSES = train[target].nunique()
label_enc = LabelEncoder()

In [None]:
# # Transforming Aspect feature to be in [0, 359] range 
# train.loc[train["Aspect"]<0, "Aspect"] = train.loc[train["Aspect"]<0, "Aspect"] + 360
# train.loc[train["Aspect"]>=360, "Aspect"] = train.loc[train["Aspect"]>=360, "Aspect"] - 360

# test.loc[test["Aspect"]<0, "Aspect"] = test.loc[test["Aspect"]<0, "Aspect"] + 360
# test.loc[test["Aspect"]>=360, "Aspect"] = test.loc[test["Aspect"]>=360, "Aspect"] - 360

# train["Aspect"].min(), train["Aspect"].max(), test["Aspect"].min(), test["Aspect"].max()

In [None]:
# # Clipping Hillshade features outliers to [0, 255] range 

# for df in [train, test]:
#     df.loc[df["Hillshade_9am"] < 0, "Hillshade_9am"] = 0
#     df.loc[df["Hillshade_Noon"] < 0, "Hillshade_Noon"] = 0
#     df.loc[df["Hillshade_3pm"] < 0, "Hillshade_3pm"] = 0
#     df.loc[df["Hillshade_9am"] > 255, "Hillshade_9am"] = 255
#     df.loc[df["Hillshade_Noon"] > 255, "Hillshade_Noon"] = 255
#     df.loc[df["Hillshade_3pm"] > 255, "Hillshade_3pm"] = 255
    
# train[["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]].min(), \
# train[["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]].max(), \
# test[["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]].min(), \
# test[["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]].max()

In [None]:
# # A new feature indicating that the patch is located lower than the closest water source
# train["lower_than_water"] = (train["Vertical_Distance_To_Hydrology"] < 0).astype("int16")
# test["lower_than_water"] = (test["Vertical_Distance_To_Hydrology"] < 0).astype("int16")
# features.append("lower_than_water")
# cat_features.append("lower_than_water")

In [None]:
# Two count features per row: how many wilderness-area flags and how many
# soil-type flags are set. Both are registered as categorical features.
wilderness_cols = [f"Wilderness_Area{k}" for k in range(1, 5)]
soil_cols = [x for x in train.columns if "Soil_Type" in x]

for frame in (train, test):
    frame["wild_areas_sum"] = frame[wilderness_cols].sum(axis=1)
    frame["soil_types_sum"] = frame[soil_cols].sum(axis=1)

new_count_features = ["wild_areas_sum", "soil_types_sum"]
features.extend(new_count_features)
cat_features.extend(new_count_features)

In [None]:
# Two distance features built from the horizontal and vertical distances to hydrology:
# the Euclidean ("straight") distance and the plain sum. Both are registered as continuous.
for frame in (train, test):
    horizontal = frame["Horizontal_Distance_To_Hydrology"]
    vertical = frame["Vertical_Distance_To_Hydrology"]
    frame["straight_dist_to_hydrology"] = (horizontal**2 + vertical**2)**0.5
    frame["sum_dist_to_hydrology"] = horizontal + vertical

new_distance_features = ["straight_dist_to_hydrology", "sum_dist_to_hydrology"]
features.extend(new_distance_features)
num_features.extend(new_distance_features)

In [None]:
# Standardise each continuous feature separately: the scaler is re-fitted on the
# training column and the same fitted statistics are applied to the test column.
s_scaler = StandardScaler()
for col in num_features:
    train_column = train[col].to_numpy().reshape(-1, 1)
    test_column = test[col].to_numpy().reshape(-1, 1)
    train[col] = s_scaler.fit_transform(train_column)
    test[col] = s_scaler.transform(test_column)

In [None]:
# Reducing datasets memory size due to converting columns into lighter formats
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (int16/int32/int64) are converted to the smallest signed
    integer type whose range contains the column's minimum and maximum, bounds
    included. Float columns (float16/float32/float64) are converted to float32
    when their values lie strictly inside the float32 range, otherwise to
    float64. Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column that exactly touches a type's limits
            # (e.g. a maximum of 127) still fits that type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            float32_limits = np.finfo(np.float32)
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink both frames (train first, then test); each call prints its own memory report.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Integer-encode the target labels and wrap them in a Series with a default index.
encoded_target = label_enc.fit_transform(train[target])
y = pd.Series(encoded_target)

# Independent copies of the feature matrices, so later scaling leaves train/test untouched.
X_nn, X_test_nn = train[features].copy(), test[features].copy()

# # Generating OneHot encoded targets
# ohe = OneHotEncoder(sparse=False)
# y = ohe.fit_transform(np.array(train[target]).reshape(-1,1))
# y[:5]

In [None]:
# Final list of columns fed to the network.
X_nn.columns

In [None]:
# Min-max scale every model input column separately: the scaler is re-fitted on the
# training column and the same fitted range is applied to the test column.
mm_scaler = MinMaxScaler()
for col in X_nn.columns:
    train_column = X_nn[col].to_numpy().reshape(-1, 1)
    test_column = X_test_nn[col].to_numpy().reshape(-1, 1)
    X_nn[col] = mm_scaler.fit_transform(train_column)
    X_test_nn[col] = mm_scaler.transform(test_column)

# Convert the scaled test matrix to a float32 tensor
X_test_nn = torch.tensor(X_test_nn.to_numpy(), dtype=torch.float32)

In [None]:
# Mini-batch size; used as the default batch size of prepare_datasets.
BATCH_SIZE = 4096

In [None]:
def prepare_datasets(X_nn, X_valid_nn, y_nn, y_valid_nn, batch_size=BATCH_SIZE,
                     num_workers=4, shuffle_train=True):
    """Build the train and validation DataLoaders for one split.

    Parameters
    ----------
    X_nn, X_valid_nn : objects exposing ``to_numpy()`` (e.g. pandas DataFrame)
        Training and validation features; converted to float32 tensors.
    y_nn, y_valid_nn : objects exposing ``to_numpy()`` (e.g. pandas Series)
        Training and validation labels; converted to int64 (long) tensors.
    batch_size : int, default BATCH_SIZE
        Batch size of both loaders.
    num_workers : int, default 4
        Worker processes used by each loader.
    shuffle_train : bool, default True
        Reshuffle the training samples every epoch, so batches are not the
        same rows in the same order each time. The validation loader is
        never shuffled.

    Returns
    -------
    (DataLoader, DataLoader)
        Training loader and validation loader, in that order.
    """
    def _to_tensor_dataset(feature_frame, labels):
        # Pair float32 features with long labels in one TensorDataset
        return TensorDataset(
            torch.tensor(feature_frame.to_numpy(), dtype=torch.float32),
            torch.tensor(labels.to_numpy(), dtype=torch.long),
        )

    train_set = _to_tensor_dataset(X_nn, y_nn)
    valid_set = _to_tensor_dataset(X_valid_nn, y_valid_nn)

    print("Using these datasets:")
    print(f"Train_ds elements: {len(train_set)}")
    print(f"Valid_ds elements: {len(valid_set)}")

    # Batched loaders; the last, smaller batch is kept (drop_last=False)
    train_loader = DataLoader(train_set, batch_size, shuffle=shuffle_train,
                              drop_last=False, num_workers=num_workers)
    valid_loader = DataLoader(valid_set, batch_size, shuffle=False,
                              drop_last=False, num_workers=num_workers)

    # Print the shape of the first batch of each loader as a sanity check
    for data, label in train_loader:
        print(f"Train_ds batch: {data.shape}, {label.shape}")
        break
    for data, label in valid_loader:
        print(f"Valid_ds batch: {data.shape}, {label.shape}")
        break
    return train_loader, valid_loader

# **Model training**

In [None]:
# "Balanced" per-class weights computed from the encoded target.
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
# Cap the weights of encoded classes 3 and 4 (the smallest classes) at the weight
# of class 0 so that they do not receive too much attention.
class_weights[[3, 4]] = class_weights[0]
class_weights = torch.tensor(class_weights, dtype=torch.float32)
class_weights

In [None]:
# A function to initialize weights using Glorot normal initialization
def initialize_weights(m):
    """Re-initialise the weight matrix of a linear layer with Xavier (Glorot) normal values.

    Intended to be passed to ``Module.apply``: modules that are not ``nn.Linear``
    are ignored, and the bias of a linear layer is left as it is.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_normal_(m.weight.data)

In [None]:
# A custom callback to log required parameters like metrics and learning rate
# since TensorBoard is blocked on Kaggle.
class ParamsTracker(pl.callbacks.Callback):
    """Callback that keeps per-epoch histories of metrics and learning rate.

    The values are read from ``trainer.logged_metrics`` (keys ``loss_epoch``,
    ``train_acc``, ``val_loss`` and ``val_acc``) and from the first optimizer's
    first parameter group. With ``verbose`` enabled, a one-line summary is
    printed at the end of every training epoch.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        # One history list per tracked quantity
        self.train_loss, self.train_acc = [], []
        self.val_loss, self.val_acc = [], []
        self.lr_epoch_start = []

    def on_train_epoch_start(self, trainer, module):
        """Store the learning rate the first optimizer has when the epoch begins."""
        optimizer_state = trainer.optimizers[0].state_dict()
        self.lr_epoch_start.append(optimizer_state["param_groups"][0]["lr"])

    def on_validation_epoch_end(self, trainer, module):
        """Store the latest logged validation loss and accuracy."""
        logged = trainer.logged_metrics
        for history, key in ((self.val_loss, "val_loss"), (self.val_acc, "val_acc")):
            history.append(logged[key].item())

    def on_train_epoch_end(self, trainer, module):
        """Store the latest logged training loss and accuracy, then optionally print a summary."""
        logged = trainer.logged_metrics
        for history, key in ((self.train_loss, "loss_epoch"), (self.train_acc, "train_acc")):
            history.append(logged[key].item())

        # The summary is printed here because, per the original author's note, this
        # event happens after on_validation_epoch_end, so the validation values of
        # the epoch are already stored. The explicit comparison with True is kept.
        if not (self.verbose == True):
            return
        summary = (
            f"Epoch {module.current_epoch} start learning rate: {self.lr_epoch_start[-1]:.6f}, "
            f"train_loss: {self.train_loss[-1]:.4f}, "
            f"train_acc: {self.train_acc[-1]:.4f}, "
            f"val_loss: {self.val_loss[-1]:.4f}, "
            f"val_acc: {self.val_acc[-1]:.4f}"
        )
        print(summary)

In [None]:
# Defining model parameters
class Model(pl.LightningModule):
    """Fully connected classifier: input -> 128 -> 64 -> 32 -> ``num_classes`` logits.

    ReLU and dropout (p=0.05) follow each of the first three linear layers; the
    output layer returns raw logits, which is what ``nn.CrossEntropyLoss`` expects.

    Parameters
    ----------
    input_shape : int
        Number of input features.
    class_weights : torch.Tensor
        Accepted for interface compatibility. It is currently unused because the
        weighted variant of the loss is disabled (see ``self.loss``).
    num_classes : int, default 6
        Number of output classes; sizes the output layer and both accuracy metrics.
    """

    def __init__(self, input_shape, class_weights, num_classes=6):
        super().__init__()

        # Input layer
        self.input = nn.Linear(input_shape,128)
        # Hidden layers
        self.hidden1 = nn.Linear(128,64)
        self.hidden2 = nn.Linear(64,32)
        # Output layer (one logit per class)
        self.output = nn.Linear(32,num_classes)

        # Dropout rate
        self.dr = 0.05
        # Activation functions
        self.activation = F.relu
        self.softmax = nn.Softmax(dim=1)
        # Metrics (separate objects so train and validation states never mix)
        self.train_acc_metric = torchmetrics.Accuracy(num_classes=num_classes, average="micro")
        self.val_acc_metric = torchmetrics.Accuracy(num_classes=num_classes, average="micro")
        # Unweighted loss; pass weight=class_weights here to enable class weighting
        self.loss = nn.CrossEntropyLoss()#weight=class_weights)

        self.flag = False

    def forward(self, x):
        """Return the raw class logits for a batch of feature rows."""
        for layer in (self.input, self.hidden1, self.hidden2):
            x = self.activation(layer(x))
            # Functional dropout is only active while the module is in training mode
            x = F.dropout(x,p=self.dr,training=self.training)
        return self.output(x)

    def training_step(self, batch, batch_idx):
        """Compute the loss of one training batch and update the running train accuracy."""
        X, y = batch
        y_hat = self(X)
        loss = self.loss(y_hat, y)
        self.train_acc_metric(torch.argmax(y_hat, dim=1), y)
        self.log('loss', loss, prog_bar=True, on_epoch=True, logger=True)
        return {'loss': loss,}

    def training_epoch_end(self, outputs):
        """Log the accuracy accumulated over the epoch's training batches, then reset it."""
        train_acc = self.train_acc_metric.compute()
        self.log('train_acc', train_acc, prog_bar=True, on_epoch=True, on_step=False, logger=True)
        self.train_acc_metric.reset()

    def validation_step(self, batch, batch_idx):
        """Compute the loss of one validation batch and update the running validation accuracy."""
        X, y = batch
        y_hat = self(X)
        val_loss = self.loss(y_hat, y)
        self.val_acc_metric(torch.argmax(y_hat, dim=1), y)
        self.log('val_loss',val_loss, prog_bar=True, on_epoch=True, logger=True)
        return {'val_loss': val_loss}

    def validation_epoch_end(self, outputs):
        """Log the accuracy accumulated over the epoch's validation batches, then reset it."""
        val_acc = self.val_acc_metric.compute()
        self.log('val_acc', val_acc, prog_bar=True, on_epoch=True, on_step=False, logger=True)
        self.val_acc_metric.reset()
        return {'val_acc': val_acc}

    def predict_step(self, X, batch_idx, dataloader_idx = None):
        """Return class probabilities for a batch whose first element holds the features."""
        return self.softmax(self(X[0]))

    def configure_optimizers(self):
        """Use AdamW, halving the learning rate when ``val_acc`` stops improving.

        The scheduler works in ``max`` mode on the monitored ``val_acc`` with a
        patience of 7 epochs, an absolute threshold of 0.002 and a floor of 1e-4.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, eps=1e-8, weight_decay=1e-2, amsgrad=False)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5,
                                                                  patience=7, min_lr=1e-04, eps=1e-08,
                                                                  verbose=False, threshold=0.002, threshold_mode="abs")
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler, "monitor": "val_acc"}

In [None]:
# Trains the model using given train and valid datasets
# and returns filepath of the best saved model
def train_ann(train_ds, valid_ds, class_weights, Model=Model, input_shape=X_nn.shape[1]):
    """Train a freshly initialised model on one train/validation pair.

    Parameters
    ----------
    train_ds, valid_ds
        Training and validation dataloaders handed to ``trainer.fit``.
    class_weights
        Forwarded to the ``Model`` constructor.
    Model
        Model class to instantiate as ``Model(input_shape, class_weights)``.
    input_shape
        Number of input features.

    Returns
    -------
    tuple
        Path of the checkpoint with the best ``val_acc`` and the ``ParamsTracker``
        callback holding the metric histories of the run.
    """
    model = Model(input_shape, class_weights)
    model.apply(initialize_weights)

    # Custom callback that records and prints the per-epoch values
    params_tracker_callback = ParamsTracker(verbose=True)

    # Keeps the weights of the epoch with the highest validation accuracy
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath="models",
        filename='model_{val_acc:.4}',
        monitor='val_acc',
        mode='max',
        save_weights_only=True,
    )

    # Stops once val_acc has not improved by at least min_delta for `patience` epochs
    early_stop_callback = EarlyStopping(
        monitor='val_acc',
        min_delta=0.0004,
        patience=20,
        verbose=False,
        mode='max',
    )

    trainer_settings = dict(
        fast_dev_run=False,
        max_epochs=60,
        precision=32,
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        num_sanity_val_steps=0,
        check_val_every_n_epoch=1,
        val_check_interval=1.0,
        callbacks=[checkpoint_callback, early_stop_callback, params_tracker_callback],
    )
    trainer = pl.Trainer(**trainer_settings)

    trainer.fit(model, train_ds, valid_ds)

    # Leave the trained model in evaluation mode
    model.eval()

    return checkpoint_callback.best_model_path, params_tracker_callback

In [None]:
%%time
# Fold splitting parameters
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Out-of-fold class predictions and fold-averaged test class probabilities
nn_oof_preds = np.zeros((X_nn.shape[0],))
nn_test_preds = np.zeros((X_test_nn.shape[0],6))
total_mean_acc = 0

# Generating folds and making training and prediction for each of them
for num, (train_idx, valid_idx) in enumerate(skf.split(X_nn, y)):
    print(f"\n\n===Training with fold {num}")
    # skf.split yields positional indices, so select rows by position (iloc)
    X_train, X_valid = X_nn.iloc[train_idx], X_nn.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Preparing datasets
    train_ds, valid_ds = prepare_datasets(X_train, X_valid, y_train, y_valid)

    # Training model
    best_model_path, tracked_values = train_ann(train_ds, valid_ds, class_weights, Model, X_nn.shape[1])

    # Loading weights of the best model
    model = Model(X_nn.shape[1], class_weights)
    model.load_state_dict(torch.load(best_model_path)['state_dict'])
    model.eval()

    # Predicted class of every validation row; no_grad avoids building an
    # autograd graph that is never used during inference
    with torch.no_grad():
        valid_logits = model(torch.tensor(X_valid.to_numpy()).float())
    preds = np.argmax(valid_logits.numpy(), axis=1)
    print(preds)

    # Calculating and printing this fold's accuracy score
    fold_score = accuracy_score(y_valid, preds)
    print(f"\n===Fold {num} valid data accuracy score is {fold_score}")

    # Class probabilities for the test rows
    with torch.no_grad():
        test_preds = nn.Softmax(dim=1)(model(X_test_nn)).numpy()

    # Saving preds in corresponding arrays (test probabilities are averaged over folds)
    nn_oof_preds[valid_idx] = preds
    nn_test_preds += test_preds / splits

    total_mean_acc += fold_score / splits

print(f"Average accuracy score of all models is {total_mean_acc}")

In [None]:
# Take the most probable class per test row, map it back to the original labels
# and write the submission file.
predicted_labels = label_enc.inverse_transform(np.argmax(nn_test_preds, axis=1))
predictions = pd.DataFrame({"Id": test["Id"], "Cover_Type": predicted_labels})

predictions.to_csv('submission.csv', index=False, header=predictions.columns)
predictions.head()

In [None]:
# Bar chart of the predicted class frequencies, annotated with counts and shares.
pred_counts = predictions["Cover_Type"].value_counts().sort_index()
# Shares are relative to the number of predictions, not to the training set size
pred_shares = pred_counts.values / len(predictions)

fig, ax = plt.subplots(figsize=(14, 8))

bars = ax.bar(pred_counts.index,
              pred_counts.values,
              color=colors,
              edgecolor="black")
ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_ylabel("Count", fontsize=14, labelpad=15)
ax.set_xlabel("Target label", fontsize=14, labelpad=20)
ax.tick_params(axis="x", pad=20)
# Absolute count above each bar, share of all predictions just inside its top
ax.bar_label(bars, pred_counts.values,
             padding=3, fontsize=12)
ax.bar_label(bars, [f"{x*100:2.1f}%" for x in pred_shares],
             padding=-20, fontsize=12)
ax.margins(0.025, 0.06)
ax.grid(axis="y")

plt.show();