### In this notebook I am trying to approach this problem as a classification task. 

The target variable "loss" consists of 43 discrete integer values, instead of continuous values (float). That opens up a possibility to solve this problem as a **multi-class classification task** instead of a **regression task** (as suggested by the competition organizers).

I have used LightGBM as the algorithm and multi-class log loss (`log_loss`) as the evaluation metric.

However, as we will see, **this solution performs much worse than the regression approach.**

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
def _get_X_Y_DF_from_CV(train_X, train_Y, train_index, validation_index):
    X_train, X_validation = (
        train_X.iloc[train_index],
        train_X.iloc[validation_index],
    )
    y_train, y_validation = (
        train_Y.iloc[train_index],
        train_Y.iloc[validation_index],
    )
    return X_train, X_validation, y_train, y_validation

## Load Data

In [None]:
# Train and test frames use the "id" column as their index; the sample
# submission keeps "id" as an ordinary column.
train_df = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/train.csv',
    index_col="id",
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/test.csv',
    index_col="id",
)
sample_submission_df = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/sample_submission.csv'
)

## Define the constants & parameters

In [None]:
# Column names used throughout the notebook
TARGET = "loss"
ID = "id"

# Seed shared by every stochastic component
SEED = 42

# Number of distinct target values, i.e. classes of the multi-class model
NUM_CLASSES = 43

# Training budget: upper bound on boosting rounds and early-stopping patience
EARLY_STOPPING_ROUNDS = 100
N_ESTIMATORS = 1000

# Define Parameters for LGBM
lgb_params = {
    "objective": "multiclass",
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    # Single source of truth: must always match the width of the prediction arrays
    "num_class": NUM_CLASSES,
    "num_leaves": 31,
    "tree_learner": "serial",
    "n_jobs": 4,
    "seed": SEED,
    "max_depth": -1,
    "max_bin": 255,
    "metric": "multi_logloss",
    "verbose": -1,
}

## Split the Data & Define CV Method

In [None]:
# Separate the features from the target; test_df is used as-is for the test features.
train_X = train_df.drop(columns=[TARGET])
train_Y = train_df[TARGET]
test_X = test_df
print(f"Shape of train_X : {train_X.shape}, test_X: {test_X.shape}, train_Y: {train_Y.shape}")

predictors = list(train_X.columns)
print(f"List of features to be used {predictors}")

# Selecting n_splits to be 3, since class 42 has
# just 3 instances.
# random_state is fixed so that the shuffled folds (and therefore every score
# reported below) are identical on each Restart & Run All.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

## Build the model

In [None]:
# Out-of-fold class probabilities for every training row, and the running sum
# of test-set class probabilities (divided by the fold count after the loop).
y_oof = np.zeros(shape=(len(train_X), NUM_CLASSES))
y_predicted = np.zeros(shape=(len(test_X), NUM_CLASSES))
cv_scores = []

# Full label set handed to log_loss so the probability columns stay aligned
# with classes 0..NUM_CLASSES-1 even when a fold is missing a rare class.
class_labels = np.arange(NUM_CLASSES)

n_folds = kf.get_n_splits()
for fold, (train_index, validation_index) in enumerate(
    kf.split(X=train_X, y=train_Y), start=1
):
    print(f"fold {fold} of {n_folds}")

    X_train, X_validation, y_train, y_validation = _get_X_Y_DF_from_CV(
        train_X, train_Y, train_index, validation_index
    )

    # Feature names / categorical handling are declared on the Dataset itself;
    # the validation Dataset inherits them through `reference`.
    lgb_train = lgb.Dataset(
        X_train, y_train, feature_name=predictors, categorical_feature="auto"
    )
    lgb_eval = lgb.Dataset(X_validation, y_validation, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
    # accepted by recent LightGBM releases.
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        num_boost_round=N_ESTIMATORS,
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=10),
        ],
    )

    # Free the training-side objects before predicting to keep memory low.
    del lgb_train, lgb_eval, train_index, X_train, y_train
    gc.collect()

    # Class probabilities for the held-out rows of this fold.
    fold_oof = model.predict(X_validation, num_iteration=model.best_iteration)
    y_oof[validation_index] = fold_oof

    # Accumulate test probabilities; averaged across folds after the loop.
    y_predicted += model.predict(
        test_X.values, num_iteration=model.best_iteration
    )

    best_iteration = model.best_iteration
    print(f"Best number of iterations for fold {fold} is: {best_iteration}")

    cv_oof_score = metrics.log_loss(y_validation, fold_oof, labels=class_labels)
    cv_scores.append(cv_oof_score)
    print(f"CV OOF Score for fold {fold} is {cv_oof_score}")

    del validation_index, X_validation, y_validation, fold_oof
    gc.collect()

# Average the summed test probabilities and summarise the CV scores.
y_predicted /= n_folds
oof_score = round(metrics.log_loss(train_Y, y_oof, labels=class_labels), 5)
avg_cv_scores = round(sum(cv_scores) / len(cv_scores), 5)
std_cv_scores = round(np.array(cv_scores).std(), 5)

## Check the log_loss scores

In [None]:
# Report the overall OOF log loss plus the mean and spread of the per-fold scores.
print(
    f"Out of Fold (log_loss) score {oof_score}",
    f"Avg CV (log_loss) score {avg_cv_scores}",
    f"Avg CV (log_loss) std {std_cv_scores}",
    sep="\n",
)

### At this stage, the predicted values are the probabilities of each of the 43 classes. In the next step, we select the class with the highest probability as the predicted class for each row of the test data.

In [None]:
# Sanity check: one row per test instance, one probability column per class
y_predicted.shape

In [None]:
# Collapse each row of class probabilities to the index of its largest entry,
# for both the averaged test predictions and the out-of-fold predictions.
class_prediction = y_predicted.argmax(axis=1)
class_y_oof = y_oof.argmax(axis=1)

## Calculate the OOF RMSE score

In [None]:
# RMSE between the true target and the arg-max class of the OOF probabilities
mse_oof_score = metrics.mean_squared_error(train_Y, class_y_oof)
rmse_oof_score = np.sqrt(mse_oof_score)
print(f"RMSE score on the OOF data {rmse_oof_score}")

### The RMSE score of the classification approach is worse than that of the regression approach

## Create the sample submission file

In [None]:
# Write the predicted classes into the target column. Item assignment is used
# instead of attribute assignment (`df.loss = ...`), which only works when the
# column already exists and otherwise silently sets a plain Python attribute.
# NOTE(review): assumes the sample submission rows are in the same order as
# test_df - confirm against the competition files.
sample_submission_df[TARGET] = class_prediction
sample_submission_df.to_csv('sample_submission.csv', index=False)
sample_submission_df.head()