In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
import optuna

# Pandas setting to display more dataset rows and columns
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data import**

In [None]:
train = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv", low_memory=False)#, nrows=10000)
test = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/test.csv", low_memory=False)#, nrows=10000)

In [None]:
# Reducing datasets memory size due to converting columns into lighter formats
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's minimum and maximum (bounds inclusive).
    Float columns are converted to float32 when their minimum and maximum lie
    inside the float32 range, otherwise to float64. Columns whose dtype is not
    listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Take the first (narrowest) integer type that can hold every value.
            # Bounds are inclusive so that e.g. a maximum of exactly 127 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Only the value range is checked here: float64 -> float32 can lose precision.
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size so the report never divides by zero
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
train.info(memory_usage="deep")

In [None]:
test.info(memory_usage="deep")

# **EDA**

In [None]:
# Colors to be used for plots
colors = ["lightcoral", "sandybrown", "darkorange", "mediumseagreen",
          "lightseagreen", "cornflowerblue", "mediumpurple", "palevioletred",
          "lightskyblue", "sandybrown", "yellowgreen", "indianred",
          "lightsteelblue", "mediumorchid", "deepskyblue"]

In [None]:
train.head()

In [None]:
target = "Cover_Type"

features = list(train.columns[1:55])

In [None]:
train[target].value_counts()

In [None]:
# Pie chart comparing the number of rows in the train and test sets
dataset_sizes = [len(train), len(test)]
dataset_names = ["Train dataset", "Test dataset"]

fig, ax = plt.subplots(figsize=(5, 6))
fig.set_facecolor('white')
pie = ax.pie(
    dataset_sizes,
    labels=dataset_names,
    colors=["salmon", "teal"],
    autopct='%1.1f%%',
    textprops={"fontsize": 15},
)
ax.set_title("Dataset length comparison", fontsize=18)
# Equal aspect ratio keeps the pie circular
ax.axis("equal")
plt.show();

In [None]:
# Class frequencies ordered by label; computed once and reused for bars and annotations
class_counts = train[target].value_counts().sort_index()
class_shares = class_counts.values / len(train)

fig, ax = plt.subplots(figsize=(14, 8))

bars = ax.bar(
    class_counts.index,
    class_counts.values,
    color=colors,
    edgecolor="black",
)
ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_xlabel("Target label", fontsize=14, labelpad=20)
ax.set_ylabel("Count", fontsize=14, labelpad=15)
ax.tick_params(axis="x", pad=20)

# Absolute count above each bar, share of the train set just below the bar top
ax.bar_label(bars, class_counts.values, padding=3, fontsize=12)
ax.bar_label(bars, [f"{x*100:2.1f}%" for x in class_shares], padding=-20, fontsize=12)

ax.margins(0.025, 0.06)
ax.grid(axis="y")

plt.show();

In [None]:
train[features].describe()

In [None]:
test[features].describe()

In [None]:
# Stack train and test so cardinality is measured over all available rows
df = pd.concat([train[features], test[features]], axis=0).reset_index(drop=True)

# Features with fewer than 10 distinct values are treated as categorical,
# everything else as continuous.
distinct_counts = df[features].nunique()
unique_values = distinct_counts >= 10
cat_features = distinct_counts.index[distinct_counts < 10].tolist()
num_features = distinct_counts.index[unique_values].tolist()

print(f"There are {len(cat_features)} categorical features: {cat_features}")
print(f"\nThere are {len(num_features)} continuous features: {num_features}")

In [None]:
train.isna().sum().sum(), test.isna().sum().sum()

There are no missing values in either dataset.

Let's check the distribution of feature values in both datasets.

In [None]:
# Overlaid train/test histograms for every continuous feature.
df = pd.concat([train[num_features], test[num_features]], axis=0)
columns = df.columns.values

cols = 3
# Ceil division: just enough rows for all features, without a spare empty row
# when the feature count is an exact multiple of `cols`.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,20), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the order of `columns`
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Hide unused cells at the end of the grid
        ax.set_visible(False)
        continue

    name = columns[i]
    # Shared bin range so the train and test histograms are directly comparable
    value_range = (df[name].min(), df[name].max())
    for data, color, label in ((train, "deepskyblue", "Train Dataset"),
                               (test, "palevioletred", "Test Dataset")):
        ax.hist(data[name].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)
    ax.set_title(name, fontsize=12, pad=5)
    # Pin the current tick positions before relabelling them in thousands
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    if i == 0:
        # One legend on the first panel is enough for the whole grid
        ax.legend(fontsize=10)

plt.show();

In [None]:
# Overlaid train/test histograms for every categorical feature.
df = pd.concat([train[cat_features], test[cat_features]], axis=0)
columns = df.columns.values

cols = 4
# Ceil division: just enough rows for all features, without a spare empty row
# when the feature count is an exact multiple of `cols`.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,40), sharex=False, squeeze=False)

plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, matching the order of `columns`
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Hide unused cells at the end of the grid
        ax.set_visible(False)
        continue

    name = columns[i]
    # Shared bin range so the train and test histograms are directly comparable
    value_range = (df[name].min(), df[name].max())
    for data, color, label in ((train, "deepskyblue", "Train Dataset"),
                               (test, "palevioletred", "Test Dataset")):
        ax.hist(data[name].values,
                range=value_range,
                bins=40,
                color=color,
                edgecolor="black",
                alpha=0.7,
                label=label)
    ax.set_title(name, fontsize=12, pad=5)
    # Pin the current tick positions before relabelling them in thousands
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels([str(int(tick/1000))+"k" for tick in ax.get_yticks()])
    ax.tick_params(axis="y", labelsize=10)
    ax.tick_params(axis="x", labelsize=10)
    ax.grid(axis="y")
    if i == 0:
        # One legend on the first panel is enough for the whole grid
        ax.legend(fontsize=10)

plt.show();

It looks like soil types 7 and 15 do not have any examples. Let's check. If so, they can be dropped from the datasets.

In [None]:
print(f"Rows with soil type 7: {(train['Soil_Type7'] == 1).sum() + (test['Soil_Type7'] == 1).sum()}")
print(f"Rows with soil type 15: {(train['Soil_Type15'] == 1).sum() + (test['Soil_Type15'] == 1).sum()}")

In [None]:
# Remove the two soil-type columns flagged above from both frames and from the
# feature lists. Written so the cell can be re-run safely: columns that are
# already gone are ignored instead of raising KeyError / ValueError.
empty_soil_cols = ["Soil_Type7", "Soil_Type15"]

train.drop(columns=empty_soil_cols, errors="ignore", inplace=True)
test.drop(columns=empty_soil_cols, errors="ignore", inplace=True)

for column_name in empty_soil_cols:
    for feature_list in (features, cat_features):
        if column_name in feature_list:
            feature_list.remove(column_name)

In [None]:
print("Numerical features with the least amount of unique values:")
train[num_features].nunique().sort_values().head(5)

Some samples can have several wilderness area and soil types, as you can see below.

In [None]:
display(train[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].sum(axis=1).value_counts().sort_index())
display(train[[x for x in train.columns if "Soil_Type" in x]].sum(axis=1).value_counts().sort_index())

Let's check how the target distribution differs for samples with different numbers of said types.

In [None]:
print("Target distribution per amount of wildernes area types")
# Number of wilderness-area flags set on each training row
df = train[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].sum(axis=1)

# Percentage of each target class within every flag-count group (each column sums to ~100).
# crosstab builds a float table directly, which avoids filling an object-dtype frame
# with zeros and then writing floats into integer columns cell by cell.
df_2 = (
    pd.crosstab(train[target], df, normalize="columns")
    .mul(100)
    .round(4)
    .rename(columns=lambda count: f"{count} wild_types")
    .rename_axis(index=None, columns=None)
)
df_2

In [None]:
print("Target distribution per amount of soil types")
# Number of soil-type flags set on each training row
df = train[[x for x in train.columns if "Soil_Type" in x]].sum(axis=1)

# Percentage of each target class within every flag-count group (each column sums to ~100).
# crosstab builds a float table directly, which avoids filling an object-dtype frame
# with zeros and then writing floats into integer columns cell by cell.
df_2 = (
    pd.crosstab(train[target], df, normalize="columns")
    .mul(100)
    .round(4)
    .rename(columns=lambda count: f"{count} soil_types")
    .rename_axis(index=None, columns=None)
)
df_2

As you can see, the proportion of some classes varies with the number of wilderness area and soil types. It's a good idea to add two new features showing the number of said types per sample.

To be continued...

# **Data preprocessing**

In [None]:
# Class 5 is removed from the training data before modelling
# (it is represented by a single row).
class_5_rows = train.index[train[target] == 5]
train.drop(index=class_5_rows, inplace=True)
train.reset_index(drop=True, inplace=True)

# Number of remaining target classes, and the encoder used later to map them to 0..N-1
NUM_CLASSES = train[target].nunique()
label_enc = LabelEncoder()

In [None]:
# Adding two new features: how many wilderness-area flags and how many
# soil-type flags are set on each row.
wild_area_cols = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']
# Resolve the soil-type columns once, before any new column is added, and use
# the same list for both frames so train and test are summed over identical inputs.
soil_type_cols = [x for x in train.columns if "Soil_Type" in x]

train["wild_areas_sum"] = train[wild_area_cols].sum(axis=1)
test["wild_areas_sum"] = test[wild_area_cols].sum(axis=1)

train["soil_types_sum"] = train[soil_type_cols].sum(axis=1)
test["soil_types_sum"] = test[soil_type_cols].sum(axis=1)

# Register the new columns as (categorical) features; the membership check keeps
# the lists free of duplicates when the cell is run more than once.
for new_feature in ("wild_areas_sum", "soil_types_sum"):
    for feature_list in (features, cat_features):
        if new_feature not in feature_list:
            feature_list.append(new_feature)

In [None]:
# Standardise every continuous feature. The scaler is re-fitted on the train
# column for each feature and the same fit is then applied to the test column.
s_scaler = StandardScaler()
for col in num_features:
    train_column = train[col].to_numpy().reshape(-1, 1)
    test_column = test[col].to_numpy().reshape(-1, 1)
    train[col] = s_scaler.fit_transform(train_column)
    test[col] = s_scaler.transform(test_column)

In [None]:
# Encoded target as a Series (fresh default index), plus independent copies
# of the feature matrices for train and test.
encoded_target = label_enc.fit_transform(train[target])
y = pd.Series(encoded_target)
X, X_test = train[features].copy(), test[features].copy()

# **Hyperparameters optimization**

Hyperparameters used in this notebook were optimized using Optuna. The code used for that is shown below. It is commented out to save runtime, as the optimization has already been done.

In [None]:
# def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
#     """
#     A function to train a model using different hyperparameter combinations provided by Optuna.
#     Loss of validation data predictions is returned to estimate hyperparameters effectiveness.
#     """
    
        
#     #A set of hyperparameters to optimize by optuna
#     cb_params = {
#              "iterations": trial.suggest_categorical('iterations', [10000]),
#              "learning_rate": trial.suggest_loguniform('learning_rate', 0.15, 1.0),
#              "loss_function": trial.suggest_categorical("loss_function", ["MultiClass"]),
#              "eval_metric": trial.suggest_categorical("eval_metric", ["Accuracy"]),
#              "l2_leaf_reg": trial.suggest_loguniform('l2_leaf_reg', 1, 100),
#              "bagging_temperature": trial.suggest_loguniform('bagging_temperature', 0.1, 20.0),
#              "random_strength": trial.suggest_float('random_strength', 1.0, 2.0, step=0.01),
#              "depth": trial.suggest_int('depth', 1, 10),
#              "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree"]),#, "Depthwise", "Lossguide"]),
#              "leaf_estimation_method": trial.suggest_categorical("leaf_estimation_method", ["Gradient"]),#, "Exact", "Newton"]),
#              "od_type": trial.suggest_categorical("od_type", ["Iter"]),
#              "early_stopping_rounds": trial.suggest_categorical("early_stopping_rounds", [100]),
#              "border_count": trial.suggest_categorical("border_count", [254]),
#              "use_best_model": trial.suggest_categorical("use_best_model", [True]),
# #              "bootstrap_type": trial.suggest_categorical('bootstrap_type', ["MVS"]),
# #              "subsample": trial.suggest_float('subsample', 0.1, 1.0, step=0.01),
# #              "sampling_frequency": trial.suggest_categorical('sampling_frequency', ["PerTree", "PerTreeLevel"]),
# #              "sampling_unit": trial.suggest_categorical('sampling_unit', ["Object", "Group"]),
#              "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 300),
# #              "rsm": trial.suggest_float('rsm', 0.05, 1, step=0.05),
        
        

# # #                  "max_leaves": trial.suggest_int('max_leaves', 1, 64),
#              "task_type": trial.suggest_categorical('task_type', ["GPU"]),
#              "random_seed": trial.suggest_categorical('random_seed', [42]),
#                 }
    




#     # Model loading and training
#     model = CatBoostClassifier(**cb_params)
#     model.fit(
#                 X_train, y_train,
#                 eval_set=(X_valid, y_valid),
#                 verbose=False,
#             )
    
#     print(f"Number of boosting rounds: {model.tree_count_}")
#     oof = model.predict(X_valid)
    
#     return accuracy_score(y_valid, oof)

In [None]:
# %%time
# # Splitting data into train and valid folds using target bins for stratification
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Setting optuna verbosity to show only warning messages
# # While the line below stays commented out, the results of each iteration are shown
# # optuna.logging.set_verbosity(optuna.logging.WARNING)

# time_limit = 3600 * 4

# study = optuna.create_study(direction='maximize')
# study.optimize(lambda trial: train_model_optuna(trial, X_train, X_valid,
#                                                     y_train, y_valid),
# #                n_trials = 2
#                timeout=time_limit
#               )

# # Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

# **Model training**

In [None]:
# CatBoost settings passed to every fold model in the training cell
cb_params = dict(
    iterations=10000,
    learning_rate=0.218904169525507,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    l2_leaf_reg=1.6163189485316596,
    bagging_temperature=0.14353551008899088,
    random_strength=1.29,
    depth=10,
    grow_policy='SymmetricTree',
    leaf_estimation_method='Gradient',
    od_type='Iter',
    early_stopping_rounds=300,
    border_count=254,
    use_best_model=True,
    min_data_in_leaf=150,
    task_type='GPU',
    random_seed=42,
)

In [None]:
%%time
# Setting up fold parameters
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Out-of-fold class predictions (one label per training row) and the
# fold-averaged class probabilities for the test set
oof_preds = np.zeros((X.shape[0],))
preds = np.zeros((X_test.shape[0], len(np.unique(y))))
model_fi = 0
total_mean_acc = 0

# Generating folds, then training and predicting once per fold
for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # split() yields positional indices, so rows are selected by position
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # NOTE(review): the validation fold is also the eval_set, and cb_params sets
    # use_best_model / early stopping, so the fold scores below are likely optimistic.
    model = CatBoostClassifier(**cb_params)
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=(X_valid, y_valid),
              )

    # Accumulating mean test data predictions (i.e. divided by the number of splits)
    preds += model.predict_proba(X_test) / splits

    # Accumulating mean feature importances (i.e. divided by the number of splits)
    model_fi += model.feature_importances_ / splits

    # Each fold model predicts only the rows it was not trained on, so after the
    # last fold every training row holds a prediction from a model that never saw it.
    oof_preds[valid_idx] = model.predict(X_valid).flatten()

    # Getting score for a fold model
    fold_acc = accuracy_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} accuracy: {fold_acc}")

    # Accumulating the mean score of all fold models (i.e. divided by the number of splits)
    total_mean_acc += fold_acc / splits

# The metric averaged above is accuracy, so report it under that name
print(f"\nOverall accuracy: {total_mean_acc}")

# **Feature importances**

In [None]:
# Table of fold-averaged feature importances, normalised to sum to 1
# and ordered from most to least important (used by the plot below)
df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model_fi / model_fi.sum(),
})
df = df.sort_values("Importance", axis=0, ascending=False)

In [None]:
# Horizontal bar chart of the normalised feature importances
feature_names = df["Feature"]

fig, ax = plt.subplots(figsize=(13, 30))
bars = ax.barh(feature_names, df["Importance"], height=0.4,
               color="mediumorchid", edgecolor="black")

ax.set_title("Feature importances", fontsize=30, pad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_ylabel("Feature name", fontsize=20, labelpad=15)
ax.set_yticks(feature_names)
ax.set_yticklabels(feature_names, fontsize=13)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")

# Mirror the x axis along the top edge so values can be read at both ends of the tall figure
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=13)
ax2.tick_params(axis="x", labelsize=15)
ax.margins(0.05, 0.01)

# Flip the y axis so the first (largest) entry is drawn at the top
ax.invert_yaxis()

# **Predictions submission**

In [None]:
# Pick the most probable class per test row and map it back to the original labels
predicted_labels = label_enc.inverse_transform(preds.argmax(axis=1))
predictions = pd.DataFrame({
    "Id": test["Id"],
    "Cover_Type": predicted_labels,
})

predictions.to_csv('submission.csv', index=False, header=predictions.columns)
predictions.head()

In [None]:
predictions["Cover_Type"].value_counts()

In [None]:
# Distribution of the predicted classes on the test set
predicted_counts = predictions["Cover_Type"].value_counts().sort_index()
# Shares are relative to the number of predictions (not the size of the train set)
predicted_shares = predicted_counts.values / len(predictions)

fig, ax = plt.subplots(figsize=(14, 8))

bars = ax.bar(predicted_counts.index,
              predicted_counts.values,
              color=colors,
              edgecolor="black")
ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_ylabel("Count", fontsize=14, labelpad=15)
ax.set_xlabel("Target label", fontsize=14, labelpad=20)
ax.tick_params(axis="x", pad=20)
# Absolute count above each bar, percentage just below the bar top
ax.bar_label(bars, predicted_counts.values,
             padding=3, fontsize=12)
ax.bar_label(bars, [f"{x*100:2.1f}%" for x in predicted_shares],
             padding=-20, fontsize=12)
ax.margins(0.025, 0.06)
ax.grid(axis="y")

plt.show();