In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import time
import random
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

# Widen pandas' display limits and print floats with five decimals
display_options = {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.float_format': '{:.5f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data import**

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
# Tip: pass nrows=10000 to the train read for a quick experimental run.
train = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/train.csv", low_memory=False)#, nrows=10000)
test = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/test.csv", low_memory=False)
# Column dtypes, non-null counts and deep memory usage of the train set
train.info(memory_usage="deep")

In [None]:
# Column dtypes, non-null counts and deep memory usage of the test set
test.info(memory_usage="deep")

In [None]:
# Colors to be used for plots (shared palette; the histogram helpers pick from it at random).
# NOTE: "sandybrown" is listed twice, so the palette holds 14 distinct colors in 15 slots.
colors = ["lightcoral", "sandybrown", "darkorange", "mediumseagreen",
          "lightseagreen", "cornflowerblue", "mediumpurple", "palevioletred",
          "lightskyblue", "sandybrown", "yellowgreen", "indianred",
          "lightsteelblue", "mediumorchid", "deepskyblue"]

In [None]:
# Bar chart (absolute counts plus share of rows) and pie chart of the target classes.
# Counts are computed once; the share is derived from the actual number of rows
# rather than a hard-coded dataset size, so it stays correct on a subsample.
target_counts = train["target"].value_counts().sort_index()
target_pct = 100 * target_counts.values / target_counts.sum()

fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(18, 8), gridspec_kw={'width_ratios': [2, 1]})

bars = axs[0].bar(target_counts.index,
                  target_counts.values,
                  color=colors,
                  edgecolor="black")
axs[0].set_title("Target distribution", fontsize=20, pad=15)
axs[0].set_ylabel("Count", fontsize=14, labelpad=15)
axs[0].set_xlabel("Target label", fontsize=14, labelpad=10)
# Absolute count above each bar ...
axs[0].bar_label(bars, target_counts.values,
                 padding=3, fontsize=12)
# ... and the percentage just inside the top of the bar
axs[0].bar_label(bars, [f"{x:2.1f}%" for x in target_pct],
                 padding=-20, fontsize=12)
axs[0].margins(0.025, 0.06)
axs[0].grid(axis="y")

pie = axs[1].pie(target_counts.values,
                 labels=target_counts.index,
                 colors=colors,
                 rotatelabels=True,
                 textprops={"fontsize": 14})
axs[1].axis("equal")
plt.show();

Let's check if these classes are evenly distributed in the dataset.

In [None]:
# Count how many rows of each class fall into consecutive buckets of ids, to see
# whether the classes are spread evenly over the file.
BIN_SIZE = 1000  # number of consecutive ids per bucket

# Bucket edges cover every id present: [0, 1000), [1000, 2000), ...
id_edges = np.arange(0, (train["id"].max() // BIN_SIZE + 2) * BIN_SIZE, BIN_SIZE)
n_bins = len(id_edges) - 1
id_bin = pd.cut(train["id"], id_edges, right=False, labels=False)

# Rows = bucket number, columns = class label. Reindexing keeps empty buckets as
# zeros, so no positional stride over a flat array is needed.
class_counts = pd.crosstab(id_bin, train["target"]).reindex(np.arange(n_bins), fill_value=0)
# Titles come from the labels actually present in the data
classes = list(class_counts.columns)

cols = 3
rows = int(np.ceil(len(classes) / cols))
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,rows*4), sharey=False, squeeze=False)

plt.subplots_adjust(hspace = 0.35)

for i, ax in enumerate(axs.ravel()):
    if i >= len(classes):
        # Hide grid cells left over when the class count is not a multiple of `cols`
        ax.set_visible(False)
        continue
    ax.plot(class_counts.index,
            class_counts[classes[i]].values,
            color=colors[i % len(colors)])
    ax.set_title(str(classes[i]), fontsize=12, pad=5)
    ax.set_xticks(np.arange(0, n_bins + 1, 50))
    # Tick positions are bucket numbers; convert them to thousands of ids for the label
    ax.set_xticklabels([str(int(x * BIN_SIZE / 1000))+"k" for x in ax.get_xticks()])
    ax.set_xlabel("Dataframe id")
    ax.set_ylabel("Class labels qty per 1k rows")
    ax.set_ylim(0, 320)
fig.suptitle("Class labels distribution in the train dataset", fontsize=20)
plt.show();

So it looks like class labels are distributed pretty evenly across the dataset.

## Original features value distribution in the train dataset

In [None]:
def make_data_plots(df, i=0):
    """
    Makes value distribution histogram plots for a given dataframe features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; "target" and "id" are skipped when present.
    i : int, default 0
        Position of the first feature column to plot (earlier columns are skipped).

    Histograms are laid out three per row with a shared y axis; each bar color is
    picked at random from the module-level `colors` palette.
    """
    columns = df.drop(["target", "id"], axis=1, errors="ignore").columns.values[i:]

    cols = 3
    # Ceiling division: exactly as many rows as needed (at least one)
    rows = max(1, -(-len(columns) // cols))

    # squeeze=False keeps `axs` two-dimensional even for a single row
    fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,rows*4), sharey=True, squeeze=False)
    plt.subplots_adjust(hspace = 0.2)
    axes = axs.ravel()
    for ax, column in zip(axes, columns):
        ax.hist(df[column].values, bins = 30,
                color=random.choice(colors),
                edgecolor="black")
        ax.set_title(column, fontsize=12, pad=5)
    # Hide the unused cells of the last row
    for ax in axes[len(columns):]:
        ax.set_visible(False)

In [None]:
# Histogram of every feature column of the train set (zeros included)
make_data_plots(train)

## Original features nonzero value distribution in the train dataset

In [None]:
def make_nonzero_data_plots(df, i=0):
    """
    Makes nonzero value distribution histogram plots for a given dataframe features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; "target" and "id" are skipped when present.
    i : int, default 0
        Position of the first feature column to plot (earlier columns are skipped).

    Only values strictly greater than zero enter each histogram. Plots are laid out
    three per row with independent y axes; each bar color is picked at random from
    the module-level `colors` palette.
    """
    columns = df.drop(["target", "id"], axis=1, errors="ignore").columns.values[i:]

    cols = 3
    # Ceiling division: exactly as many rows as needed (at least one)
    rows = max(1, -(-len(columns) // cols))

    # squeeze=False keeps `axs` two-dimensional even for a single row
    fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,rows*4), sharey=False, squeeze=False)
    plt.subplots_adjust(hspace = 0.2)
    axes = axs.ravel()
    for ax, column in zip(axes, columns):
        ax.hist(df[df[column] > 0][column].values, bins = 30,
                color=random.choice(colors),
                edgecolor="black")
        ax.set_title(column, fontsize=12, pad=5)
    # Hide the unused cells of the last row
    for ax in axes[len(columns):]:
        ax.set_visible(False)

In [None]:
# Same histograms as above, restricted to values greater than zero
make_nonzero_data_plots(train)

## Fraction of nonzero values in both datasets

In [None]:
# Share of rows holding a nonzero value, per feature, for train and test side by side
feature_names = list(test.drop(["id"], axis=1).columns.values)
train_nonzero_share = train.drop(["id", "target"], axis=1).astype(bool).sum(axis=0).values / train.shape[0]
test_nonzero_share = test.drop(["id"], axis=1).astype(bool).sum(axis=0).values / test.shape[0]
share_ticks = np.arange(0, 0.8, 0.05)

# Negative positions so that the first feature ends up at the top of the chart
x = -1*np.arange(len(feature_names))

fig, ax = plt.subplots(figsize=(16, 30))
bars = ax.barh(x+0.2, train_nonzero_share,
               height=0.4, color="cornflowerblue", label="Train dataset", edgecolor="black")
bars2 = ax.barh(x-0.2, test_nonzero_share,
                height=0.4, color="palevioletred", label="Test dataset", edgecolor="black")

ax.set_title("Fraction of nonzero values in the both datasets", fontsize=30, pad=15)
ax.set_xlabel("Fraction of nonzero values", fontsize=20, labelpad=15)
ax.set_ylabel("Feature names", fontsize=20, labelpad=15)
ax.set_xticks(share_ticks)
ax.set_yticks(x)
ax.set_yticklabels(feature_names)
ax.tick_params(axis="x", labelsize=15)
ax.tick_params(axis="y", labelsize=14)
ax.grid(axis="x")
ax.legend(fontsize=15)

# Repeat the x axis on top: the figure is tall, so the bottom scale is far away
ax2 = ax.secondary_xaxis('top')
ax2.set_xticks(share_ticks)
ax2.set_xlabel("Fraction of nonzero values", fontsize=20, labelpad=15)
ax2.tick_params(axis="x", labelsize=15)
plt.margins(0.05, 0.01)

## PCA

In [None]:
# Project the raw features onto their first two principal components.
# random_state is fixed so the projection is identical on every run even when
# scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_reduced = pd.DataFrame(pca.fit_transform(train.drop(["id", "target"], axis=1)))
# Integer class codes are only used to color the points
target_codes = LabelEncoder().fit_transform(train["target"])

fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(X_reduced[0], X_reduced[1], c=target_codes, cmap="tab10")
ax.set_title("Scatter plot of the original train set reduced to 2 dimensions", fontsize=20, pad=15)
ax.set_xlabel("Principal component 1", fontsize=14)
ax.set_ylabel("Principal component 2", fontsize=14)
plt.show();

In [None]:
# Same projection after standardizing every feature to zero mean / unit variance,
# so that no single feature dominates the components through its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train.drop(["id", "target"], axis=1))

# random_state is fixed so the projection is identical on every run even when
# scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_reduced = pd.DataFrame(pca.fit_transform(X_scaled))
# Integer class codes are only used to color the points
target_codes = LabelEncoder().fit_transform(train["target"])

fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(X_reduced[0], X_reduced[1], c=target_codes, cmap="tab10")
ax.set_title("Scatter plot of the scaled train set reduced to 2 dimensions", fontsize=20, pad=15)
ax.set_xlabel("Principal component 1", fontsize=14)
ax.set_ylabel("Principal component 2", fontsize=14)
plt.show();

As you can see, the data does not have distinct clusters after reducing dimensions.

## Detecting anomalies with IsolationForest

In [None]:
# iso_forest = IsolationForest(n_jobs=-1, random_state=42, n_estimators=3000)
# iso_forest.fit(train.drop(["id", "target"], axis=1))
# scores = iso_forest.decision_function(train.drop(["id", "target"], axis=1))
# to_drop = train.loc[scores < 0].index
# print(f"Anomalies found in the train dataset: {(scores < 0).sum()}")

In [None]:
# # Target distribution in detected anomalies
# train.loc[(scores < 0), "target"].value_counts()

## Detecting anomalies with LocalOutlierFactor

In [None]:
# Score every train row with LocalOutlierFactor on standardized features;
# rows scoring below the threshold are treated as anomalies.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train.drop(columns=["id", "target"]))

lof = LocalOutlierFactor(n_jobs=-1)
lof.fit(X_scaled)
scores = lof.negative_outlier_factor_

score_threshold = -1.93
is_anomaly = scores < score_threshold
# Index labels of the anomalous rows, consumed by the drop step further down
to_drop = train.index[is_anomaly]
print(f"Anomalies found in the train dataset: {is_anomaly.sum()}")

In [None]:
# Target distribution among the rows flagged as anomalies (LOF score below the threshold)
train.loc[(scores < score_threshold), "target"].value_counts()

# **Data preparation**

In [None]:
# Dropping anomaly rows detected with LocalOutlierFactor (index labels collected in `to_drop`)
# NOTE: the drop is in place, so re-running this cell without reloading `train`
# raises KeyError because the labels are already gone.
train.drop(axis=0, index=set(to_drop), inplace=True)
train.shape

In [None]:
# Number of rows whose feature values (everything except id and target) occur more than once
train.drop(["id", "target"], axis=1).duplicated(keep=False).sum()

There are some duplicates in the dataset. If rows have identical feature values but different target values, they may hurt machine learning performance.

In [None]:
def delete_duplicates(df):
    """
    Finds duplicates in a given DataFrame and deletes rows with identical features values but different target.

    Rows are compared on every column except "id" and "target". Every row that
    belongs to a group of identical feature values containing more than one
    distinct target is removed; duplicates that agree on the target are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "target" column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, for call chaining.
    """
    feature_cols = [x for x in df.columns if x not in ("id", "target")]

    # All rows whose feature values occur more than once
    duplicates = df.loc[df.duplicated(subset=feature_cols, keep=False)]

    if duplicates.empty:
        conflicting = duplicates.index
    else:
        # Distinct targets per group of identical feature values, broadcast back to
        # each row. This replaces a pairwise Python loop and works on index labels,
        # so the index does not have to coincide with the "id" column.
        # NOTE(review): assumes features hold no missing values - groupby skips NaN keys.
        n_targets = duplicates.groupby(feature_cols, sort=False)["target"].transform("nunique")
        conflicting = duplicates.index[(n_targets > 1).to_numpy()]

    # Reporting results
    print(f"There are {len(duplicates)} duplicated rows in the dataset.")
    print(f"{len(conflicting)} of them have different target. They will be deleted from the dataset.")
    print(f"The datatframe has {len(df)} rows.")
    df.drop(axis=0, index=conflicting, inplace=True)
    print(f"After duplicated deletion there are {len(df)} rows.")

    return df

In [None]:
# Remove duplicated-feature rows whose targets disagree (also prints a short report)
train = delete_duplicates(train)

In [None]:
# Encode the target labels as integers; `encoder` keeps the label <-> code mapping
encoder = LabelEncoder()
encoder.fit(train["target"])
train["target"] = encoder.transform(train["target"])
# Rows per encoded class, in order of appearance
train["target"].value_counts(sort=False)

In [None]:
def transform_dataset(data):
    """
    Adds new custom features and transforms original features into custom categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the original "feature_*" columns (other columns such as "id"
        or "target" are passed through untouched). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with
        * feature_75..feature_82: row-wise max, mean, median, number of unique
          values, count of zeros, count of nonzeros, and the last two counts
          divided by the number of original features;
        * feature_83, feature_84: row-wise mean of the min-max scaled features and
          the number of features sitting at their column maximum (scaled value 1);
        * every original feature replaced by the ordinal number of the interval
          [0, 1), [1, 15), [15, 30), ... [345, 360) it falls into.
    """
    # Work on a copy so the caller's frame stays untouched and a repeated call
    # always starts from the raw values
    data = data.copy()

    # Copying features in a temporary dataset which will be transformed with MinMaxScaler
    original_cols = [x for x in data.columns if "feature_" in x]
    df = data[original_cols].copy()
    n_features = df.shape[1]

    zero_count = (df == 0).sum(axis=1)
    nonzero_count = (df != 0).sum(axis=1)

    # Adding custom features
    data["feature_75"] = df.max(axis=1)
    data["feature_76"] = df.mean(axis=1)
    data["feature_77"] = df.median(axis=1)
    data["feature_78"] = df.nunique(axis=1)
    data["feature_79"] = zero_count
    data["feature_80"] = nonzero_count
    data["feature_81"] = zero_count / n_features
    data["feature_82"] = nonzero_count / n_features

    # Scaling original features and adding new features basing on them.
    # MinMaxScaler scales each column independently, so one fit covers the whole frame.
    # NOTE: the scaler is fitted on whatever frame is passed in.
    scaled_df = pd.DataFrame(MinMaxScaler().fit_transform(df),
                             index=data.index, columns=df.columns)
    data["feature_83"] = scaled_df.mean(axis=1)
    data["feature_84"] = (scaled_df == 1).sum(axis=1)

    # Cutting original features into custom intevals [0, 1), [1, 15), [15, 30) ...
    # labels=False returns the interval's ordinal number directly, with no detour
    # through text labels built from another dataset. Values outside [0, 360) become NaN.
    intervals = np.insert(np.arange(15, 370, 15), 0, [0, 1])
    for column in original_cols:
        data[column] = pd.cut(data[column], intervals, right=False, labels=False)

    return data

In [None]:
# Transforming the train dataset and making value distribution plots for new custom features.
# A copy is passed so that `train` itself keeps the raw feature values;
# i=75 skips the 75 original features so that only the custom ones are plotted.
X_train = transform_dataset(train.copy())
make_data_plots(X_train, i=75)

# **Machine Learning**

In [None]:
# Build the model inputs.
# drop() returns a new frame and ignores already-missing columns, so this cell can be re-run safely.
X_train = X_train.drop(["id", "target"], axis=1, errors="ignore")
y_train = train["target"]
# Transform a copy so the raw `test` frame is never modified (its "id" column is
# needed later for the submission, and a re-run must start from raw features).
X_test = transform_dataset(test.copy()).drop("id", axis=1)

## Hyperparameters tuning using Optuna

The code below is commented out in order to save runtime.

In [None]:
# def train_optuna_cb(trial, X=X_train, y=y_train):
#     """
#     A function to train a model using different hyperparamerters combinations provided by Optuna. 
#     Log loss of validation data predictions is returned to estimate hyperparameters effectiveness.
#     """
#     cat_features = [x for x in X.columns[:75]]
#     preds = 0
#     split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
#     for train_idx, valid_idx in split.split(X, y):
#         X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
#         y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
#         # A set of hyperparameters to optimize by optuna
#         cb_params = {
#                  "iterations": trial.suggest_int('iterations', 1000, 10000),
#                  "learning_rate": trial.suggest_float('learning_rate', 0.001, 1.0),
#                  "depth": trial.suggest_int('depth', 1, 6),
#                  "loss_function": 'MultiClass',
#                  "eval_metric": 'MultiClass',
#                  "leaf_estimation_method": trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),#, "Exact"]),
#                  "od_type": "Iter",
#                  "early_stopping_rounds": 500,
#                  "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 0.00001, 10),
#                  "random_strength": trial.suggest_float('random_strength', 1.0, 2.0),
#                  "bagging_temperature": trial.suggest_float('bagging_temperature', 0.0, 10.0),
#                  "border_count": 254,
#                  "use_best_model": trial.suggest_categorical("use_best_model", [False, True]),
#                  "grow_policy": "SymmetricTree",#trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),
# #                  "max_leaves": trial.suggest_int('max_leaves', 1, 64),
#                  "task_type": "GPU",
#                     }
            
#         model = CatBoostClassifier(**cb_params)
#         model.fit(
#                     X_train, y_train,
#                     eval_set=(X_valid, y_valid),
#                     verbose=False,
#                     cat_features=cat_features
#                 )

#         oof = model.predict_proba(X_valid)
    
#     return log_loss(y_valid, oof)

In [None]:
# %%time
# Creating Optuna object and defining its parameters
# study = optuna.create_study(direction='minimize')
# study.optimize(train_optuna_cb, n_trials = 200)

# Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

In [None]:
# def train_optuna_lgbm(trial, X=X_train, y=y_train):
#     """
#     A function to train a model using different hyperparamerters combinations provided by Optuna. 
#     Log loss of validation data predictions is returned to estimate hyperparameters effectiveness.
#     """    
#     preds = 0
#     split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
#     for train_idx, valid_idx in split.split(X, y):
#         X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
#         y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

#         # A set of hyperparameters to optimize by optuna
#         lgbm_params = {
#             'objective': 'multiclass',
#             'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 20.0),
#             'reg_lambda': trial.suggest_float('reg_lambda', 0.00001, 0.1),
#             'subsample': trial.suggest_float('subsample', 0.1, 1.0),
#             'num_leaves': trial.suggest_int('num_leaves', 2, 20),
#             'min_child_samples': trial.suggest_int('min_child_samples', 5, 40),
#             'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
#             'max_depth': trial.suggest_int('max_depth', 1, 30),
#             'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
#             'n_estimators': 20000,
#             'random_state': 42,
#             'boosting_type': 'gbdt',
#             'metric': 'multi_logloss',
#             'num_class': 9,
#             'device': 'GPU'
#         }
        
#         model = LGBMClassifier(**lgbm_params)
#         model.fit(
#                     X_train, y_train,
#                     eval_set=(X_valid, y_valid),
#                     eval_metric='multi_logloss',
#                     early_stopping_rounds=500,
#                     verbose=False,
# #                     categorical_feature=cat_features
#                 )

#         oof = model.predict_proba(X_valid)
    
#     return log_loss(y_valid, oof)

In [None]:
# %%time
# Defining and running Optuna using the function above
# study = optuna.create_study(direction='minimize')
# study.optimize(train_optuna_lgbm, n_trials = 60)

# Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)
# print('Best value:', study.best_value)

## Models training using optimized hyperparameters

In [None]:
def train_with_folds(X, y, X_test, models, splits=10):
    """
    Trains every model with stratified K-fold cross-validation and blends their test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; the first 75 columns are passed to CatBoost / LightGBM
        as categorical features.
    y : pandas.Series
        Integer-encoded target, positionally aligned with `X`.
    X_test : pandas.DataFrame
        Test features with the same columns as `X`.
    models : list
        Estimators exposing fit / predict_proba / feature_importances_.
    splits : int, default 10
        Number of folds.

    Returns
    -------
    preds : numpy.ndarray
        Test class probabilities averaged over all folds of all models.
    fi_df : pandas.DataFrame
        One column per model class name with its fold-averaged feature importances.
    """
    cat_features = list(X.columns[:75])
    n_classes = y.nunique()
    preds = 0
    fi_df = pd.DataFrame(columns=[x.__class__.__name__ for x in models])
    for model in models:
        model_name = model.__class__.__name__
        skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
        # Out-of-fold buffer sized from the data actually passed in, not from a global frame
        oof = np.zeros((X.shape[0], n_classes))
        feature_importances = 0
        for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            if "CatBoost" in model_name:
                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          verbose=False,
                          cat_features=cat_features)
            elif "LGBM" in model_name:
                # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
                # were removed in LightGBM 4.0 (callbacks replace them) - confirm the
                # installed version before upgrading.
                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          eval_metric='multi_logloss',
                          early_stopping_rounds=400,
                          verbose=False,
                          categorical_feature=cat_features)
            else:
                model.fit(X_train, y_train)

            oof[valid_idx] = model.predict_proba(X_valid)
            # Each fold of each model contributes an equal share to the blended prediction
            preds += model.predict_proba(X_test) / (splits*len(models))
            print(f"{model_name} fold {num} logloss: {log_loss(y_valid, oof[valid_idx])}")
            feature_importances += model.feature_importances_ / splits
        fi_df[str(model_name)] = feature_importances
        print(f"\n{model_name} overall logloss: {log_loss(y, oof)}\n")

    return preds, fi_df

In [None]:
%%time

# Estimators blended by train_with_folds: one LightGBM and one CatBoost classifier,
# both configured to run on GPU.
# NOTE(review): the long decimal hyperparameters presumably come from the Optuna
# searches in the commented-out cells - confirm before re-tuning.
models = [LGBMClassifier(objective="multiclass",
                         n_estimators=10000,
                         boosting_type="gbdt",
                         random_state=42,
                         learning_rate=0.00786559751278979,
                         max_depth=28,
                         num_leaves=19,
                         subsample=0.7548376269285053,
                         subsample_freq=4,
                         colsample_bytree=0.10277352165216944,
                         reg_alpha=15.857914898332481,
                         reg_lambda=0.03275652415252568,
                         min_child_samples=12,
                         device="gpu"),
    
            CatBoostClassifier(random_state=42,
                               thread_count=4,
                               verbose=False,
                               iterations=5300,
                               learning_rate=0.004996686623648068,
                               grow_policy="SymmetricTree",
                               loss_function="MultiClass",
                               eval_metric="MultiClass",
                               classes_count=9,
                               od_type="Iter",
                               depth=6,
                               l2_leaf_reg=3.994384171429022,
                               random_strength=1.8493809581160419,
                               bagging_temperature=0.6721279933587145,
                               early_stopping_rounds=400,
                               task_type="GPU")
          
            
         ]

# Cross-validated training with the default 10 folds; returns the blended test
# probabilities and the per-model feature importances.
preds, feature_importances = train_with_folds(X_train, y_train, X_test, models)

# **Feature importances**

In [None]:
# Transforming different estimators' feature importances to the same scale (each column sums to 1).
# Only the per-model columns are normalised; the descriptive "Feature" / "Label"
# columns added below are skipped, so the cell can be re-run without errors.
model_cols = [col for col in feature_importances.columns if col not in ("Feature", "Label")]
for col in model_cols:
    feature_importances[col] = feature_importances[col] / feature_importances[col].sum()

feature_importances["Feature"] = X_test.columns
# Rows 0..74 are the original features, the remaining rows the engineered ones
# (.loc slicing is label based and inclusive on both ends)
feature_importances.loc[:74, "Label"] = "Original feature"
feature_importances.loc[75:, "Label"] = "Custom feature"

In [None]:
# Side-by-side importances of both models, ordered by CatBoost importance;
# custom (engineered) features are marked with a hatch pattern.
df = (feature_importances.copy()
      .sort_values("CatBoostClassifier", axis=0, ascending=False)
      .reset_index(drop=True))

x = np.arange(0, len(df["Feature"]))
height = 0.4
bar_hatches = ['|' if label == "Custom feature" else '' for label in df["Label"]]

# Custom legend elements
legend_lines = [
    Patch(facecolor="cornflowerblue", label="Original CatBoost features"),
    Patch(facecolor="cornflowerblue", hatch='|', label="Custom CatBoost features"),
    Patch(facecolor="palevioletred", label="Original LGBM features"),
    Patch(facecolor="palevioletred", hatch='|', label="Custom LGBM features"),
]

fig, ax = plt.subplots(figsize=(16, 30))
bars1 = ax.barh(x-height/2, df["CatBoostClassifier"], height=height,
                color="cornflowerblue", edgecolor="black",
                label="CatBoostClassifier", hatch=bar_hatches)
bars2 = ax.barh(x+height/2, df["LGBMClassifier"], height=height,
                color="palevioletred", edgecolor="black",
                label="LGBMClassifier", hatch=bar_hatches)

ax.set_title("Feature importances of CatBoost and LGBM models", fontsize=30, pad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_ylabel("Feature names", fontsize=20, labelpad=15)
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=15)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")

# Repeat the x axis on top of the tall figure
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax2.tick_params(axis="x", labelsize=15)

ax.legend(handles=legend_lines, fontsize=15, loc=1, bbox_to_anchor=(0, 0, 1, 0.90))
plt.margins(0.04, 0.01)
# Most important feature at the top
plt.gca().invert_yaxis()

In [None]:
# 15 most important original features for CatBoost next to the 15 original
# features with the largest share of nonzero values.
# Select by label rather than by an inclusive `.loc[:75]` slice, which also
# picked up the first custom feature.
original_fi = feature_importances[feature_importances["Label"] == "Original feature"]
df = pd.DataFrame({"Feature": original_fi["Feature"],
                   "Importance": original_fi["CatBoostClassifier"]})
df = df.sort_values("Importance", axis=0, ascending=True)
top_features = df.iloc[-15:]
top_feature_names = set(top_features["Feature"])

# Nonzero count per original feature, computed once and sorted ascending
nonzero_counts = (train.drop(["id", "target"], axis=1)
                  .astype(bool).sum(axis=0)
                  .sort_values(ascending=True))
top_nonzero = nonzero_counts.iloc[-15:]

height = 0.7

fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,8))

bars1 = axs[0].barh(top_features["Feature"], top_features["Importance"],
                    height=height,
                    color="mediumorchid",
                    edgecolor="black")
axs[0].set_title("Top 15 most important CatBoost original features", fontsize=15)
axs[0].set_xlabel("Feature importance", fontsize=15, labelpad=10)

bars2 = axs[1].barh(top_nonzero.index,
                    top_nonzero.values / train.shape[0],
                    height=height,
                    color="mediumseagreen",
                    edgecolor="black")
axs[1].set_title("Top 15 original features with the most nonzero values", fontsize=15)
axs[1].set_xlabel("Fraction of nonzero values", fontsize=15, labelpad=10)
# Mark the bars whose feature also appears in the left-hand chart
axs[1].bar_label(bars2,
                 ["Top 15 important feature" if name in top_feature_names else "" for name in top_nonzero.index],
                 padding=-175, fontsize=12, color="white", weight="bold")

fig.suptitle("Top 15 original features with the most importance for CatBoost and nonzero values", fontsize=20)
plt.show();

In [None]:
# 15 most important original features for LGBM next to the 15 original
# features with the largest share of nonzero values.
# Select by label rather than by an inclusive `.loc[:75]` slice, which also
# picked up the first custom feature.
original_fi = feature_importances[feature_importances["Label"] == "Original feature"]
df = pd.DataFrame({"Feature": original_fi["Feature"],
                   "Importance": original_fi["LGBMClassifier"]})
df = df.sort_values("Importance", axis=0, ascending=True)
top_features = df.iloc[-15:]
top_feature_names = set(top_features["Feature"])

# Nonzero count per original feature, computed once and sorted ascending
nonzero_counts = (train.drop(["id", "target"], axis=1)
                  .astype(bool).sum(axis=0)
                  .sort_values(ascending=True))
top_nonzero = nonzero_counts.iloc[-15:]

height = 0.7

fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,8))

bars1 = axs[0].barh(top_features["Feature"], top_features["Importance"],
                    height=height,
                    color="mediumorchid",
                    edgecolor="black")
axs[0].set_title("Top 15 most important LGBM original features", fontsize=15)
axs[0].set_xlabel("Feature importance", fontsize=15, labelpad=10)

bars2 = axs[1].barh(top_nonzero.index,
                    top_nonzero.values / train.shape[0],
                    height=height,
                    color="mediumseagreen",
                    edgecolor="black")
axs[1].set_title("Top 15 original features with the most nonzero values", fontsize=15)
axs[1].set_xlabel("Fraction of nonzero values", fontsize=15, labelpad=10)
# Mark the bars whose feature also appears in the left-hand chart
axs[1].bar_label(bars2,
                 ["Top 15 important feature" if name in top_feature_names else "" for name in top_nonzero.index],
                 padding=-175, fontsize=12, color="white", weight="bold")

fig.suptitle("Top 15 original features with the most importance for LGBM and nonzero values", fontsize=20)
plt.show();

In [None]:
# 15 least important original features for CatBoost next to the 15 original
# features with the smallest share of nonzero values.
# Select by label rather than by an inclusive `.loc[:75]` slice, which also
# picked up the first custom feature. The shared `colors` palette is no longer
# overwritten here (the former list was never used).
original_fi = feature_importances[feature_importances["Label"] == "Original feature"]
df = pd.DataFrame({"Feature": original_fi["Feature"],
                   "Importance": original_fi["CatBoostClassifier"]})
# Descending order: the last 15 rows are the least important features
df = df.sort_values("Importance", axis=0, ascending=False)
least_features = df.iloc[-15:]
least_feature_names = set(least_features["Feature"])

# Nonzero count per original feature, computed once and sorted descending
nonzero_counts = (train.drop(["id", "target"], axis=1)
                  .astype(bool).sum(axis=0)
                  .sort_values(ascending=False))
least_nonzero = nonzero_counts.iloc[-15:]

height = 0.7

fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,8))

bars1 = axs[0].barh(least_features["Feature"], least_features["Importance"],
                    height=height,
                    color="mediumorchid",
                    edgecolor="black")
axs[0].set_title("Top 15 least important CatBoost original features", fontsize=15)
axs[0].set_xlabel("Feature importance", fontsize=15, labelpad=10)

bars2 = axs[1].barh(least_nonzero.index,
                    least_nonzero.values / train.shape[0],
                    height=height,
                    color="lightcoral",
                    edgecolor="black")
axs[1].set_title("Top 15 original features with the least nonzero values", fontsize=15)
axs[1].set_xlabel("Fraction of nonzero values", fontsize=15, labelpad=10)
# Mark the bars whose feature also appears in the left-hand chart
axs[1].bar_label(bars2,
                 ["Top 15 least important feature" if name in least_feature_names else "" for name in least_nonzero.index],
                 padding=-215, fontsize=12, color="white", weight="bold")

fig.suptitle("Top 15 original features with the least importance for CatBoost and nonzero values", fontsize=20)
plt.show();

In [None]:
# 15 least important original features for LGBM next to the 15 original
# features with the smallest share of nonzero values.
# Select by label rather than by an inclusive `.loc[:75]` slice, which also
# picked up the first custom feature. The shared `colors` palette is no longer
# overwritten here (the former list was never used).
original_fi = feature_importances[feature_importances["Label"] == "Original feature"]
df = pd.DataFrame({"Feature": original_fi["Feature"],
                   "Importance": original_fi["LGBMClassifier"]})
# Descending order: the last 15 rows are the least important features
df = df.sort_values("Importance", axis=0, ascending=False)
least_features = df.iloc[-15:]
least_feature_names = set(least_features["Feature"])

# Nonzero count per original feature, computed once and sorted descending
nonzero_counts = (train.drop(["id", "target"], axis=1)
                  .astype(bool).sum(axis=0)
                  .sort_values(ascending=False))
least_nonzero = nonzero_counts.iloc[-15:]

height = 0.7

fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(16,8))

bars1 = axs[0].barh(least_features["Feature"], least_features["Importance"],
                    height=height,
                    color="mediumorchid",
                    edgecolor="black")
axs[0].set_title("Top 15 least important LGBM original features", fontsize=15)
axs[0].set_xlabel("Feature importance", fontsize=15, labelpad=10)

bars2 = axs[1].barh(least_nonzero.index,
                    least_nonzero.values / train.shape[0],
                    height=height,
                    color="lightcoral",
                    edgecolor="black")
axs[1].set_title("Top 15 original features with the least nonzero values", fontsize=15)
axs[1].set_xlabel("Fraction of nonzero values", fontsize=15, labelpad=10)
# Mark the bars whose feature also appears in the left-hand chart
axs[1].bar_label(bars2,
                 ["Top 15 least important feature" if name in least_feature_names else "" for name in least_nonzero.index],
                 padding=-215, fontsize=12, color="white", weight="bold")

fig.suptitle("Top 15 original features with the least importance for LGBM and nonzero values", fontsize=20)
plt.show();

# **Predictions submission**

In [None]:
# Assemble the submission: the test ids followed by one probability column per class
class_columns = ["Class_" + str(x) for x in np.arange(1, 10, 1)]
class_probabilities = pd.DataFrame(preds, columns=class_columns)
predictions = pd.concat([test[["id"]], class_probabilities], axis=1)
predictions.to_csv('submission.csv', index=False)
predictions.head()