In [None]:
# Upgrade scikit-learn and scikit-learn-intelex via pip (-U = upgrade).
# `>> z_pip.log` appends pip's stdout to z_pip.log instead of printing it in the cell.
# NOTE(review): no versions are pinned, so the installed releases depend on when this runs.
!pip install -U scikit-learn scikit-learn-intelex >> z_pip.log

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply the sklearnex (scikit-learn-intelex) patch to scikit-learn.
# NOTE(review): presumably this has to run before any sklearn estimator is imported,
# which would explain why the sklearn imports live in later cells -- confirm against
# the sklearnex documentation before moving imports around.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Read the three competition CSVs: training set, test set and sample submission
train = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')
ss = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')

# Target column, then the feature matrix (everything except the target and the Id)
y = train['Cover_Type']
X = train.drop(columns = ['Cover_Type', 'Id'])

# Test features: only the Id column is removed
X_test = test.drop(columns = ['Id'])

# Preview the last rows of the training features
X.tail()

In [None]:
%%time
# Perform the usual logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features so the fitted coefficients are on a comparable
# scale across columns.
scaler = StandardScaler()

X_s = scaler.fit_transform(X)

# Unpenalised logistic regression fitted on the full (scaled) training set.
# `penalty=None` replaces the string 'none', which was deprecated in
# scikit-learn 1.2 and removed in 1.4. The first cell upgrades scikit-learn to
# the latest release, so the old spelling fails parameter validation.
regr = LogisticRegression(penalty = None)
regr.fit(X_s, y)

In [None]:
# Show the shape of the fitted coefficient matrix
# (rows are indexed by class, columns follow the feature columns of X)
regr.coef_.shape

In [None]:
# Perform bootstrapping, but using subsampling with replacement
# so each model can be fitted very quickly
from sklearn.utils import resample
from tqdm.auto import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.base import clone

# A single shared generator: every resample() call advances it, so each draw
# differs while the whole run stays reproducible.
RNG = np.random.RandomState(19)
N_BOOTSTRAP = 25_000   # number of bootstrap refits
N_SAMPLE = 50_000      # rows drawn per refit

# One list of coefficient vectors per class label. A class can be missing from
# a given subsample, so the lists may end up with different lengths.
BOOTSTRAP_COEFS = {class_: [] for class_ in y.unique()}

# Refit an unfitted copy with the same hyper-parameters, so that `regr` keeps
# the full-data fit instead of ending up as the last 50k-row bootstrap model.
boot_regr = clone(regr)

with warnings.catch_warnings():
    # The quick refits are allowed to stop before convergence; silence the noise.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    for _ in tqdm(range(N_BOOTSTRAP)):
        X_b, y_b = resample(X_s, y, n_samples = N_SAMPLE, random_state = RNG)
        boot_regr.fit(X_b, y_b)
        # Row i of coef_ is stored under the i-th entry of classes_ for this fit.
        for i, class_ in enumerate(boot_regr.classes_):
            BOOTSTRAP_COEFS[class_].append(boot_regr.coef_[i, :])

In [None]:
# Stack each class's list of coefficient vectors into one 2-D array
# (one row per bootstrap fit in which the class was present).
# A class that never appeared in any subsample has an empty list, which
# np.stack rejects with a ValueError, so such classes are left out.
stacked_coefs = {
    class_: np.stack(coefs)
    for class_, coefs in BOOTSTRAP_COEFS.items()
    if coefs
}

In [None]:
from matplotlib import ticker

# One panel per class that has bootstrap coefficients. The panel count comes
# from the data rather than a hard-coded 7, and squeeze=False keeps `axs`
# two-dimensional even when there is a single panel.
n_panels = len(stacked_coefs)
fig, axs = plt.subplots(1, n_panels, figsize = (15, 25), squeeze = False)

for i, (class_, coefs) in enumerate(stacked_coefs.items()):
    ax = axs[0, i]
    ax.set_axisbelow(True)
    ax.xaxis.grid(color='gray', linestyle='dashed')
    # Red reference line at zero, drawn behind the boxes
    ax.axvline(0, color = 'red', zorder = -1)

    # Distribution of every feature's coefficient across the bootstrap fits
    sns.boxenplot(data = pd.DataFrame(coefs, columns = X.columns),
                  showfliers = False, orient = 'h', ax = ax)

    # Label the panel: without a title the class it shows cannot be identified
    ax.set_title(f'Cover_Type = {class_}')

    # Aim for about five major ticks, with the step rounded to a multiple of 0.5.
    # For a narrow coefficient range the rounded step is 0, which
    # MultipleLocator cannot use; keep matplotlib's default locator then.
    tick_range = (coefs.max() - coefs.min())
    tick_stepsize = np.round(tick_range / 5 * 2) / 2
    if tick_stepsize > 0:
        ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_stepsize))
    ax.tick_params(axis="x", bottom=True, top=True, labelbottom=True, labeltop=True)

    # Feature names are shown on the left-most panel only
    if i != 0:
        ax.yaxis.set_visible(False)

plt.tight_layout()
plt.show()