In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import time


from numpy import mean, std, asarray, vstack, hstack
from matplotlib import pyplot

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import mode

from tqdm.notebook import tqdm


from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

import warnings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)
warnings.filterwarnings('ignore')

## COMPETITION GOAL
For the February 2022 Tabular Playground Series competition, your task is to classify 10 different bacteria species using data from a genomic analysis technique that has some data compression and data loss. In this technique, 10-mer snippets of DNA are sampled and analyzed to give the histogram of base count. 

## ML PROBLEM
- multiclass classification

## METRIC
- Accuracy - Accuracy is the quintessential classification metric. Easily suited for binary as well as a multiclass classification problem. Accuracy = (TP+TN)/(TP+FP+FN+TN). Accuracy is the proportion of true results among the total number of cases examined.

## The dataset contains 10 types of bacteria.

Source (descriptions and pictures): Wikipedia

**Bacteroides fragilis** - https://en.wikipedia.org/wiki/Bacteroides_fragilis
Bacteroides fragilis is an anaerobic, Gram-negative, pleomorphic to rod-shaped bacterium. It is part of the normal microbiota of the human colon and is generally commensal, but can cause infection if displaced into the bloodstream or surrounding tissue following surgery, disease, or trauma.

![Bacteroides fragilis](https://i.ibb.co/vx37m7N/Bacteroides-Fragilis-Gram.jpg)

The B. fragilis group is the most commonly isolated group of Bacteroidaceae in anaerobic infections, especially those that originate from the gastrointestinal microbiota, and B. fragilis is the most prevalent organism within that group.


**Streptococcus pyogenes** - https://en.wikipedia.org/wiki/Streptococcus_pyogenes
Streptococcus pyogenes is a species of Gram-positive, aerotolerant bacteria in the genus Streptococcus. These bacteria are extracellular, and made up of non-motile and non-sporing cocci (round cells) that tend to link in chains. They are clinically important for humans, as they are an infrequent, but usually pathogenic, part of the skin microbiota that can cause Group A streptococcal infection. S. pyogenes is the predominant species harboring the Lancefield group A antigen, and is often called group A Streptococcus (GAS). However, both Streptococcus dysgalactiae and the Streptococcus anginosus group can possess group A antigen as well. Group A streptococci, when grown on blood agar, typically produce small (2–3 mm) zones of beta-hemolysis, a complete destruction of red blood cells. The name group A (beta-hemolytic) Streptococcus (GABHS) is thus also used.

![Streptococcus pyogenes](https://i.ibb.co/GRhzXHd/Streptococcus-pyogenes.jpg)

The species name is derived from Greek words meaning 'a chain' (streptos) of berries (coccus [Latinized from kokkos]) and pus (pyo)-forming (genes), since a number of infections caused by the bacterium produce pus. The main criterion for differentiation between Staphylococcus spp. and Streptococcus spp. is the catalase test. Staphylococci are catalase positive whereas streptococci are catalase-negative. S. pyogenes can be cultured on fresh blood agar plates. Under ideal conditions, it has an incubation period of 1 to 3 days.

An estimated 700 million GAS infections occur worldwide each year. While the overall mortality rate for these infections is 0.1%, over 650,000 of the cases are severe and invasive, with these cases having a mortality rate of 25%. Early recognition and treatment are critical; diagnostic failure can result in sepsis and death.


**Streptococcus pneumoniae** - https://en.wikipedia.org/wiki/Streptococcus_pneumoniae
Streptococcus pneumoniae, or pneumococcus, is a Gram-positive, spherical bacterium, alpha-hemolytic (under aerobic conditions) or beta-hemolytic (under anaerobic conditions), and an aerotolerant anaerobic member of the genus Streptococcus. The cells are usually found in pairs (diplococci), do not form spores, and are non-motile. As a significant human pathogenic bacterium, S. pneumoniae was recognized as a major cause of pneumonia in the late 19th century, and is the subject of many humoral immunity studies.

![Streptococcus ](https://i.ibb.co/zPKBrJB/Pneumococcus-CDC-PHIL-ID1003.jpg)

Streptococcus pneumoniae resides asymptomatically in healthy carriers, typically colonizing the respiratory tract, sinuses, and nasal cavity. However, in susceptible individuals with weaker immune systems, such as the elderly and young children, the bacterium may become pathogenic and spread to other locations to cause disease. It spreads by direct person-to-person contact via respiratory droplets and by auto-inoculation in persons carrying the bacteria in their upper respiratory tracts. It can be a cause of neonatal infections.

**Campylobacter jejuni** - https://en.wikipedia.org/wiki/Campylobacter_jejuni
Campylobacter jejuni (/ˈkæmpɪloʊˌbæktər dʒəˈdʒuːni/) is one of the most common causes of food poisoning in Europe and in the United States. The vast majority of cases occur as isolated events, not as part of recognized outbreaks. Active surveillance through the Foodborne Diseases Active Surveillance Network (FoodNet) indicates that about 20 cases are diagnosed each year for each 100,000 people in the US, while many more cases are undiagnosed or unreported; the CDC estimates a total of 1.5 million infections every year. The European Food Safety Authority reported 246,571 cases in 2018, and estimated approximately nine million cases of human campylobacteriosis per year in the European Union.

![Campylobacter jejuni](https://i.ibb.co/7KDgWym/ARS-Campylobacter-jejuni.jpg)

Campylobacter jejuni is in a genus of bacteria that is among the most common causes of bacterial infections in humans worldwide. Campylobacter means "curved rod", deriving from the Greek kampylos (curved) and baktron (rod). Of its many species, C. jejuni is considered one of the most important from both a microbiological and public health perspective.

**Salmonella enterica** - https://en.wikipedia.org/wiki/Salmonella_enterica
Salmonella enterica (formerly Salmonella choleraesuis) is a rod-shaped, flagellate, facultative anaerobic, Gram-negative bacterium and a species of the genus Salmonella. A number of its serovars are serious human pathogens.

![Salmonella ](https://i.ibb.co/xMbfx62/1280px-Salmonella-enterica-serovar-typhimurium-01.jpg)

**Escherichia coli** - https://en.wikipedia.org/wiki/Escherichia_coli
Escherichia coli (/ˌɛʃəˈrɪkiə ˈkoʊlaɪ/), also known as E. coli (/ˌiː ˈkoʊlaɪ/), is a Gram-negative, facultative anaerobic, rod-shaped, coliform bacterium of the genus Escherichia that is commonly found in the lower intestine of warm-blooded organisms. Most E. coli strains are harmless, but some serotypes (EPEC, ETEC etc.) can cause serious food poisoning in their hosts, and are occasionally responsible for food contamination incidents that prompt product recalls. The harmless strains are part of the normal microbiota of the gut, and can benefit their hosts by producing vitamin K2, and preventing colonisation of the intestine with pathogenic bacteria, having a mutualistic relationship. E. coli is expelled into the environment within fecal matter. The bacterium grows massively in fresh fecal matter under aerobic conditions for 3 days, but its numbers decline slowly afterwards.
![Escherichia ](https://i.ibb.co/XzM8NrX/1280px-E-coli-at-10000x-original.jpg)

**Enterococcus_hirae** - https://en.wikipedia.org/wiki/Enterococcus
Enterococcus is a large genus of lactic acid bacteria of the phylum Firmicutes. Enterococci are gram-positive cocci that often occur in pairs (diplococci) or short chains, and are difficult to distinguish from streptococci on physical characteristics alone. Two species are common commensal organisms in the intestines of humans: E. faecalis (90–95%) and E. faecium (5–10%). Rare clusters of infections occur with other species, including E. casseliflavus, E. gallinarum, and E. raffinosus.

![Enterococcus_hirae](https://i.ibb.co/Dzs7FvB/Enterococcus-histological-pneumonia-01.png)

**Escherichia fergusonii** - https://en.wikipedia.org/wiki/Escherichia_fergusonii
Escherichia fergusonii is a Gram-negative, rod-shaped species of bacterium. Closely related to the well-known species Escherichia coli, E. fergusonii was first isolated from samples of human blood. The species is named for American microbiologist William W. Ferguson.
Some strains of E. fergusonii are pathogenic. It is known to infect open wounds in humans and may also cause bacteraemia or urinary tract infections. Strains causing these infections have been found to be highly resistant to the antibiotic ampicillin, though some are also resistant to gentamicin and chloramphenicol. An antibiotic-resistant strain of the species was found to be associated with an incidence of cystitis in a 52-year-old woman in 2008.

**Staphylococcus aureus** - https://en.wikipedia.org/wiki/Staphylococcus_aureus
Staphylococcus aureus is a Gram-positive round-shaped bacterium, a member of the Firmicutes, and is a usual member of the microbiota of the body, frequently found in the upper respiratory tract and on the skin. It is often positive for catalase and nitrate reduction and is a facultative anaerobe that can grow without the need for oxygen. Although S. aureus usually acts as a commensal of the human microbiota it can also become an opportunistic pathogen, being a common cause of skin infections including abscesses, respiratory infections such as sinusitis, and food poisoning. Pathogenic strains often promote infections by producing virulence factors such as potent protein toxins, and the expression of a cell-surface protein that binds and inactivates antibodies. S. aureus is one of the leading pathogens for deaths associated with Antimicrobial resistance and the emergence of antibiotic-resistant strains such as methicillin-resistant S. aureus (MRSA) is a worldwide problem in clinical medicine. Despite much research and development, no vaccine for S. aureus has been approved.

![Staphylococcus ](https://i.ibb.co/Kzm0DvQ/1280px-Staphylococcus-aureus-VISA-2.jpg)

**Klebsiella pneumoniae** - https://en.wikipedia.org/wiki/Klebsiella_pneumoniae
Klebsiella pneumoniae is a Gram-negative, non-motile, encapsulated, lactose-fermenting, facultative anaerobic, rod-shaped bacterium. It appears as a mucoid lactose fermenter on MacConkey agar.

![Klebsiella ](https://i.ibb.co/GTPQmYP/Klebsiella-pneumoniae-01.png)

Although found in the normal flora of the mouth, skin, and intestines, it can cause destructive changes to human and animal lungs if aspirated, specifically to the alveoli resulting in bloody, brownish or yellow colored jelly like sputum. In the clinical setting, it is the most significant member of the genus Klebsiella of the Enterobacteriaceae. K. oxytoca and K. rhinoscleromatis have also been demonstrated in human clinical specimens. In recent years, Klebsiella species have become important pathogens in nosocomial infections.
It naturally occurs in the soil, and about 30% of strains can fix nitrogen in anaerobic conditions. As a free-living diazotroph, its nitrogen-fixation system has been much studied, and is of agricultural interest, as K. pneumoniae has been demonstrated to increase crop yields in agricultural conditions.

In [None]:
# Load the competition data; `row_id` becomes the index of both frames.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col="row_id")
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col="row_id")

# Name of the label column; every other training column is treated as a feature.
TARGET = 'target'
FEATURES = [column for column in train.columns if column != TARGET]

## LOOK INTO DATA
Let's take a quick look at the data - just the first 10 rows.

In [None]:
# Display the first 10 rows of the training frame.
train.head(10)

## HOW MANY OBSERVATIONS IN DATASETS?

In [None]:
# Report the number of rows in the training and test frames.
print(f'Number of observations in TRAIN:{len(train)}')
print(f'Number of observations in TEST:{len(test)}')

Observation:
* there are 200,000 observations in the train dataset and 100,000 in the test dataset

## WHAT DOES THE TARGET LOOK LIKE?

In [None]:
# Count how many training rows carry each target label.
train.target.value_counts()

Observations:
- there are 10 classes in dataset
- dataset is balanced

## ARE THERE ANY DUPLICATES IN THE OBSERVATIONS?

In [None]:
# Count rows whose feature values repeat an earlier row; only the FEATURES
# columns are compared, so the target column is ignored here.
print(train[FEATURES].duplicated().sum())
print(test[FEATURES].duplicated().sum())

Observations:
- there are 76007 duplicates in the training set and 26779 in the test set

In [None]:
# Keep only the first occurrence of every fully identical training row
# (all columns, including the target, take part in the comparison).
train = train.drop_duplicates(keep='first')

## ARE THERE ANY MISSING VALUES IN DATASETS?

In [None]:
# Print True if any cell of the respective frame is null, False otherwise.
print(train.isnull().values.any())
print(test.isnull().values.any())

Observations:
- there are no missing values in either the test or the train dataset

## FEATURE DISTRIBUTION

In [None]:
# One 100-bin histogram per feature, drawn on a 72 x 4 grid of axes.
fig, axs = plt.subplots(72, 4, figsize=(16,300))
for position, feature in enumerate(FEATURES):
    panel = axs.flat[position]
    panel.hist(train[feature], bins=100)
    panel.set_title(feature)
    panel.grid()

## DO WE SEE ANY CLUSTERS?

### A. T-SNE

In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Embed a seeded random sample of 10000 training rows into 2-D with t-SNE,
# after standardising every feature.
train_sub = train.sample(10000, random_state= 42)
model = TSNE(n_components=2, random_state=0, perplexity= 50, n_iter=3000)
scaled_features = StandardScaler().fit_transform(train_sub.drop('target', axis = 1).astype(float))
tsne_data = model.fit_transform(scaled_features)

# Build the plotting frame column by column: stacking the float coordinates
# and the labels into a single array would coerce the coordinates to object
# dtype instead of keeping them numeric.
tsne_df = pd.DataFrame(data=tsne_data, columns=["D1", "D2"])
tsne_df["target"] = train_sub.target.to_numpy()

sns.FacetGrid(tsne_df, hue="target", height=6).map(plt.scatter, 'D1', 'D2').add_legend()
plt.title('Perplexity= 50, n_iter=3000')
plt.show()

### B. LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Supervised 2-D projection of a seeded 10000-row sample, coloured by target.
train_sub = train.sample(10000, random_state= 42)
lda_model = LDA(n_components=2)
lda_data = lda_model.fit_transform(train_sub.drop(columns='target'), train_sub.target)
plt.figure(figsize=(10,10))
lda_first, lda_second = lda_data[:, 0], lda_data[:, 1]
sns.scatterplot(x=lda_first, y=lda_second, hue='target', data=train_sub)

Observations:
- as we can see there are visible patterns in data 

### C. UMAP

In [None]:
import umap

# Fit a 2-D and a 3-D UMAP embedding on the same seeded 10000-row sample.
train_sub = train.sample(10000, random_state= 42)
umap_input = train_sub.drop(columns='target').to_numpy()
embedding_2d = umap.UMAP(random_state = 42 ,n_components=2).fit_transform(umap_input)
embedding_3d = umap.UMAP(random_state = 42 ,n_components=3).fit_transform(umap_input)

In [None]:
# Scatter the two UMAP components of the sample, coloured by its target column.
plt.figure(figsize=(10,10))
sns.scatterplot(x = embedding_2d[:, 0], y = embedding_2d[:, 1], hue='target', data=train_sub)

### D. UMAP 3D

In [None]:
# Interactive 3-D scatter of the UMAP embedding, coloured by target.
# Plotly renders its own figure, so no matplotlib figure is opened here
# (opening one only produced an extra, empty plot).
umap_3d = px.scatter_3d(
    embedding_3d, x=0, y=1, z=2,
    labels={'color': 'target'},
    color= train_sub.target,
    # A 10-colour qualitative palette, so each of the 10 classes gets its own
    # colour; a shorter list is cycled and makes classes share colours.
    color_discrete_sequence=px.colors.qualitative.Plotly,
)

umap_3d.update_traces(marker_size=2)
umap_3d.show()

## HELPER FUNCTION - REDUCE MEMORY USAGE

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Signed and unsigned integer columns are cast to the narrowest integer
    type of the same signedness that holds their min and max (type limits
    included). Float columns are cast to the narrowest float type whose
    range holds their values. Object columns become ``category``. Every
    other dtype (bool, datetime, complex, pandas extension dtypes, ...) is
    left untouched, as are columns whose min/max are NaN or infinite.

    Args:
        df: DataFrame to shrink; it is modified in place.
        use_float16: When True (default), float columns may be cast down to
            float16. A column fits by value range only, so precision can be
            lost; pass False to stop at float32.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy dtypes are handled; extension dtypes stay as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        if col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: nothing sensible to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Candidates are ordered by width; never widen a column.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type limit still fits.
            # NaN min/max compare False, which leaves the column unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the column dtypes of both frames to save memory.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

## STARTING MODEL

A quick baseline, put together fast as a first check.

In [None]:
# Encode the target labels as integers and store them back in the frame.
# `lb` is kept so predictions can be mapped back to the original labels.
lb = LabelEncoder()
y = lb.fit_transform(train['target'])
train['target'] = y

In [None]:
# Number of cross-validation folds and the random seed for the CV splitter.
N_SPLITS = 5
SEED = 42

## EXTRA TREES HYPERPARAMETER SEARCH

In [None]:
def get_models_ntrees():
    """Return ExtraTreesClassifier candidates keyed by their tree count (as str).

    The candidate values for ``n_estimators`` are 30 and 50.
    """
    return {str(count): ExtraTreesClassifier(n_estimators=count) for count in (30, 50)}

def get_models_features(n_estimators):
    """Return ExtraTreesClassifier candidates keyed by ``max_features`` (as str).

    ``max_features`` takes the values 19 to 22; every candidate uses the given
    ``n_estimators``.
    """
    return {
        str(width): ExtraTreesClassifier(n_estimators = n_estimators, max_features=width)
        for width in range(19, 23)
    }

def get_models_min_samples_split(n_estimators, max_features):
    """Return ExtraTreesClassifier candidates keyed by ``min_samples_split`` (as str).

    ``min_samples_split`` takes the values 2 to 4; every candidate uses the
    given ``n_estimators`` and ``max_features``.
    """
    candidates = {}
    for threshold in range(2, 5):
        candidates[str(threshold)] = ExtraTreesClassifier(
            n_estimators = n_estimators,
            max_features = max_features,
            min_samples_split=threshold,
        )
    return candidates


In [None]:
def evaluate_model(model, X, y):
    """Return the per-fold accuracy of ``model`` under stratified k-fold CV.

    Uses ``N_SPLITS`` folds. Shuffling is enabled so that ``SEED`` actually
    controls the fold assignment; recent scikit-learn versions raise a
    ValueError when ``random_state`` is given while ``shuffle`` is False.

    Args:
        model: Unfitted scikit-learn classifier.
        X: Feature matrix.
        y: Target labels.

    Returns:
        Array with one accuracy score per fold.
    """
    cv = StratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = SEED)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

In [None]:
def run_test(models_hyp):
    """Cross-validate every candidate model and return the best parameter value.

    Args:
        models_hyp: Mapping from a parameter value (as str) to the model
            configured with it.

    Returns:
        The parameter value, converted to int, whose mean CV accuracy is
        highest (0 if no candidate scores above 0).

    Prints mean and std accuracy per candidate and shows a box plot of the
    per-fold scores. Reads the module-level ``train``, ``FEATURES`` and ``y``.
    """
    all_scores = []
    param_names = []
    top_score = 0
    top_param = 0
    for param, model in models_hyp.items():
        fold_scores = evaluate_model(model, train[FEATURES], y)
        all_scores.append(fold_scores)
        param_names.append(param)
        avg_score = mean(fold_scores)
        print('>%s %.5f (%.5f)' % (param, avg_score, std(fold_scores)))
        # Strict comparison: on ties the earlier candidate stays the winner.
        if avg_score > top_score:
            top_score, top_param = avg_score, int(param)

    pyplot.boxplot(all_scores, labels=param_names, showmeans=True)
    pyplot.show()

    print(f'Best param value: {top_param} score: {top_score}')

    return top_param

In [None]:
# DEMO!!! Looking for best n_estimators - it takes hours to find best params
# Only the tree counts listed in get_models_ntrees are compared here.
models_hyp = get_models_ntrees()
n_estimators = run_test(models_hyp)

In [None]:
# DEMO!!! Looking for best max_features - it takes hours to find best params
# Uses the n_estimators value selected in the previous step.
models_hyp = get_models_features(n_estimators = n_estimators)
max_features = run_test(models_hyp)

In [None]:
# DEMO!!! Looking for best min_samples_split - it takes hours to find best params
# Uses the n_estimators and max_features values selected in the previous steps.
models_hyp = get_models_min_samples_split(n_estimators, max_features)
min_samples_split = run_test(models_hyp)

In [None]:
# Found during experiment
# (replaces the value returned by the short demo search above).
n_estimators = 1000

In [None]:
def get_models():
    """Return fresh, unfitted base models for the super learner, keyed by name.

    The ExtraTrees model uses the module-level ``n_estimators``; the random
    forest uses 600 trees. Both train on all available cores.
    """
    models = dict()

    models['ExtraTreesClassifier'] = ExtraTreesClassifier(n_estimators= n_estimators,
                                                          n_jobs = -1)

    # n_jobs=-1 matches the ExtraTrees model; it only parallelises tree
    # building and prediction.
    models['RandomForestClassifier'] = RandomForestClassifier(n_estimators= 600,
                                                              n_jobs = -1)
    return models

## SUPER LEARNER ENSEMBLE
The super learner algorithm was proposed by Mark van der Laan, et al. from Berkeley in their
2007 paper titled Super Learner.

Source: Ensemble Learning Algorithms With Python, Jason Brownlee

1. Select a k-fold split of the training dataset
2. Select m base-models or model configurations.
3. For each base-model:
    * (a) Evaluate using k-fold cross-validation.
    * (b) Store all out-of-fold predictions.
    * (c) Fit the model on the full training dataset and store.
4. Fit a meta-model on the out-of-fold predictions.
5. Evaluate the model on a holdout dataset or use model to make predictions.

![super](https://i.ibb.co/tZ219TQ/super-learner.jpg)
Image source: Super Learner In Prediction, 2010

In [None]:
def get_out_of_fold_predictions(X, y, models):
    """Collect out-of-fold class probabilities of every base model.

    For each of the ``N_SPLITS`` folds, every model is fitted on the training
    part and predicts probabilities for the held-out part. The probabilities
    of all models are concatenated column-wise per fold, and the folds are
    stacked row-wise.

    The folds are stratified and seeded with ``SEED``. Stratification keeps
    every class present in each training part, so ``predict_proba`` always
    returns the same number of columns and the stacked folds line up.
    Seeding makes the meta-features reproducible.

    Args:
        X: Feature matrix, indexable by integer position arrays.
        y: Target array aligned with ``X``.
        models: Mapping name -> classifier supporting ``fit``,
            ``predict_proba`` and ``predict``. The models are refitted on
            every fold.

    Returns:
        Tuple ``(meta_X, meta_y)``: the stacked probabilities and the matching
        targets, both in fold order (not in the original row order).
    """
    meta_X, meta_y = list(), list()
    kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    print("* START - OOF PREDICTION for base models * ")
    for fold_id, (train_ix, test_ix) in enumerate(kfold.split(X, y)):
        print(f'FOLD: {fold_id+1}')
        fold_yhats = list()
        train_X, test_X = X[train_ix], X[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]
        meta_y.extend(test_y)

        for name, model in models.items():
            model.fit(train_X, train_y)
            yhat = model.predict_proba(test_X)
            fold_yhats.append(yhat)

            # Hard predictions are only used to report the fold accuracy.
            yhat_eval = model.predict(test_X)
            score = accuracy_score(test_y, yhat_eval)
            print('     %s: %.5f' % (name, score))

        meta_X.append(hstack(fold_yhats))
    return vstack(meta_X), asarray(meta_y)

def fit_base_models(X, y, models):
    """Fit every model in ``models`` on ``(X, y)``, printing progress.

    The models are fitted in place; nothing is returned.
    """
    print("* START - Fitting base models * ")
    for name in models:
        print(f'   Fitting - {name} model')
        models[name].fit(X, y)
    print("* FINISH - Fitting base models * ")

def fit_meta_model(X, y):
    """Fit and return the meta model, a liblinear logistic regression.

    Args:
        X: Meta-features (stacked base-model probabilities).
        y: Targets aligned with ``X``.
    """
    print("* START - Fitting meta model * ")
    meta_learner = LogisticRegression(solver='liblinear').fit(X, y)
    print("* FINISH - Fitting meta model * ")
    return meta_learner

def evaluate_models(X, y, models):
    """Print the accuracy of every fitted model in ``models`` on ``(X, y)``.

    Each line is labelled with the model's class name, not its dict key.
    """
    for fitted in models.values():
        accuracy = accuracy_score(y, fitted.predict(X))
        print('%s: %.5f' % (fitted.__class__.__name__, accuracy))

def super_learner_predictions(X, models, meta_model):
    """Return the meta model's class predictions for ``X``.

    The probabilities of every fitted base model (in dict order) are
    concatenated column-wise and passed to ``meta_model.predict``.
    """
    base_probas = [base.predict_proba(X) for base in models.values()]
    return meta_model.predict(hstack(base_probas))

def super_learner_predictions_proba(X, models, meta_model):
    """Return the meta model's class probabilities for ``X``.

    The probabilities of every fitted base model (in dict order) are
    concatenated column-wise and passed to ``meta_model.predict_proba``.
    """
    base_probas = [base.predict_proba(X) for base in models.values()]
    return meta_model.predict_proba(hstack(base_probas))


## CROSS VALIDATION AND MODEL EVALUATION

In [None]:
# Hold out 10% of the training rows for validation. The split is seeded so
# the reported scores are reproducible, and stratified so the holdout keeps
# the class proportions of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    train[FEATURES].values, y, test_size=0.10, random_state=SEED, stratify=y
)

In [None]:
# Fresh, unfitted base models for the validation run.
models = get_models()

In [None]:
# Out-of-fold base-model probabilities on the training split; these are the
# training data of the meta model.
meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)

### FIT THE BASE MODEL

In [None]:
# Refit every base model on the whole training split.
fit_base_models(X_train, y_train, models)

### FIT THE META MODEL

In [None]:
# Train the meta model on the out-of-fold probabilities.
meta_model = fit_meta_model(meta_X, meta_y)

### EVALUATE BASE MODELS

In [None]:
# Print the accuracy of each fitted base model on the validation split.
evaluate_models(X_val, y_val, models)

### EVALUATE SUPER LEARNER MODEL 

In [None]:
# Accuracy of the stacked super learner on the validation split.
y_hat = super_learner_predictions(X_val, models, meta_model)
score = accuracy_score(y_val, y_hat)
print('Super Learner: %.5f' % score)

## TRAIN ON FULL DATA FOR FINAL PRED

In [None]:
# Repeat the procedure on all training rows with fresh base models:
# out-of-fold probabilities, refit of the base models, then the meta model.
final_models = get_models()
meta_X_full, meta_y_full = get_out_of_fold_predictions(train[FEATURES].values, y, final_models)
fit_base_models(train[FEATURES].values, y, final_models)
meta_model_full = fit_meta_model(meta_X_full, meta_y_full)

## PREDICT AND SUBMIT

In [None]:
# Encoded class predictions of the final super learner for the test set.
# NOTE(review): the final base models were fitted on NumPy arrays but receive
# a DataFrame here - confirm this is intended.
preds = super_learner_predictions(test[FEATURES], final_models, meta_model_full)

In [None]:
# Class probabilities of the final super learner for the test set; the base
# models predict on the test set again for this call.
y_prob = super_learner_predictions_proba(test[FEATURES], final_models, meta_model_full)

This is a hack aimed only at the public LB - in my opinion it leads to overfitting.

In [None]:
# "Optimization" code from https://www.kaggle.com/sfktrkl/tps-feb-2022

# Share (in percent) of every encoded class in the training set.
target_distribution = train['target'].value_counts().sort_index() / len(train) * 100
def get_diff(tune):
    """Return, per class, the training share minus the predicted test share (in percent).

    ``tune`` holds one additive offset per class; it is added to ``y_prob``
    before the argmax. Classes that receive no prediction count as 0% instead
    of becoming NaN, so they can still be tuned upwards.
    """
    y_pred_tuned = np.argmax(y_prob + tune, axis=1)
    predicted_distribution = (
        pd.Series(y_pred_tuned).value_counts()
        .reindex(target_distribution.index, fill_value=0)
        / len(test) * 100
    )
    return target_distribution - predicted_distribution

# One offset per probability column instead of a hard-coded list of 10 zeros.
tune = np.zeros(y_prob.shape[1])
diff = get_diff(tune)
# Safety cap: stop instead of looping forever if the offsets start to
# oscillate around the 0.1 percentage-point tolerance.
max_steps = 20000
steps = 0
while abs(diff).max() > 0.1 and steps < max_steps:
    # Adjust the first class that is off by more than 0.1 percentage points,
    # then re-evaluate the whole distribution.
    for i, gap in diff.items():
        if gap > 0.1:
            tune[i] += 0.001
            break
        if gap < -0.1:
            tune[i] -= 0.001
            break
    diff = get_diff(tune)
    steps += 1

# Credits to https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
y_prob += tune
y_pred_tuned = lb.inverse_transform(np.argmax(y_prob, axis=1))
pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100

In [None]:
# Load the sample submission; its target column is overwritten below.
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

In [None]:
# Submission 1: predictions after the class-distribution tuning.
sub['target'] = y_pred_tuned
sub.to_csv("sl-tuned-submission.csv", index=False)
sub.head(10)

# Submission 2: plain super-learner predictions, decoded back to the labels.
sub['target'] = lb.inverse_transform(preds)
sub.to_csv("sl-base-submission.csv", index=False)
sub.head(10)