## Table Of Contents
* **[EDA](#eda)**
* **[Dealing with Outliers](#outliers)**
* **[Dealing with Duplicates](#duplicates)**
* **[Model Training](#model)**
* **[Post Processing](#post)**
* **[Submission](#submit)**
* **[Acknowledgements](#thanks)**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random

from datetime import datetime
from scipy.stats import mode
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# Checks whether the kernel is in the batch (submission) mode.
# `.get` is used so that a missing KAGGLE_KERNEL_RUN_TYPE variable simply means
# "not a batch run" instead of raising KeyError.
submission = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') == 'Batch'

# Directory holding train.csv, test.csv and sample_submission.csv
data_dir = "/kaggle/input/tabular-playground-series-feb-2022/"

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(50)

# EDA <a id="eda"></a>

In [None]:
# row_id becomes the index of both frames instead of a regular column
train = pd.read_csv(f"{data_dir}train.csv", index_col="row_id")

# The test features are cast to float32 right after loading
test = pd.read_csv(f"{data_dir}test.csv", index_col="row_id")
test = test.astype(np.float32)

In [None]:
# Dimensions of the training frame as (rows, columns)
train.shape

In [None]:
# Preview the first rows, transposed so that each column is shown as a row
train.head().T

In [None]:
# Any missing values? Total number of NaN cells over the whole frame (0 means none)
train.isna().sum().sum()

# Dealing with Outliers <a id="outliers"></a>

In [None]:
# Clamp every feature to [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; the quartiles are
# computed over train and test together so both frames share the same bounds
data = pd.concat([train.drop('target', axis=1), test])
coef = 1.5

for column in train.columns:
    if column != 'target':
        Q1, Q3 = (data[column].quantile(q) for q in (0.25, 0.75))
        IQR = Q3 - Q1

        min_bound = Q1 - coef * IQR
        max_bound = Q3 + coef * IQR

        # Same two-step clamp for both frames: raise low values, then cap high ones
        for frame in (train, test):
            raised = np.where(frame[column] < min_bound, min_bound, frame[column])
            frame[column] = np.where(raised > max_bound, max_bound, raised)

# Dealing with Duplicates <a id="duplicates"></a>

In [None]:
features = train.columns[train.columns != 'target']

# Weight of each distinct row = number of times that row occurs in train.
# Grouping on ALL columns (features and target) with sort=False yields one group
# per distinct row, in order of first appearance - the same rows, in the same
# order, that `train.drop_duplicates(keep='first')` keeps. Grouping on the
# features alone would yield fewer weights than deduplicated rows whenever
# identical features carry different targets, silently misaligning the two.
# NOTE: groupby skips rows with NaN keys by default; this relies on the data
# having no missing values.
sample_weight = train.groupby(by=train.columns.to_list(), sort=False).size().to_numpy()

In [None]:
# Keep only the first occurrence of every fully identical row
dedup_train = train.drop_duplicates(keep='first')

# row_id is already the index, so only the label column has to be removed
X = dedup_train.drop(columns="target").astype(np.float32)

# Encode the string labels as integer codes
target_encoder = LabelEncoder()
target_encoder.fit(dedup_train["target"])
y = pd.Series(target_encoder.transform(dedup_train["target"]))

target_encoder.classes_

# Model Training <a id="model"></a>

In [None]:
# Per-fold outputs: test-set probabilities, test-set labels, validation accuracy
fold_probs, y_preds, scores = [], [], []

# Batch (submission) runs use more folds and a larger forest that grows each
# fold; interactive runs use a small fixed configuration
if submission:
    n_folds, estimators, est_inc = 15, 1250, 25
else:
    n_folds, estimators, est_inc = 5, 50, 0
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=431)

for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
    X_train, X_valid = X.iloc[train_id], X.iloc[test_id]
    y_train, y_valid = y.iloc[train_id], y.iloc[test_id]

    # The fold number doubles as the random seed of the forest
    model = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1, random_state=fold)

    # Time only the fit; duplicate counts are passed as sample weights
    start = datetime.now()
    model.fit(X_train, y_train, sample_weight=sample_weight[train_id])
    end = datetime.now()

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(
        y_valid,
        valid_pred,
        sample_weight=sample_weight[test_id],
    )
    print("Fold:", fold + 1, "Accuracy:", valid_score, 'Time:', end - start)
    scores.append(valid_score)

    # Save predictions to later submit the mean values
    y_preds.append(model.predict(test))
    fold_probs.append(model.predict_proba(test))

    estimators += est_inc

print("Mean accuracy score:", np.mean(scores))

# Post Processing <a id="post"></a>

In [None]:
# Majority vote over the per-fold label predictions (one row per fold, one
# column per test row). Depending on the SciPy version, `mode(...).mode` is
# either (1, n_rows) or (n_rows,); `np.ravel` gives a 1-D array in both cases,
# whereas indexing with `[0]` yields a single scalar on recent versions.
y_pred = target_encoder.inverse_transform(
    np.ravel(mode(np.asarray(y_preds), axis=0).mode)
)

mean_prob = sum(fold_probs) / len(fold_probs)  # Mean probability for each row

# The distribution of bacteria types: share of each encoded class in train, in percent
target_dist = pd.Series(target_encoder.transform(train['target'])).value_counts().sort_index() / len(train) * 100

# Finds the difference in percent between the normal and tuned target distributions
def get_diff(deltas, distribution):
    """Return `distribution` minus the predicted class shares, in percent.

    Predictions are the argmax of the global `mean_prob` after adding `deltas`
    to every row; shares are taken relative to `len(test)`.

    Args:
        deltas: per-class offsets added to the class probabilities.
        distribution: Series of reference class percentages indexed by class id.

    Returns:
        Series aligned with `distribution.index`; positive values mean the
        class is under-predicted, negative values that it is over-predicted.
    """
    tuned_predictions = pd.Series(np.argmax(mean_prob + deltas, axis=1))
    # A class that is never predicted is absent from value_counts(); count it
    # as 0 so that its (largest) deficit is reported instead of NaN.
    predicted_counts = tuned_predictions.value_counts().reindex(distribution.index, fill_value=0)
    return distribution - predicted_counts / len(test) * 100

# The list of probability deltas to match distributions, one entry per class
# (sized from target_dist instead of being hard-coded to 10 classes).
# Alternative preset kept for reference:
#deltas = [0, 0, 0.03, 0.036, 0, 0, 0, 0.027, 0, 0]
deltas = [0.0] * len(target_dist)

diff = get_diff(deltas, target_dist)
print("Mean difference before tuning:", diff.abs().mean(), "%")

# Finding the optimal probability deltas: repeatedly nudge the delta of the
# class whose share is furthest from the reference distribution
tolerance = 0.1  # stop once the largest gap is within this many percentage points
step = 0.001     # amount a delta is changed per iteration
for _ in range(1000):
    # np.argmax returns a position, so read the value positionally as well
    diff_max_id = np.argmax(diff.abs())
    worst_diff = diff.iloc[diff_max_id]

    if worst_diff > tolerance:
        # Under-predicted class: raise its probability
        deltas[diff_max_id] += step
    elif worst_diff < -tolerance:
        # Over-predicted class: lower its probability
        deltas[diff_max_id] -= step
    else:
        break
    diff = get_diff(deltas, target_dist)

print("Mean difference after tuning:", diff.abs().mean(), "%")
# NOTE: modifies mean_prob in place, so re-running this cell applies the deltas again
mean_prob += deltas

# Submitting results <a id="submit"></a>

In [None]:
# Write the submission file only in batch (submission) mode
if submission:
    out = pd.read_csv(data_dir + "sample_submission.csv")
    # Final label = class with the highest tuned mean probability.
    # NOTE(review): assumes sample_submission.csv lists rows in the same order
    # as test.csv - confirm.
    out["target"] = target_encoder.inverse_transform(np.argmax(mean_prob, axis=1))
    out.to_csv("submission.csv", index=False)
    # A bare `out.head()` inside the `if` block is discarded and shows nothing;
    # print it so the preview actually appears
    print(out.head())

# Acknowledgements <a id="thanks"></a>

> Shoutout to [Maxence Fuzellier](https://www.kaggle.com/maxencefzr/tps-feb22-eda-extratrees)


> Also, a big thanks to [AmbrosM](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants) and [this other notebook by AmbrosM](https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense)


> Thanks to [Şafak Türkeli](https://www.kaggle.com/sfktrkl/tps-feb-2022)