# Objective

Classify 10 different bacteria species based on repeated lossy measurements of DNA snippets.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm import tqdm
import re
import joblib
import gc
from scipy import stats

import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier as et
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the pickled training frame and print its (rows, columns) shape.
train = pd.read_pickle(
    "../input/tpsfeb2022-ds-to-pickle-with-folds/train.pkl"
)
print(train.shape)

In [None]:
# Share of each target class in the training set, in percent, ordered by label.
target_counts = train['target'].value_counts().sort_index()
classes_dist_train = target_counts / len(train) * 100
classes_dist_train

# Baseline model

In [None]:
# Keyword arguments passed to the extra-trees classifier of the baseline model.
params = dict(
    n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    # 'auto' meant sqrt(n_features) for classifiers; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3. 'sqrt' is the equivalent value that
    # every version accepts.
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    # The out-of-bag score is only available when bootstrap sampling is on.
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,        # use every available core
    random_state=42,  # fixed seed for reproducible forests
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

In [None]:
# Features: everything except the label and the fold-assignment columns.
X = train.drop(columns=['target', '5_folds', '10_folds', '20_folds'])

# Encode the target labels as integers; `le` is kept so that predictions can be
# mapped back to the original labels later.
le = LabelEncoder().fit(train['target'])
y = le.transform(train['target'])

# The full frame is no longer needed once X and y exist.
del train
gc.collect()

In [None]:
# Fit the extra-trees baseline on the whole training set and persist it to disk.
model = et(**params).fit(X, y)
joblib.dump(model, 'et_all.pkl')

# Drop the training arrays and reclaim their memory.
del X
del y
gc.collect()

# Submission

In [None]:
# Read the pickled test-set frame that the fitted model will score.
X_test = pd.read_pickle(
    "../input/tpsfeb2022-ds-to-pickle-with-folds/test.pkl"
)

In [None]:
# Per-class probability estimates for every test row (one column per class).
y_test = model.predict_proba(X=X_test)

In [None]:
# Load the submission template and fill it with the most probable class per row.
sub = pd.read_pickle("../input/tpsfeb2022-ds-to-pickle-with-folds/sub.pkl")

# Bracket assignment always writes a column. Attribute assignment would silently
# create a plain Python attribute if 'target' were missing from the template,
# and the pandas warning about it is suppressed by the global warnings filter.
sub['target'] = le.inverse_transform(np.argmax(y_test, axis=1))

# Share of each predicted class in percent, ordered by label.
classes_dist_test = sub['target'].value_counts().sort_index() / len(X_test) * 100
classes_dist_test

In [None]:
   
# Rebalance the predicted classes with respect to the training set.
# Credit: https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
# A constant bonus is added to the probability columns at index 2 and 3 before
# taking the argmax again.
# NOTE: `+=` modifies y_test in place, so re-running this cell adds the bonus
# a second time.
y_test += np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0, 0, 0])

# Bracket assignment guarantees the column is written (attribute assignment
# would create a plain attribute if 'target' did not exist).
sub['target'] = le.inverse_transform(np.argmax(y_test, axis=1))

# Share of each predicted class after rebalancing, in percent.
sub['target'].value_counts().sort_index() / len(X_test) * 100

In [None]:
# Write the predictions without the index column, then preview the first rows.
sub.to_csv(path_or_buf='submission.csv', index=False)
sub.head()