# Objective

Classify 10 different bacteria species based on repeated lossy measurements of DNA snippets.


## Versions



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm import tqdm
import re
import joblib
import gc
from scipy import stats

import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import sys
sys.path.append('../input/pycaret/pyod')
sys.path.append('../input/pycaret/pycaret')
from pycaret.classification import *

import warnings
warnings.simplefilter('ignore')

### Load data

In [None]:
# Load the pickled training set.
# NOTE(review): unpickling can execute arbitrary code; only read pickles from a trusted dataset.
train = pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/train.pkl')
# Drop the '*_folds' columns (presumably precomputed fold assignments) so they are not used as features.
train = train.drop(columns=['5_folds', '10_folds', '20_folds'])
# Shuffle all rows; the fixed seed makes the resulting row order identical on every run.
train = train.sample(frac=1, random_state=42)

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Credit: https://www.kaggle.com/sfktrkl/tps-feb-2022
train = train.drop_duplicates(keep='first')

In [None]:
# Print the (rows, columns) shape of the training frame, then preview its first five rows.
print(tuple(train.shape))
train.head(n=5)

# Model comparison with PyCaret

In [None]:
# Encode the target labels as integer codes. The fitted encoder stays in `le`
# so predictions can be mapped back to the original labels later.
le = LabelEncoder()
le.fit(train['target'])
train['target'] = le.transform(train['target'])

In [None]:
# Initialise the PyCaret classification experiment on the training frame.
clf = setup(
    data=train,
    target='target',
    # Fixed seed so PyCaret's internal split, folds and model seeds repeat across runs.
    session_id=42,
    # PyCaret 2.x: skip the interactive data-type confirmation prompt.
    silent=True,
    # `verbose` is documented as a bool; False suppresses the setup output grid.
    verbose=False,
)

In [None]:
# Cross-validate the remaining PyCaret estimators and keep the two best by accuracy.
top = compare_models(
    sort='Accuracy',
    n_select=2,
    # Estimators left out of the comparison.
    exclude=['rf', 'xgboost', 'gbc', 'lr', 'catboost'],
)

# Prediction & submission

In [None]:
# Blend the two selected models into one ensemble, then finalize it with
# PyCaret so the result is ready for prediction on unseen data.
model = finalize_model(blend_models(top))

In [None]:
# Load the test frame and the frame that is later written out as the submission.
test, sub = (
    pd.read_pickle(path)
    for path in (
        '../input/tpsfeb2022-ds-to-pickle-with-folds/test.pkl',
        '../input/tpsfeb2022-ds-to-pickle-with-folds/sub.pkl',
    )
)

In [None]:
# Score the test frame with the finalized model and keep only the predicted-label column.
pred = predict_model(model, data=test)['Label']

In [None]:
# Map the predicted integer codes back to the original labels.
# Bracket assignment is used deliberately: `sub.target = ...` only updates a column that
# already exists, and would otherwise set a plain attribute and leave the CSV without predictions.
sub['target'] = le.inverse_transform(pred)
# Write the submission file without the DataFrame index.
sub.to_csv('submission.csv', index=False)
# Preview the first rows of the submission.
sub.head()