# Objective

Classify 10 different bacteria species based on repeated lossy measurements of DNA snippets.


## Versions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm import tqdm
import re
import joblib
import gc
from scipy import stats

import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier as et
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import optuna

import warnings
warnings.simplefilter('ignore')

In [None]:

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an ExtraTrees classifier.

    Only ``n_estimators`` (sampled in [100, 1000]) is tuned; every other
    estimator setting is fixed below.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` and to report the
            running score after each fold.

    Returns:
        Mean accuracy over the stratified validation folds, as a float.

    Raises:
        optuna.TrialPruned: If the study's pruner requests that the trial be
            stopped after a fold has been reported.
    """
    # The training data is reloaded on every trial and released again below
    # (see the `del` / `gc.collect()` calls).
    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # use it on a dataset you trust.
    # A fixed shuffle seed keeps the row order -- and therefore the CV folds --
    # identical across trials, so trial scores are directly comparable.
    train = (
        pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/train.pkl')
        .drop_duplicates(keep='first')
        .sample(frac=1, random_state=42)
    )
    kfold = StratifiedKFold(5, shuffle=True, random_state=42)
    # Drop the precomputed fold-assignment columns and the label itself.
    X = train.drop(['5_folds', '10_folds', '20_folds', 'target'], axis=1)

    y = LabelEncoder().fit_transform(train.target)
    del train

    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        # 'auto' was removed in scikit-learn 1.3; for classifiers it was an
        # alias of 'sqrt', so this keeps the same behaviour on all versions.
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=False,
        oob_score=False,
        n_jobs=-1,
        random_state=42,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None,
    )

    fold_scores = []
    try:
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(X, y)):
            X_train, y_train = X.iloc[trn_idx], y[trn_idx]
            X_val, y_val = X.iloc[val_idx], y[val_idx]

            model = et(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            # scikit-learn convention: ground truth first, predictions second.
            fold_scores.append(accuracy_score(y_val, y_pred))

            # Free per-fold objects before the next (large) fit.
            del model, y_pred, X_train, y_train, X_val, y_val
            gc.collect()

            # Report the running mean so a pruner attached to the study can
            # stop unpromising trials early. Whether it ever does depends on
            # the pruner's own settings (e.g. its warm-up step count).
            trial.report(float(np.mean(fold_scores)), fold)
            if trial.should_prune():
                raise optuna.TrialPruned()
    finally:
        # Release the dataset whether the trial finished or was pruned.
        del X, y
        gc.collect()

    return float(np.mean(fold_scores))

In [None]:
# Hyperparameter search with Optuna: maximise the score returned by `objective`.
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(direction='maximize', pruner=median_pruner)

# Time budget for the whole search, in seconds (8 hours).
time_budget_seconds = 8 * 3600
study.optimize(objective, timeout=time_budget_seconds)

# Summary: how many trials were run, then the best hyperparameters found.
n_trials_run = len(study.trials)
print(n_trials_run)

best_params = study.best_trial.params
print(best_params)