# Objective

Classify 10 different bacteria species based on repeated lossy measurements of DNA snippets.


## Versions

v1: run with just 1000 rows

v2: run with full dataset

v3: change the ensemble method for models trained from different folds

v4: consider StratifiedKFold

v5: blending with ExtraTreesClassifier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm import tqdm
import re
import joblib
import gc
from scipy import stats

import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier as et
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter('ignore')

## Exploratory Data Analysis

### Load data

In [None]:
# Load the training data from the pickled DataFrame and report its shape.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
train = pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/train.pkl')
print(f'Train shape: {train.shape}')

### Target

In [None]:
# Distinct values of the `target` column (the bacteria species), in order of
# first appearance in the training data.
target_column = train['target']
target_names = target_column.unique().tolist()
print(target_names)

In [None]:
# Number of training rows per class, most frequent class first.
rows_per_class = train['target'].value_counts()
print(rows_per_class.tolist())

### Features

In [None]:
# All column names of the training frame, kept in `cols` for the feature-name
# parsing further down.
cols = list(train.columns)
print(f'Train columns:\n{cols}')

There are 286 possible histograms, which correspond to the 286 features. Each of them contains some number (from 0 to 10) of each of the bases A, T, G and C.

In [None]:
# Split every histogram feature name into the four numbers that follow the
# letters A, T, G and C, giving one row per feature and one column per base.
#
# The column labels must be an ordered list: a set has no defined order, so
# assigning rows positionally could attach counts to the wrong base (and newer
# pandas versions refuse a set for `columns`).
bases = ['A', 'T', 'G', 'C']

# cols[1:-5] drops the first column and the last five.
# NOTE(review): presumably these are the id, target and fold columns - confirm.
# The split assumes the letters appear in the order A, T, G, C in each name.
base_counts = [
    # Convert to int so the counts are plotted on a numeric axis rather than
    # as string categories.
    [int(count) for count in re.split('A|T|G|C', col)[1:]]
    for col in cols[1:-5]
]
df = pd.DataFrame(base_counts, columns=bases)
df.head(3)

In [None]:
# Draw a 2x2 grid with one panel per base, plotting that base's column of `df`
# against the row index as coloured dots.
plt.figure(figsize=(20,10))

panels = [('A', 'r.'), ('T', 'b.'), ('G', 'g.'), ('C', 'y.')]
for position, (base, marker) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(df[base], marker)
    plt.title(base, fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

plt.show()

# Baseline model

In [None]:
# Keyword arguments passed to lgb.LGBMClassifier for every fold.
# random_state is None, so no seed is fixed here.
params = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 1.0,
    'importance_type': 'split',
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 100,
    'n_jobs': -1,
    'num_leaves': 31,
    'objective': None,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'silent': True,
    'subsample': 1.0,
    'subsample_for_bin': 200000,
    'subsample_freq': 0,
}

In [None]:
def run(train):
    """Cross-validate an LGBM + ExtraTrees blend over the five predefined folds.

    For each fold value in the ``5_folds`` column, the rows of that fold are
    held out for validation and the remaining rows are used to fit one
    ``lgb.LGBMClassifier`` (configured by the module-level ``params``) and one
    ExtraTrees classifier with 1000 trees. The two models are blended by
    averaging their predicted class probabilities, and the accuracy of the
    blend on the held-out rows is printed.

    Args:
        train: DataFrame containing the feature columns (every column whose
            name contains 'A'), a ``target`` column and a ``5_folds`` column.

    Returns:
        None. Per-fold accuracy and the mean accuracy are printed.

    Side effects:
        Appends both fitted models of every fold to the module-level list
        ``models`` (which must exist before this function is called) and
        dumps each of them to ``lgbm_fold_<fold>.pkl`` / ``et_fold_<fold>.pkl``
        in the working directory.
    """
    fold_scores = []
    features = [col for col in train.columns if 'A' in col]

    for fold in [0.0, 1.0, 2.0, 3.0, 4.0]:
        # Build the hold-out mask once and reuse it for all four slices.
        is_val = train['5_folds'] == fold
        X_train = train.loc[~is_val, features]
        y_train = train.loc[~is_val, 'target']
        X_val = train.loc[is_val, features]
        y_val = train.loc[is_val, 'target']

        model = lgb.LGBMClassifier(**params)
        # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords
        # are only accepted by LightGBM releases before 4.0 - confirm the
        # installed version before upgrading.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100, early_stopping_rounds=50)
        joblib.dump(model, f'lgbm_fold_{fold}.pkl')
        models.append(model)

        model_et = et(n_estimators=1000)
        model_et.fit(X_train, y_train)
        joblib.dump(model_et, f'et_fold_{fold}.pkl')
        models.append(model_et)

        # Blend by averaging probabilities. A hard majority vote over only two
        # models cannot settle a disagreement: the mode would always fall back
        # to the smaller label, regardless of how confident either model is.
        # Both models were fitted on the same y_train, so their probability
        # columns follow the same class order.
        proba = (model.predict_proba(X_val) + model_et.predict_proba(X_val)) / 2
        y_pred = model.classes_[np.argmax(proba, axis=1)]

        score = accuracy_score(y_val, y_pred)
        print(f"Fold {fold}: {score}")

        fold_scores.append(score)

        # Free the per-fold frames and models before the next fold is built;
        # the fitted models stay alive through the `models` list.
        del model, model_et, proba, y_pred, score, X_train, y_train, X_val, y_val, is_val
        gc.collect()

    print(f"Overall score: {np.mean(fold_scores)}")

In [None]:
# Replace the class labels in `target` with the integer codes produced by the
# encoder; `le` is kept so the codes can be mapped back to labels later.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# `run` appends every fitted model to this module-level list.
models = []
run(train)

# Prediction and submission

In [None]:
X_test = pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/test.pkl')

# Constant offsets added to every model's predicted probabilities before the
# argmax; only the columns at index 2 and 3 receive a non-zero boost.
# NOTE(review): presumably hand-tuned to favour those two classes - confirm.
class_offsets = np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0, 0, 0])

# One array of predicted class indices per fitted model.
y_test = []
for model in models:
    shifted_proba = model.predict_proba(X_test) + class_offsets
    y_test.append(np.argmax(shifted_proba, axis=1))

In [None]:
sub = pd.read_pickle('../input/tpsfeb2022-ds-to-pickle-with-folds/sub.pkl')

# Majority vote: stack the per-model predictions (one row per model) and take
# the most frequent class index for every test row, then map the indices back
# to the original labels with the fitted encoder.
votes = np.array(y_test)
most_common = stats.mode(votes, axis=0)[0]
encoded_pred = most_common.transpose().astype('int16')
sub.target = le.inverse_transform(encoded_pred)

sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
# Percentage of test rows assigned to each predicted label, ordered by label.
predicted_counts = pd.Series(sub.target).value_counts().sort_index()
predicted_counts / len(X_test) * 100