In this notebook, I compare several scikit-learn models, plus XGBoost, LightGBM (LGBM), and CatBoost models, using cross-validation. Based on this comparison, I use the best-performing model to make predictions on the test data.

In [None]:
import sklearn
# Display the installed scikit-learn version so the run environment is visible in the notebook.
sklearn.__version__

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

In [None]:
from pathlib import Path

# Directory holding the competition CSV files.
data_dir = Path('../input/tabular-playground-series-feb-2022')

# Build both file paths first, then read each split with row_id as the index.
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')
train_df = pd.read_csv(train_path, index_col="row_id")
test_df = pd.read_csv(test_path, index_col="row_id")

# Feature column names: every training column that is not an id or the label.
elements = [col for col in train_df.columns if col not in ('row_id', 'target')]

# Convert the 10 bacteria names to the integers 0 .. 9, keeping only the
# encoded column (target_num) in the training frame.
le = LabelEncoder()
train_df['target_num'] = le.fit_transform(train_df['target'])
train_df = train_df.drop(columns='target')

In [None]:
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=100,random_state=1).fit(train_df[0:-1])
pca_transformed = pca.transform(train_df[0:-1])
pca_df = pd.DataFrame(pca_transformed)
pca_df['target_num'] = train_df['target_num']

In [None]:
pca_df.head()

In [None]:
!pip install ai4water==1.0b4

Building and training the best performing model on train data.

In [None]:
from ai4water import Model

model = Model(model={'ExtraTreesClassifier':{
                                            'class_weight': 'balanced',
                                            'n_estimators': 500,
                                            'random_state': 92
                                            }},
             train_fraction=1.0,
            val_fraction=0.0)

model.fit(data=pca_df)

In [None]:
t,p = model.predict(return_true=True)

In [None]:
from SeqMetrics import ClassificationMetrics
metrics = ClassificationMetrics(t, p)
accuracy = metrics.accuracy()

In [None]:
accuracy

In [None]:
pca_test = PCA(n_components=100,random_state=1).fit(test_df)
pca_test_transformed = pca_test.transform(test_df)
pca_test_df = pd.DataFrame(pca_test_transformed)

In [None]:
# Predict on the PCA-projected test rows and cast the result to int64 in one step.
test_pred = model.predict(
    x=pca_test_df.values,
    metrics='all',
).astype('int64')

In [None]:
# Map the integer predictions back to the original target labels using the
# LabelEncoder that was fitted on the training targets.
test_pred_dec = le.inverse_transform(test_pred)

In [None]:
# Single 'target' column of decoded labels, indexed exactly like the test set.
submission = pd.DataFrame({'target': test_pred_dec}, index=test_df.index)
submission

In [None]:
# Write the predictions to submission.csv; the index (taken from test_df) is written
# alongside the 'target' column.
submission.to_csv('submission.csv')