In this notebook, I compare several scikit-learn models, along with XGBoost, LightGBM and CatBoost models, using cross-validation. Based on this comparison, I use the best-performing model to make predictions on the test data.

In [None]:
# Show the installed scikit-learn version (the cell's last expression is displayed).
import sklearn
sklearn.__version__

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

In [None]:
from pathlib import Path

# Kaggle input directory that holds the competition CSV files.
data_dir = Path('../input/tabular-playground-series-feb-2022')

# Build the file paths with pathlib's `/` operator instead of mixing
# os.path.join with a Path object; pandas accepts Path objects directly.
train_path = data_dir / 'train.csv'
test_path = data_dir / 'test.csv'

# `row_id` is used as the index, so it is not among the columns of either frame.
train_df = pd.read_csv(train_path, index_col="row_id")
test_df = pd.read_csv(test_path, index_col="row_id")

# Feature column names: every training column except the label. This is
# computed before `target_num` is added below, so that column is not included.
elements = [col for col in train_df.columns if col != 'target']

# Convert the 10 bacteria names to the integers 0 .. 9.
# `le` is kept so predictions can be mapped back with le.inverse_transform.
le = LabelEncoder()
train_df['target_num'] = le.fit_transform(train_df['target'])

# Drop the string label without mutating in place; `target_num`, which was
# appended last, remains the final column of the frame.
train_df = train_df.drop(columns='target')

In [None]:
!pip install ai4water=1.0b4

# Experiments

The `MLClassificationExperiments` class can be used to compare classification models.

In [None]:
from ai4water.experiments import MLClassificationExperiments

In [None]:
class MyExperiments(MLClassificationExperiments):
    """Classification experiments with a customised CatBoost configuration.

    Overrides ``model_CatBoostClassifier`` so that the classifier falls back
    to 50 iterations when no hyper-parameters are supplied.
    """

    def model_CatBoostClassifier(self, **suggestions):
        """Return the model configuration for ``CatBoostClassifier``.

        Parameters
        ----------
        **suggestions
            Hyper-parameters for the classifier. When none are given,
            ``{'iterations': 50}`` is used instead.

        Returns
        -------
        dict
            ``{'model': {'CatBoostClassifier': suggestions}}``.
        """
        # CatBoostClassifier lives in the `catboost` package, not in `xgboost`.
        self.path = "catboost.CatBoostClassifier"
        self.param_space = self.classification_space["CatBoostClassifier"]["param_space"]
        # NOTE(review): presumably the starting point for the parameter search,
        # one value per dimension of `param_space` in the same order - confirm.
        self.x0 = [50, 0.01, 5, 3.0, 0.5, 0.5, 32, 'GreedyLogSum']

        if not suggestions:
            suggestions = {'iterations': 50}

        return {'model': {'CatBoostClassifier': suggestions}}

In [None]:
# Configure the experiment with a KFold cross-validator that uses 5 splits.
experiment = MyExperiments(
    cross_validator={'KFold': {'n_splits': 5}},
)

In [None]:

# Fit only the classifiers named in `include`, with cross-validation enabled.
# NOTE(review): run_type='dry_run' presumably skips hyper-parameter
# optimisation - confirm against the ai4water documentation.
experiment.fit(
    data=train_df,
    run_type='dry_run',
    cross_validate=True,
    include=[
        'BaggingClassifier',
        'CatBoostClassifier',
        'DecisionTreeClassifier',
        'ExtraTreeClassifier',
        'ExtraTreesClassifier',
        'HistGradientBoostingClassifier',
        'LGBMClassifier',
    ],
)


In [None]:
# Call compare_errors for the "accuracy" metric and keep the returned value.
errors = experiment.compare_errors("accuracy")

In [None]:
# Display the value returned by compare_errors in the previous cell.
errors

In [None]:
# Plot the cross-validation scores held by the experiment.
experiment.plot_cv_scores()

In [None]:
# Grab the experiment's cross-validation scores (iterated as a mapping in the
# next cell) and display them.
cv_dict = experiment.cv_scores_
cv_dict

In [None]:
import numpy as np

# Print the mean of the cross-validation scores stored under each key
# (keys are presumably model names - verify against cv_scores_).
for model_name, fold_scores in cv_dict.items():
    print(model_name, np.mean(fold_scores))

Build and train the best-performing model on the full training data.

In [None]:
from ai4water import Model

# Use all of the data for training: train_fraction is 1.0 and no validation
# split is held out.
model = Model(
    model='ExtraTreesClassifier',
    train_fraction=1.0,
    val_fraction=0.0,
)

model.fit(data=train_df)

In [None]:
# Predict on the raw test feature array, then cast the predictions to int64.
test_pred = model.predict(
    x=test_df.values,
    metrics='all',
).astype('int64')

In [None]:
# Map the integer predictions back to the original string labels using the
# encoder that was fitted on the training targets.
test_pred_dec = le.inverse_transform(test_pred)

In [None]:
# One `target` column of decoded labels, indexed like the test set (row_id).
submission = pd.DataFrame(
    test_pred_dec,
    index=test_df.index,
    columns=['target'],
)
submission

In [None]:
# Write the submission file; the index (row_id) is written as the first column.
submission.to_csv('submission.csv')