In this notebook I show some basic EDA and a comparison of all sklearn models plus the XGBoost, LightGBM and CatBoost models.
Based upon this comparison, I will use the best model to make predictions on the test data.

In [None]:
import sklearn
# Display the installed scikit-learn version so the run can be reproduced
sklearn.__version__

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

In [None]:
## Remove previous result files, if they exist

# import shutil
# _path = os.path.join(os.getcwd(), "results")
# if os.path.exists(_path):
#     shutil.rmtree(_path)
#     print("results removed")

In [None]:
from pathlib import Path

# Directory holding the competition CSV files
data_dir = Path('../input/tabular-playground-series-feb-2022')

# Both files use their `row_id` column as the index
train_path, test_path = (
    os.path.join(data_dir, file_name) for file_name in ('train.csv', 'test.csv')
)
train_df = pd.read_csv(train_path, index_col="row_id")
test_df = pd.read_csv(test_path, index_col="row_id")

# Names of all training columns other than the identifier and the label
elements = [col for col in train_df.columns if col not in ('row_id', 'target')]

# Convert the 10 bacteria names to the integers 0 .. 9, stored in `target_num`,
# and discard the original string column
le = LabelEncoder()
train_df['target_num'] = le.fit_transform(train_df['target'])
train_df = train_df.drop(columns='target')

train_df.shape, test_df.shape

In [None]:
# Preview the first rows of the test set
test_df.head()

In [None]:
# List the training columns (features plus the encoded `target_num`)
train_df.columns

In [None]:
try:
    import ai4water
except (ImportError, ModuleNotFoundError):
    !python -m pip install git+https://github.com/AtrCheema/AI4Water.git@dev

# Exploratory data analysis

In [None]:
from ai4water.eda import EDA

In [None]:
# Wrap the training frame in ai4water's EDA helper used by the plots below
eda_train = EDA(train_df)

In [None]:
# Plot the training data as subplots
# (presumably at most 20 columns per figure -- TODO confirm against ai4water docs)
eda_train.plot_data(
    figsize=(10, 14),
    subplots=True,
    max_cols_in_plot=20,
)

In [None]:
# Visualise missing values in the training data
eda_train.plot_missing(figsize=(10, 14))

In [None]:
!pip install seaborn

In [None]:
# Box plots with the library's default normalisation setting
# (max_features presumably caps the number of plotted columns -- TODO confirm)
eda_train.box_plot(
    figsize=(18, 8),
    max_features=50,
    palette="Set3",
)

In [None]:
# Same box plots with normalisation switched off
eda_train.box_plot(
    figsize=(18, 8),
    max_features=50,
    normalize=False,
    palette="Set3",
)

In [None]:
# Same plots with `violen=True` (presumably draws violin plots -- TODO confirm)
eda_train.box_plot(
    figsize=(18, 8),
    max_features=50,
    violen=True,
    palette="Set3",
)

In [None]:
# Histograms of the training columns; the return value is assigned to `_`
# so it is not echoed as cell output
_ = eda_train.plot_histograms(max_subplots=20)

In [None]:
# Correlation plot using the library's default method,
# with both axes labelled by every column name
column_labels = eda_train.data.columns.tolist()
eda_train.correlation(
    figsize=(24, 24),
    xticklabels=column_labels,
    yticklabels=column_labels,
)

In [None]:
# Spearman correlation plot, with both axes labelled by every column name
column_labels = eda_train.data.columns.tolist()
eda_train.correlation(
    method="spearman",
    figsize=(24, 24),
    xticklabels=column_labels,
    yticklabels=column_labels,
)

In [None]:
from easy_mpl import pie

In [None]:
# Pie chart of the encoded target values
pie(train_df['target_num'].values)

# Experiments

The `MLClassificationExperiments` class can be used to compare classification models.

In [None]:
from ai4water.experiments import MLClassificationExperiments

Customize the default parameters of CatBoost, because training with its default parameters takes a very long time.

In [None]:
class MyExperiments(MLClassificationExperiments):
    """Classification experiments with a lighter default CatBoost configuration."""

    def model_CatBoostClassifier(self, **suggestions):
        """Build the model configuration for ``CatBoostClassifier``.

        Parameters
        ----------
        **suggestions
            Hyper-parameter values to pass to the classifier. When none are
            given, ``{'iterations': 50}`` is used instead.

        Returns
        -------
        dict
            ``{'model': {'CatBoostClassifier': suggestions}}``
        """
        # CatBoostClassifier is provided by the `catboost` package, not `xgboost`
        self.path = "catboost.CatBoostClassifier"
        # presumably `classification_space` is supplied by the parent class -- TODO confirm
        self.param_space = self.classification_space["CatBoostClassifier"]["param_space"]
        # NOTE(review): presumably initial values in the same order as `param_space` -- confirm
        self.x0 = [50, 0.01, 5, 3.0, 0.5, 0.5, 32, 'GreedyLogSum']

        # Fall back to a small number of iterations when the caller gives no parameters
        if not suggestions:
            suggestions = {'iterations': 50}

        return {'model': {'CatBoostClassifier': suggestions}}

In [None]:
# Create the experiment object using the customised CatBoost configuration
experiment = MyExperiments()

In [None]:
# Optional: a small subset of the data for quick trial runs
# sample_data = train_df[['A0T0G0C10', 'A0T0G1C9', 'A0T0G2C8', 'A0T0G3C7', 'A0T0G4C6', 'target_num']]
# sample_data = sample_data.iloc[0:100]

# Models left out of the comparison, with the reason for each
models_to_skip = [
    'LabelSpreading',  # stops the kernel
    'LabelPropagation',  # stops the kernel
    'NuSVC',  # taking very long
    'LinearSVC',  # taking very long
    'SVC',  # taking very long
    'KNeighborsClassifier',  # takes very long time
    # NOTE(review): the only entry with a `model_` prefix -- confirm both forms are accepted
    'model_LogisticRegression',
]

experiment.fit(
    data=train_df,
    run_type='dry_run',
    exclude=models_to_skip,
)

Comparison of models based upon their accuracy

In [None]:
# Compare the fitted models using the "accuracy" metric
errors = experiment.compare_errors("accuracy")

In [None]:
# Display the comparison results
errors

Another way to compare the models is with a Taylor plot.

In [None]:
# Legend styling: white box with a black border, anchored at (1.4, 0.7)
legend_options = {
    'facecolor': 'white',
    'edgecolor': 'black',
    'bbox_to_anchor': (1.4, 0.7),
}
experiment.taylor_plot(figsize=(6, 12), leg_kws=legend_options)

Find the best model

In [None]:
# NOTE(review): assumes `errors` is a dict whose first value holds the best
# model's name at index 1 -- confirm against the output of compare_errors
best_model = list(errors.values())[0][1]
best_model

Find the path where the best model is saved.

In [None]:
# Folder of the best model inside the experiment's output directory
best_model_dir = os.path.join(experiment.exp_path, best_model)
# Use the first entry returned by os.listdir
# NOTE(review): listing order is arbitrary; presumably there is a single run folder -- confirm
first_entry = os.listdir(best_model_dir)[0]
best_model_path = os.path.join(best_model_dir, first_entry)
best_model_path

Load the best model and update its weights

In [None]:
from ai4water import Model 

# Rebuild the model from the config.json file found in the best model's folder
c_path = os.path.join(best_model_path, "config.json")
model = Model.from_config_file(c_path)

# The weights file is looked up at "<best_model_path>/weights/<best_model>"
w_file = os.path.join(best_model_path, "weights", best_model)
model.update_weights(w_file)

Make predictions using the best model.

In [None]:
# Predict on the test features, passed as a plain array
test_pred = model.predict(x=test_df.values, 
                          metrics='all')

# Cast the predictions to int64
# (presumably integer class codes matching the label encoder -- TODO confirm)
test_pred = test_pred.astype('int64')
test_pred.shape

In [None]:
# Map the integer predictions back to the original target labels
test_pred_dec = le.inverse_transform(test_pred)

In [None]:
# One `target` column of decoded labels, indexed by the test set's index
submission = pd.DataFrame(
    data=test_pred_dec,
    columns=['target'],
    index=test_df.index,
)
submission

In [None]:
# Write the submission file; the index is written as the first column
submission.to_csv('submission.csv')