In [None]:
# Install the scikit-learn-intelex package into the running environment (IPython shell escape).
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
# Apply the sklearnex patch before any `sklearn` module is imported further down.
# NOTE(review): presumably the patch must precede the sklearn imports to take effect — confirm against the sklearnex docs.
patch_sklearn()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [None]:
from warnings import simplefilter
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings emitted by the
# estimators used later — consider narrowing the filter when debugging.
simplefilter("ignore")

from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# Read data

In [None]:
# Load the training table; `row_id` becomes the index so it is not treated as a feature.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col="row_id")

# Work on a fixed-size random subset of rows to keep the comparison fast.
# The draw is seeded so that a fresh "Restart & Run All" sees the same rows.
sample_size = 5000
sample_seed = 7
sample = train.sample(n=sample_size, axis=0, random_state=sample_seed)

# Features: every column except the label, cast to float32.
X = sample.drop("target", axis=1).astype(np.float32)

# Labels: encode the target values as integers. Reusing the sample's index keeps
# X and Y aligned by label as well as by position.
target_encoder = LabelEncoder()
Y = pd.Series(target_encoder.fit_transform(sample["target"]), index=sample.index)

# Split train / test

In [None]:
# Evaluation settings.
seed = 7               # random_state passed to the train/test split
test_size = 0.2        # fraction of rows held out as the test split
num_folds = 5          # number of cross-validation folds
n_estimators = 300     # NOTE(review): not referenced by the classifier definitions visible in this notebook
scoring = 'accuracy'   # metric name handed to cross_val_score

# Hold out `test_size` of the sampled rows; the remainder is the training split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Classifiers

In [None]:
# Candidate models for the comparison. Each record carries:
#   name        - short label (plot axis label and pipeline step name)
#   model       - the estimator instance
#   description - the estimator's class name, used in the printed report
#
# Estimators that accept `random_state` are seeded with `seed` so repeated runs
# of the notebook produce the same scores.
classifiers = [
    {'name': 'LDA',  'model': LinearDiscriminantAnalysis(),
     'description': 'LinearDiscriminantAnalysis'},
    {'name': 'KNN',  'model': KNeighborsClassifier(),
     'description': 'KNeighborsClassifier'},
    {'name': 'CART', 'model': DecisionTreeClassifier(random_state=seed),
     'description': 'DecisionTreeClassifier'},
    {'name': 'NB',   'model': GaussianNB(),
     'description': 'GaussianNB'},
    {'name': 'LSVC', 'model': LinearSVC(random_state=seed),
     'description': 'LinearSVC'},
    {'name': 'SVC',  'model': SVC(),
     'description': 'SVC'},
    {'name': 'MLP',  'model': MLPClassifier(random_state=seed),
     'description': 'MLPClassifier'},
    {'name': 'BG',   'model': BaggingClassifier(n_estimators=100, random_state=seed),
     'description': 'BaggingClassifier'},
    {'name': 'RF',   'model': RandomForestClassifier(n_estimators=100, random_state=seed),
     'description': 'RandomForestClassifier'},
    {'name': 'ET',   'model': ExtraTreesClassifier(n_estimators=100, random_state=seed),
     'description': 'ExtraTreesClassifier'},
    {'name': 'AB',   'model': AdaBoostClassifier(algorithm='SAMME', n_estimators=100, random_state=seed),
     'description': 'AdaBoostClassifier'},
    {'name': 'GB',   'model': GradientBoostingClassifier(n_estimators=100, random_state=seed),
     'description': 'GradientBoostingClassifier'},
    # 'mlogloss' is an XGBoost evaluation metric, not an objective; the
    # multiclass objective that yields class probabilities is 'multi:softprob'.
    {'name': 'XGB',  'model': XGBClassifier(n_estimators=100, objective='multi:softprob', random_state=seed),
     'description': 'XGBClassifier'},
    # verbose=0 turns off CatBoost's per-iteration training log
    # (verbose=None just falls back to the library default).
    {'name': 'CAT',  'model': CatBoostClassifier(n_estimators=100, objective='MultiClass', verbose=0, random_state=seed),
     'description': 'CatBoostClassifier'},
    # LightGBM documents its multiclass objective in lowercase.
    {'name': 'LGBM', 'model': LGBMClassifier(n_estimators=100, objective='multiclass', random_state=seed),
     'description': 'LGBMClassifier'},
]

# Score

In [None]:
# Evaluate every classifier on the unscaled features: k-fold cross-validation
# on the training split, then a single fit on the whole training split scored
# against the held-out test split.
scores = []

# KFold only accepts a random_state when shuffle=True (recent scikit-learn
# raises ValueError otherwise). With an integer seed the folds are identical
# for every classifier, so one splitter is shared by the whole loop.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

for clf in classifiers:
    model = clf['model']
    cv_scores = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring, verbose=0)

    # One long-format row per fold; these rows feed the box plot.
    scores.extend({'model': clf['name'], 'cv_score': cv_score} for cv_score in cv_scores)

    clf['cv_scores'] = cv_scores
    clf['cv_mean'] = cv_scores.mean()
    clf['cv_std'] = cv_scores.std()

    # Fit on the whole training split and score once on the hold-out split.
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    clf['test_score'] = score
    print(clf['description'].rjust(30), f'train = {cv_scores.mean():.3f} ({cv_scores.std():.3f}), test = {score:.3f}\n')

In [None]:
# Tabulate the classifier records (one row per classifier, one column per
# dictionary key) and show the table as the cell's output.
df_classifiers = pd.DataFrame(data=classifiers)
df_classifiers

# Results

In [None]:
# Plot the unscaled results. Each figure gets a title so it can be told apart
# from the otherwise identically labelled figures of the scaled run.
df_scores = pd.DataFrame(scores)

# Spread of the per-fold cross-validation scores for each classifier.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df_scores, x='model', y='cv_score', ax=ax)
ax.set_title(f'Cross-validation {scoring} per fold (unscaled features)')
plt.show()

# Mean cross-validation score for each classifier.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df_classifiers, x='name', y='cv_mean', ax=ax)
ax.set_title(f'Mean cross-validation {scoring} (unscaled features)')
plt.show()

# Score on the held-out test split for each classifier.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df_classifiers, x='name', y='test_score', ax=ax)
ax.set_title('Test-split score (unscaled features)')
plt.show()

# Scaling
* Let's compare the same classifiers again, this time with the features standardized by `StandardScaler` inside a pipeline.

In [None]:
# Wrap each classifier in a two-step pipeline: standardize the features, then classify.
for clf in classifiers:
    # clone() gives the pipeline its own unfitted copy with the same parameters.
    # Without it the pipeline would share the instance stored in clf['model'],
    # and fitting the pipeline would silently refit that estimator on scaled data.
    clf['pipeline'] = Pipeline([('Scaler', StandardScaler()), (clf['name'], clone(clf['model']))])

In [None]:
# Evaluate every scaler+classifier pipeline: k-fold cross-validation on the
# training split, then a single fit on the whole training split scored
# against the held-out test split.
sc_scores = []

# KFold only accepts a random_state when shuffle=True (recent scikit-learn
# raises ValueError otherwise). With an integer seed the folds are identical
# for every pipeline, so one splitter is shared by the whole loop.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

for clf in classifiers:
    model = clf['pipeline']
    sc_cv_scores = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring, verbose=0)

    # One long-format row per fold; these rows feed the box plot.
    sc_scores.extend({'model': clf['name'], 'cv_score': sc_cv_score} for sc_cv_score in sc_cv_scores)

    clf['sc_cv_scores'] = sc_cv_scores
    clf['sc_cv_mean'] = sc_cv_scores.mean()
    clf['sc_cv_std'] = sc_cv_scores.std()

    # Fit on the whole training split and score once on the hold-out split.
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    clf['sc_test_score'] = score
    print(clf['description'].rjust(30), f'train = {sc_cv_scores.mean():.3f} ({sc_cv_scores.std():.3f}), test = {score:.3f}\n')

In [None]:
# Rebuild the results table from the classifier records so it picks up the
# keys added since it was last built, then show it as the cell's output.
df_classifiers = pd.DataFrame(data=classifiers)
df_classifiers

In [None]:
# Plot the scaled results. Each figure gets a title so it can be told apart
# from the otherwise identically labelled figures of the unscaled run.
df_scores = pd.DataFrame(sc_scores)

# Spread of the per-fold cross-validation scores for each pipeline.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df_scores, x='model', y='cv_score', ax=ax)
ax.set_title(f'Cross-validation {scoring} per fold (standardized features)')
plt.show()

# Mean cross-validation score for each pipeline.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df_classifiers, x='name', y='sc_cv_mean', ax=ax)
ax.set_title(f'Mean cross-validation {scoring} (standardized features)')
plt.show()

# Score on the held-out test split for each pipeline.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df_classifiers, x='name', y='sc_test_score', ax=ax)
ax.set_title('Test-split score (standardized features)')
plt.show()