In [None]:
%%capture

!pip install scikit-learn-intelex -q

import numpy as np
import pandas as pd
import warnings
from numba import jit

from scipy.stats import mode
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier


from sklearnex import patch_sklearn
patch_sklearn()

warnings.filterwarnings("ignore")

## Baseline, One-vs-One and One-vs-Rest - a minimalistic approach

I challenge myself and set two goals:
- use few lines of code (no more than 50, excluding the Python module imports)
- achieve a good score - above 0.975 (LB)

In [None]:
# Competition files; `row_id` becomes the index of both feature tables.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col="row_id")
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col="row_id")
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

# Keep only the first occurrence of every fully duplicated training row.
train = train.drop_duplicates(keep='first')

# Every column except the label is used as a model input.
FEATURES = [name for name in train.columns if name != 'target']

# Encode the class labels as integers; `lb` is kept to decode predictions later.
lb = LabelEncoder()
y = lb.fit_transform(train['target'])

In [None]:
# random_state only has an effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if random_state is set while shuffle is left at False.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

BASELINE - **Extremely Randomized Trees**, or Extra Trees for short, is an ensemble machine learning
algorithm. Specifically, it is an ensemble of decision trees and is related to other ensembles
of decision trees algorithms such as bootstrap aggregation (bagging) and random forest. The
Extra Trees algorithm works by creating a large number of unpruned decision trees from the
training dataset. Predictions are made by averaging the prediction of the decision trees in the
case of regression or using majority voting in the case of classification.


In [None]:
%%time

baseline = ExtraTreesClassifier(n_estimators = 500)
print('Mean Accuracy: %.5f' % (np.mean(cross_val_score(baseline, train[FEATURES].values, y, scoring = 'accuracy', cv = cv, n_jobs = -1))))

In [None]:
%%time
baseline.fit(train[FEATURES].values, y)
baseline_preds = baseline.predict(test[FEATURES].values)

MODEL 2 - **One-vs-One** (OvO for short) is a heuristic method for using binary classification algorithms for multiclass classification. One-vs-One splits a multiclass classification dataset into binary classification problems, one for each pair of classes.

In [None]:
%%time

model = ExtraTreesClassifier(n_estimators = 300)
ovo = OneVsOneClassifier(model)

print('Mean Accuracy: %.5f' % (np.mean(cross_val_score(ovo, train[FEATURES].values, y, scoring = 'accuracy', cv = cv, n_jobs = -1))))

In [None]:
%%time
ovo.fit(train[FEATURES].values, y)
ovo_preds = ovo.predict(test[FEATURES].values)

MODEL 3 - **One-vs-Rest** (OvR for short, also referred to as One-vs-All or OvA) is a heuristic method
for using binary classification algorithms for multiclass classification. It involves splitting the
multiclass dataset into multiple binary classification problems. A binary classifier is then trained
on each binary classification problem and predictions are made using the model that is the most
confident.

In [None]:
%%time

model = ExtraTreesClassifier(n_estimators = 300)
ovr = OneVsRestClassifier(model)

print('Mean Accuracy: %.5f' % (np.mean(cross_val_score(ovr, train[FEATURES].values, y, scoring = 'accuracy', cv = cv, n_jobs = -1))))

In [None]:
%%time
ovr.fit(train[FEATURES].values, y)
ovr_preds = ovr.predict(test[FEATURES].values)

## BLEND MODELS

We use the hard voting method - it involves counting the predictions for each class label and predicting the class label with the most votes.

In [None]:
# One row per test observation, one column per model (encoded class labels).
stacked_preds = np.stack([baseline_preds, ovr_preds, ovo_preds], axis=1)
predictions = pd.DataFrame(stacked_preds, columns=['baseline', 'ovr', 'ovo'])

# A row counts as a disagreement when any model differs from the first column.
# Comparing the whole array at once avoids a Python-level call for every row.
diff = int((stacked_preds != stacked_preds[:, [0]]).any(axis=1).sum())
print(f'Model see the same classes in : {len(test) - diff} observations and do not agree with {diff} observations')

In [None]:
@jit(nopython=True)
def np_blending(row):
    """Return the most frequent label in one row of encoded predictions.

    `row` must be a 1-D array of non-negative integers, as required by
    np.bincount. When several labels share the highest count (for example
    when all models disagree), the smallest of them is returned, because
    np.argmax reports the first maximum.
    """
    return np.argmax(np.bincount(row))

# Vote over the three model columns only, so that re-running this cell never
# counts an already written 'final_pred' column as an extra vote. Iterating over
# the rows of a NumPy array also avoids building a pandas Series for every row.
model_votes = np.ascontiguousarray(predictions[['baseline', 'ovr', 'ovo']].to_numpy())
predictions['final_pred'] = [np_blending(votes) for votes in model_votes]

## PREDICT AND SUBMIT

For comparison, I save every model's predictions so that the local CV score can be compared with the public LB score.

In [None]:
# Output file -> encoded predictions to store in it.
# The baseline gets its own file: writing it to "ovr-submission.csv" would let
# the One-vs-Rest submission overwrite it.
# The blend is kept last so that `sub.target` holds the blended labels once the
# loop has finished.
submissions = {
    "baseline-submission.csv": baseline_preds,
    "ovo-submission.csv": ovo_preds,
    "ovr-submission.csv": ovr_preds,
    "base-ovo-ovr-submission.csv": predictions.final_pred.values,
}

for filename, encoded_preds in submissions.items():
    # Decode the integer labels back to the original class names before saving.
    sub["target"] = lb.inverse_transform(encoded_preds)
    sub.to_csv(filename, index=False)

In [None]:
# Share (in percent of the test rows) of each class in the submission currently held in `sub`.
class_counts = sub["target"].value_counts().sort_index()
class_counts / len(test) * 100