# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import time
import os

from tqdm.notebook import tqdm
import datatable as dt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier

from scipy.stats import mode

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Reproducibility / cross-validation: SEED feeds the RNG seeding, the fold
# splitter and the classifier; N_SPLITS is the number of stratified folds.
SEED = 42
N_SPLITS = 5

# Model settings: forest size and the verbosity passed to the classifier.
N_ESTIMATORS = 1000
VERBOSE = False

# Number of target classes (not referenced by the cells shown in this notebook).
N_CLASSES = 10

In [None]:
# Debug switch: when turned on, override the forest size with a much smaller
# one and enable classifier logging (presumably for a quick smoke run).
DEBUG = False
if DEBUG:
    N_ESTIMATORS, VERBOSE = 10, 1

In [None]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and writes the seed into the ``PYTHONHASHSEED`` environment variable.

    NOTE: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    environment variable only influences processes launched afterwards,
    not the hashing of the interpreter that is already running.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    seed_as_text = str(seed)

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_as_text
    np.random.seed(seed)

# Seed the global RNGs once with the notebook-wide SEED.
seed_everything(SEED)

# Datasets

In [None]:
# Read the three competition CSVs with datatable and convert each one to a
# pandas DataFrame: training data, test data and the submission template.
train, test, submission = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        "../input/tabular-playground-series-feb-2022/train.csv",
        "../input/tabular-playground-series-feb-2022/test.csv",
        "../input/tabular-playground-series-feb-2022/sample_submission.csv",
    )
)

In [None]:
# Column holding the raw label and the column that receives its integer code.
target, target_encoded = 'target', 'target_encoded'

# Model inputs: every test column whose name does not contain 'row_id'.
features = [name for name in test.columns if 'row_id' not in name]

In [None]:
# Fit the encoder on the raw labels, then store the integer codes next to
# them. `le` is kept so the submission cell can decode predictions back.
le = LabelEncoder().fit(train[target])
train[target_encoded] = le.transform(train[target])

### **Thanks to [@teckmengwong](https://www.kaggle.com/teckmengwong) for [pointing this out](https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364)**

In [None]:
# Keep only the first occurrence of each distinct feature row (the label
# columns are ignored in the comparison), then renumber the index from 0.
train = train.drop_duplicates(subset=features).reset_index(drop=True)

# ExtraTreesClassifier

In [None]:
# Out-of-fold predictions (integer label codes), one slot per training row.
# The buffer uses the label dtype so the stored codes stay integers.
oof = np.zeros(train.shape[0], dtype=train[target_encoded].dtype)
# Test-set predictions, one array per fold; combined in the submission cell.
pred = []

# These column selections are identical for every fold, so build them once
# instead of re-slicing the DataFrames inside the loop.
X_all = train[features]
y_all = train[target_encoded]
X_test = test[features]

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (trn_idx, val_idx) in enumerate(tqdm(skf.split(X=X_all, y=y_all), total=N_SPLITS)):
    X_train, y_train = X_all.iloc[trn_idx], y_all.iloc[trn_idx]
    X_valid, y_valid = X_all.iloc[val_idx], y_all.iloc[val_idx]

    start = time.time()
    # A fresh model per fold, always seeded with the same SEED.
    clf = ExtraTreesClassifier(
        n_estimators=N_ESTIMATORS,
        n_jobs=-1,
        random_state=SEED,
        verbose=VERBOSE
    )
    clf.fit(X_train, y_train)

    # Hard class predictions: validation rows go into the out-of-fold
    # buffer, test rows are collected per fold.
    oof[val_idx] = clf.predict(X_valid)
    pred.append(clf.predict(X_test))

    # Elapsed time covers fitting plus both predict calls.
    elapsed = time.time() - start
    acc = accuracy_score(y_valid, oof[val_idx])
    print(f"fold{fold}, ACCURACY: {acc:6f}, elapsed time: {elapsed:.2f}sec")

# Overall accuracy of the out-of-fold predictions across all training rows.
acc = accuracy_score(train[target_encoded], oof)
print(f"ACCURACY: {acc:6f}")

# Confusion matrix

In [None]:
# Confusion matrix of the true label codes against the out-of-fold predictions.
cm = confusion_matrix(train[target_encoded], oof)

plt.figure(figsize=(16, 8))
# fmt='d' prints each cell as a full integer count; the default annotation
# format ('.2g') keeps only two significant digits, which turns large counts
# into scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues_r')

# Classification report

In [None]:
# Per-class precision / recall / F1 of the out-of-fold predictions,
# printed with six decimal places.
cr = classification_report(
    y_true=train[target_encoded],
    y_pred=oof,
    digits=6,
)
print(cr)

# Submission

In [None]:
# Majority vote across folds: stack the per-fold test predictions so that
# each row is one fold, then take the most frequent label code per column.
votes = np.asarray(pred)

# np.ravel flattens the result whether SciPy returns the mode with a leading
# length-1 axis (older releases) or already one-dimensional (releases where
# `keepdims` defaults to False). Indexing with `.mode[0]` would select a
# single scalar in the latter case and break the decoding below.
majority = np.ravel(mode(votes, axis=0).mode).astype(votes.dtype)

# Decode label codes back to the original class names and write the file.
pred_decoded = le.inverse_transform(majority)
submission[target] = pred_decoded
submission.to_csv("submission.csv", index=False)
submission