In [None]:
# Setup cell: installs the Intel extension for scikit-learn (version is not
# pinned), imports every dependency and configures warnings.
!pip install scikit-learn-intelex

import os
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import mode
from tqdm import tqdm
from pathlib import Path

# patch_sklearn() is called *before* the sklearn imports below.
# NOTE(review): the sklearnex documentation asks for this ordering so that the
# imported estimators are the patched ones - keep these two lines above them.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Silences *all* warnings for the rest of the session, including sklearn /
# pandas deprecation and data-conversion warnings.
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test tables, both indexed by their `row_id` column.
data_dir = Path('../input/tabular-playground-series-feb-2022')
train_path = data_dir / 'train.csv'
test_path = data_dir / 'test.csv'

df_train = pd.read_csv(train_path, index_col='row_id')
df_test = pd.read_csv(test_path, index_col='row_id')

# The label column is taken to be the first column name present in train but
# absent from test; every other train column is a feature.
train_only_columns = df_train.columns.difference(df_test.columns)
TARGET = train_only_columns[0]
features = df_train.columns.drop(TARGET)

In [None]:
# Collapse identical training rows (features + target) into one row each.
# `vc` is a Series indexed by the unique rows, holding how often each occurs.
vc = df_train.value_counts()

# Turn the row index back into ordinary columns and store the multiplicity as
# `sample_weight`. reset_index does this in one vectorised step, without
# unpacking every index tuple in a Python loop.
dedup_train = vc.reset_index(name='sample_weight')
dedup_train

In [None]:
# Sanity check: count the original training rows whose feature values are all
# equal to those of the first deduplicated row.
first_dedup_features = dedup_train[features].iloc[0].values.reshape(1, -1)
rows_matching_first = (df_train[features].values == first_dedup_features).all(axis=1)
rows_matching_first.sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers; `le` is kept so that predictions can be
# mapped back to the original labels with le.inverse_transform later on.
le = LabelEncoder()

# Model inputs built from the deduplicated training frame.
X = dedup_train[features]

# Keep the target 1-D (a Series rather than a one-column DataFrame): sklearn
# estimators expect a 1-D `y` and otherwise emit a DataConversionWarning,
# which the global warnings filter would hide.
y = pd.Series(le.fit_transform(dedup_train[TARGET]), name=TARGET)

# Number of original rows represented by each deduplicated row.
sample_weight = dedup_train['sample_weight']

In [None]:
%%time

N_SPLITS =10
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True)
y_pred_list, y_proba_list, scores = [], [], []

for fold, (train_idx, valid_idx) in enumerate(tqdm(folds.split(X, y), total=N_SPLITS)):
    print('FOLD: ', fold)
    
    X_train, y_train, sample_weight_train = X.iloc[train_idx], y.iloc[train_idx], sample_weight.iloc[train_idx]
    X_valid, y_valid, sample_weight_valid = X.iloc[valid_idx], y.iloc[valid_idx], sample_weight.iloc[valid_idx]
    
    model = ExtraTreesClassifier(
        n_estimators=300,
        n_jobs=-1,
        verbose=0,
        random_state=1
    )
    
    model.fit(X_train, y_train, sample_weight_train)
    
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight_valid)
    print(f'ACCURACY SCORE: {valid_score:5f}\n')
    scores.append(valid_score)
    
    y_pred_list.append(model.predict(df_test))
    y_proba_list.append(model.predict_proba(df_test))

score = np.array(scores).mean()
print(f'MEAN ACCURACY SCORE: {score:6F}')

In [None]:
# Majority vote across folds: for each test row take the most frequent encoded
# class among the per-fold predictions (rows of the stacked array = folds).
# Depending on the SciPy version `.mode` has shape (1, n_rows) or (n_rows,);
# np.ravel yields a 1-D vector in both cases, whereas indexing with [0] would
# return a single scalar on versions that drop the reduced axis.
y_pred = np.ravel(mode(np.asarray(y_pred_list), axis=0).mode)

# Map the encoded classes back to the original target labels.
y_pred = le.inverse_transform(y_pred)

In [None]:
# Compare the class balance of the training labels with the class balance of
# the majority-vote test predictions (counts and percentage shares).
# The label column is addressed through TARGET everywhere rather than through
# a hard-coded attribute name.
train_counts = df_train[TARGET].value_counts()

target_distrib = pd.DataFrame({
    'count': train_counts,
    'share': train_counts / df_train.shape[0] * 100
})

# Aligned on the class label; a class that is never predicted shows up as NaN.
target_distrib['pred_count'] = pd.Series(y_pred, index=df_test.index).value_counts()
target_distrib['pred_share'] = target_distrib['pred_count'] / len(df_test) * 100
target_distrib.sort_index()

In [None]:
# Mean of the per-fold class-probability matrices.
y_proba = sum(y_proba_list) / len(y_proba_list)

# Constant offsets added to the probability columns at positions 2 and 3
# before taking the argmax; all other columns are left untouched.
# NOTE(review): the values look hand-tuned and the vector assumes exactly 10
# probability columns - confirm both.
class_offsets = np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
y_proba = y_proba + class_offsets

# Most probable (offset-adjusted) class per test row, mapped back to labels.
tuned_class_idx = np.argmax(y_proba, axis=1)
y_pred_tuned = le.inverse_transform(tuned_class_idx)

# Percentage share of each predicted label in the test set.
tuned_label_counts = pd.Series(y_pred_tuned, index=df_test.index).value_counts().sort_index()
tuned_label_counts / len(df_test) * 100

In [None]:
# Fill the sample submission's target column with the tuned predictions and
# write it out without the pandas index.
# NOTE(review): assumes the sample submission rows are in the same order as
# df_test - confirm.
sample_submission_path = data_dir / 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path).assign(**{TARGET: y_pred_tuned})
submission.to_csv('submission_against_mutants_01.csv', index=False)