In [1]:
%pip install helpers

Collecting helpers
  Downloading helpers-0.2.0-py3-none-any.whl (2.3 kB)
Installing collected packages: helpers
Successfully installed helpers-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install scikit-learn-intelex

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2024.0.0-py311-none-manylinux1_x86_64.whl.metadata (12 kB)
Collecting daal4py==2024.0.0 (from scikit-learn-intelex)
  Downloading daal4py-2024.0.0-py311-none-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting daal==2024.0.0 (from daal4py==2024.0.0->scikit-learn-intelex)
  Downloading daal-2024.0.0-py2.py3-none-manylinux1_x86_64.whl.metadata (1.1 kB)
Collecting tbb==2021.* (from daal==2024.0.0->daal4py==2024.0.0->scikit-learn-intelex)
  Downloading tbb-2021.11.0-py2.py3-none-manylinux1_x86_64.whl.metadata (1.0 kB)
Downloading scikit_learn_intelex-2024.0.0-py311-none-manylinux1_x86_64.whl (136 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.2/136.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading daal4py-2024.0.0-py311-none-manylinux1_x86_64.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m0

In [1]:
# import general modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.svm import OneClassSVM
import xgboost as xgb
from helpers import *
import pandas as pd
import numpy as np

# import specialised modules
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
new_features = False

if new_features:
    # prepare the X data for analysis
    X_ = pd.read_csv('X_train.csv', engine='c')
    X_.drop(columns='id', inplace=True)
    col_names = X_.index
    # transform the data
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_processed = pd.DataFrame(scaler.fit_transform(X_.transpose()).transpose())
    # convert the frame to np arrays and remove the nans
    X_processed = [item[~np.isnan(item)] for item in X_processed.to_numpy()]
    #extracting noise features
    noise_df = extract_noise_features(X_processed)
    X_pp = preprocess(X_processed)
    #extract features based on temporal and frequental things
    df = extract_features(X_pp)
    df = pd.concat([df, noise_df], axis=1)
    df.to_csv('train_features.csv')
    X_ = pd.read_csv('X_test.csv', engine='c')
    X_.drop(columns='id', inplace=True)
    col_names = X_.index
    # transform the data
    scaler = MinMaxScaler(feature_range=(-1,1))
    X_processed = pd.DataFrame(scaler.fit_transform(X_.transpose()).transpose())
    # convert the frame to np arrays and remove the nans
    X_processed = [item[~np.isnan(item)] for item in X_processed.to_numpy()]
    #extracting noise features
    noise_df = extract_noise_features(X_processed)
    X_pp = preprocess(X_processed)
    #extract features based on temporal and frequental things
    df = extract_features(X_pp)
    df = pd.concat([df, noise_df], axis=1)
    df.to_csv('test_features.csv')

In [3]:
#print general description
df = pd.read_csv('train_features.csv').drop(['Unnamed: 0'],axis=1)
df.describe(include='all')

FileNotFoundError: [Errno 2] No such file or directory: 'train_features.csv'

In [289]:
# replace inf with nan
df.replace([np.inf, -np.inf], np.NaN, inplace=True)
imp = KNNImputer(n_neighbors=4, weights='distance')
# imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
df_X = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

In [290]:
y = pd.read_csv('y_train.csv')
y.drop(columns='id', inplace=True)

Remove outliers

In [291]:
samples_before = df_X.shape[0]

# trans = ExperimentalTransformer(OneClassSVM(nu=0.995))
model = IsolationForest(contamination="auto")
outl_pred = model.fit_predict(df_X)
mask = outl_pred != -1

X_selection, y = df_X[mask], y[mask]

samples_after = X_selection.shape[0]

print(f'Data size reduced from {samples_before} to {samples_after}')
df_X = X_selection

data_train_backup = df_X.copy()

Data size reduced from 5117 to 4726


Find most important features with a random forest, since different libraries are used and they showed very inconsistant performance.
The 40 best features are used.

In [292]:
# feature importance (optional)
from sklearn.feature_selection import RFE

rfe = RFE(estimator=RandomForestClassifier(n_estimators=200, n_jobs=-1), n_features_to_select=40, step=0.05)
rfe.fit(df_X, y.values.ravel())
df_X = rfe.transform(df_X)

In [293]:
# X_train, X_test, y_train, y_test = train_test_split(X_selection, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df_X, y, random_state=42)

## Model

In [294]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import warnings
warnings.filterwarnings('ignore')

space = {'max_depth': hp.quniform("max_depth", 3, 12, 1),
         'gamma': hp.uniform('gamma', 1, 9),
         'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
         'reg_lambda': hp.uniform('reg_lambda', 0, 1),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
         'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
         'n_estimators': 1000,
         'seed': 0
         }

def objective(space):
    clf = xgb.XGBClassifier(
        use_label_encoder=False,n_jobs=-1,
        n_estimators=space['n_estimators'], max_depth=int(space['max_depth']), gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']), min_child_weight=int(space['min_child_weight']),
        colsample_bytree=int(space['colsample_bytree']))

    evaluation = [(X_train, y_train.values.ravel()),
                  (X_test, y_test.values.ravel())]

    clf.fit(X_train, y_train.values.ravel(),
            eval_set=evaluation, eval_metric="mlogloss",
            early_stopping_rounds=10, verbose=False)

    pred = clf.predict(X_test)
    accuracy = f1_score(y_test.values.ravel(), pred, average='micro')
    return {'loss': -accuracy, 'status': STATUS_OK}

In [295]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 500,
                        trials = trials)

best_xgb = xgb.XGBClassifier(best_hyperparams)
best_xgb.fit(X_train, y_train.values.ravel(), verbose=False)

100%|█████████████████████████████████████████████| 500/500 [05:41<00:00,  1.46trial/s, best loss: -0.7428087986463621]


In [296]:
print(f1_score(y_test, best_xgb.predict(X_test), average=None))
print(f1_score(y_test, best_xgb.predict(X_test), average='micro'))

[0.88740839 0.62569832 0.70170015 0.48648649]
0.8104906937394247


In [300]:
data_test = pd.read_csv('test_features.csv').drop(['Unnamed: 0'],axis=1)
data_test.replace([np.inf, -np.inf], np.NaN, inplace=True)
# print(len(data_test.columns), len(data_train_backup.columns))
data_test = pd.DataFrame(imp.fit(data_train_backup).transform(data_test), columns=data_train_backup.columns)
data_test = rfe.transform(data_test)

y_prob = None
n_rounds = 1
for r in range(n_rounds):
    best_xgb = xgb.XGBClassifier(best_hyperparams)
    best_xgb.fit(df_X, y.values.ravel(), verbose=False, eval_metric='mlogloss')
    y_test_prob = best_xgb.predict_proba(data_test)
    if y_prob is None:
        y_prob = np.zeros_like(y_test_prob)
    y_prob = y_prob + y_test_prob
y_test = np.argmax(y_prob, axis=1)
data_out = {"id" : np.arange(len(y_test)), "y": y_test}
df_out = pd.DataFrame(data_out)
df_out.to_csv("submission.csv", index=False)

In [301]:
print(best_hyperparams)

{'colsample_bytree': 0.8623811480223281, 'gamma': 1.2017606607223974, 'max_depth': 4.0, 'min_child_weight': 0.0, 'reg_alpha': 40.0, 'reg_lambda': 0.19101490078046782}
