In [1]:
%load_ext autoreload
%autoreload 2

import os

# The data paths used further down are relative (e.g. 'data/processed/...'),
# so when the kernel was started inside a "notebooks" folder, step up one
# directory before anything is loaded.
head, tail = os.path.split(os.getcwd())
if tail == "notebooks":
    os.chdir(head)

from sca import models, plots, helpers as h

# Shared seed; passed as random_state to the KFold splitter below.
SEED = 42

In [2]:
import pandas as pd

# Load the "Profiling" subset through the project helper.
X, y = h.load_data('data/processed/v1_var_desync0_clean.h5', subset="Profiling")

# Table of (i, j) column pairs with their t-statistic. The CSV carries an
# extra "Unnamed: 0" column (presumably a saved index), dropped right away.
df_idx = (
    pd.read_csv('data/processed/ttest_comb_idx.csv')
    .drop(columns=["Unnamed: 0"])
)
df_idx.head()

Unnamed: 0,i,j,tstat
0,999,220,10.74549
1,1000,220,12.845836
2,1000,221,12.206614
3,1000,222,12.01982
4,1001,220,12.245995


Najpierw spróbuję wytrenować prosty naiwny klasyfikator bayesowski, ignorując założenie o normalności rozkładu cech.

In [3]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm

# np.array always copies, whereas np.asarray hands back the very same object
# when X is already a float32 ndarray. With the copy, the in-place centering
# below can never write through to the X loaded in the previous cell.
X_arr = np.array(X, dtype=np.float32)
# Centre every column (axis 0 = samples) on its mean.
X_arr -= X_arr.mean(axis=0, keepdims=True)
# One combined feature per row of df_idx: the product of the centred
# columns i and j.
X_comb = X_arr[:, df_idx['i'].to_numpy()] * X_arr[:, df_idx['j'].to_numpy()]

def perform_cv(X, y, model, n_splits=5):
    """Run shuffled K-fold cross-validation and average the per-fold score.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support indexing with integer index arrays
        along its first axis (``X[train_idx]``).
    y : array-like
        Targets, indexed the same way as ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        re-fitted on every fold, so it is left fitted on the last fold.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean over the folds of ``h.compute_guessing_entropy(y_test, probs)``.
    """
    mean_ranks = 0.0
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # kf.split is a generator without a length; pass total so tqdm can
    # render an actual progress bar instead of a bare iteration counter.
    for train_idx, test_idx in tqdm(kf.split(X, y), total=n_splits):
        # Slice the matrix given by the caller. The earlier version read the
        # global X_comb here and silently ignored the X argument.
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)

        probs = model.predict_proba(X_test)
        mr = h.compute_guessing_entropy(y_test, probs)
        mean_ranks += mr

    return mean_ranks / n_splits

# Gaussian naive Bayes wrapped in a one-step pipeline, scored with 5-fold CV
# on the combined features.
m = make_pipeline(
    GaussianNB()
)
mr_cv_gnb = perform_cv(X_comb, y, model=m, n_splits=5)
print(f"Mean ranks (5 splits): {mr_cv_gnb:.2f}")

0it [00:00, ?it/s]

Mean ranks (5 splits): 116.66


In [4]:
from sklearn.ensemble import RandomForestClassifier as RFC

# Random forest on the combined features. The forest is seeded with the
# notebook-wide SEED so that the reported rank is reproducible after
# Restart & Run All (it was previously unseeded).
m = make_pipeline(
    RFC(n_estimators=50, max_depth=10, n_jobs=-1, random_state=SEED)
)
mr_cv_rf = perform_cv(X_comb, y, m)
print(f"Mean ranks (RF, 5 splits): {mr_cv_rf:.2f}")

0it [00:00, ?it/s]

Mean ranks (RF, 5 splits): 115.08


In [5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 5 best features by ANOVA F-score, then fit QDA on them. The
# selector is part of the pipeline, so it is re-fitted together with QDA
# each time perform_cv calls fit.
pl = make_pipeline(
    SelectKBest(score_func=f_classif, k=5),
    QDA(),
)
mr_cv_qda = perform_cv(X_comb, y, model=pl, n_splits=5)
print(f"Mean ranks (QDA, 5 splits): {mr_cv_qda:.2f}")

0it [00:00, ?it/s]

Mean ranks (QDA, 5 splits): 104.64
