In [1]:
%load_ext autoreload
%autoreload 2
import config as cfg
from sca import plots, helpers as h

In [6]:
import numpy as np
# Load the cleaned dataset; `h.load_data` is asked for DataFrame output
# (as_df=True) and its four results are unpacked into the names that the
# cross-validation cells below pass on to `h.cv`.
dataset_path = cfg.DATA_DIR / 'ascadv_clean.h5'
X, y, pts, ks = h.load_data(dataset_path, as_df=True)

# Saved column positions of pre-selected features (presumably produced by a
# random-forest RFE run, going by the file name -- TODO confirm).
# NOTE(review): this path is relative to the working directory, whereas the
# dataset path above goes through cfg.DATA_DIR -- confirm this is intended.
best_feats_idx = np.load('data/rf_rfe_5_best.npy')

# Keep only the first five selected columns as the reduced feature matrix.
top_idx = best_feats_idx[:5]
X_best = X.iloc[:, top_idx]

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier as RFC
import logging

# Random-forest pipeline: standardize -> full degree-2 polynomial expansion
# (no bias column) -> shallow random forest.
# The individual steps are bound to notebook-level names as well, since
# make_pipeline keeps references to the very objects it is given.
rf = RFC(
    max_depth=5,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=cfg.SEED,
)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
scaler = StandardScaler()
pl_rfc = make_pipeline(scaler, poly, rf)

# Average whatever `h.cv` returns into a single score and log it.
cv_scores = h.cv(pl_rfc, X_best, y, pts, ks, seed=cfg.SEED)
score = np.mean(cv_scores)
logging.info(f"Mean PI: {score:.2e}")

12:37:14: [1] Mean PI: 0.027
12:37:19: [2] Mean PI: 0.028
12:37:24: [3] Mean PI: 0.028
12:37:29: [4] Mean PI: 0.027
12:37:34: [5] Mean PI: 0.028
12:37:40: [6] Mean PI: 0.029
12:37:45: [7] Mean PI: 0.027
12:37:50: [8] Mean PI: 0.027
12:37:55: [9] Mean PI: 0.027
12:38:01: [10] Mean PI: 0.028
12:38:01: Mean PI: 2.76e-02


In [9]:
import pandas as pd
# Pair every expanded polynomial feature with the forest's importance for it,
# sorted from most to least important.
#
# Both steps are read from `pl_rfc` itself (make_pipeline names each step after
# its lowercased class name) so that the feature names and the importances are
# guaranteed to come from the same pipeline, even if the notebook-level `poly`
# name has been rebound by another cell in the meantime.
# NOTE(review): the importances reflect whichever fit `h.cv` left on `pl_rfc`
# (presumably the last cross-validation run) -- confirm against h.cv.
rf = pl_rfc.named_steps['randomforestclassifier']
poly_step = pl_rfc.named_steps['polynomialfeatures']

df_imps = pd.DataFrame({
    'feature': poly_step.get_feature_names_out(),
    'importance': rf.feature_importances_,
}).sort_values(by='importance', ascending=False)
df_imps

Unnamed: 0,feature,importance
7,x0 x2,0.128869
15,x2 x3,0.11414
6,x0 x1,0.090707
12,x1 x3,0.082417
18,x3 x4,0.076614
9,x0 x4,0.071469
8,x0 x3,0.060921
16,x2 x4,0.050416
11,x1 x2,0.04335
13,x1 x4,0.042472


In [None]:
# Variant of the random-forest pipeline: polynomial terms of degree 2 and 3
# only (a (min_degree, max_degree) tuple leaves out lower-degree terms) and a
# shallower forest (max_depth=3).
# NOTE: this rebinds `scaler`, `poly`, `rf` and `pl_rfc`, replacing the objects
# created by the earlier degree-2 cell.
rf = RFC(
    max_depth=3,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=cfg.SEED,
)
poly = PolynomialFeatures(degree=(2, 3), include_bias=False, interaction_only=False)
scaler = StandardScaler()
pl_rfc = make_pipeline(scaler, poly, rf)

# Average whatever `h.cv` returns into a single score and log it.
cv_scores = h.cv(pl_rfc, X_best, y, pts, ks, seed=cfg.SEED)
score = np.mean(cv_scores)
logging.info(f"Mean PI: {score:.2e}")

12:45:03: [1] Mean PI: 0.027
12:45:09: [2] Mean PI: 0.027
12:45:14: [3] Mean PI: 0.027
12:45:20: [4] Mean PI: 0.026
12:45:25: [5] Mean PI: 0.027
12:45:31: [6] Mean PI: 0.028


In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import PowerTransformer

# Gaussian naive Bayes pipeline: standardize -> degree-2-only polynomial terms
# (degree=(2, 2), no bias column) -> Yeo-Johnson power transform -> GaussianNB.
gnb_steps = (
    StandardScaler(),
    PolynomialFeatures(degree=(2, 2), interaction_only=False, include_bias=False),
    PowerTransformer(method='yeo-johnson'),
    GaussianNB(),
)
pl_gnb = make_pipeline(*gnb_steps)

# Average whatever `h.cv` returns into a single score and log it.
gnb_scores = h.cv(pl_gnb, X_best, y, pts, ks, seed=cfg.SEED)
score = np.mean(gnb_scores)
logging.info(f"Mean PI: {score:.2e}")

12:47:06: [1] Mean PI: -0.935
12:47:09: [2] Mean PI: -0.912
12:47:12: [3] Mean PI: -0.947
12:47:15: [4] Mean PI: -0.904
12:47:17: [5] Mean PI: -0.897
12:47:20: [6] Mean PI: -0.949
12:47:23: [7] Mean PI: -1.028
12:47:26: [8] Mean PI: -0.992
12:47:29: [9] Mean PI: -0.951
12:47:32: [10] Mean PI: -0.899
12:47:32: Mean PI: -9.41e-01


In [16]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# QDA pipeline: standardize -> degree-2-only polynomial terms (degree=(2, 2),
# no bias column) -> Yeo-Johnson power transform -> quadratic discriminant
# analysis configured with the 'eigen' solver and automatic shrinkage.
qda_steps = (
    StandardScaler(),
    PolynomialFeatures(degree=(2, 2), interaction_only=False, include_bias=False),
    PowerTransformer(method='yeo-johnson'),
    QDA(solver='eigen', shrinkage='auto'),
)
pl_qda = make_pipeline(*qda_steps)

# Average whatever `h.cv` returns into a single score and log it.
qda_scores = h.cv(pl_qda, X_best, y, pts, ks, seed=cfg.SEED)
score = np.mean(qda_scores)
logging.info(f"Mean PI: {score:.2e}")

12:48:00: [1] Mean PI: -1.052
12:48:05: [2] Mean PI: -1.084
12:48:10: [3] Mean PI: -1.109
12:48:15: [4] Mean PI: -1.055
12:48:20: [5] Mean PI: -1.071
12:48:25: [6] Mean PI: -0.998
12:48:29: [7] Mean PI: -1.073
12:48:34: [8] Mean PI: -1.050
12:48:39: [9] Mean PI: -1.060
12:48:44: [10] Mean PI: -1.062
12:48:44: Mean PI: -1.06e+00
