In [1]:
%load_ext autoreload
%autoreload 2
import config as cfg
import numpy as np
import logging
from sca import plots, helpers as h
from sklearn.pipeline import make_pipeline

In [2]:
X, y, pts, ks = h.load_data(cfg.DATA_DIR / 'ascadv_clean.h5', as_df=True)
best_feats_idx = np.load('data/rf_rfe_5_best.npy')
X_best = X.iloc[:, best_feats_idx[:5]]

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier as RFC

scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)
rf = RFC(max_depth=5, min_samples_leaf=10, n_jobs=-1, random_state=cfg.SEED)
pl_rfc = make_pipeline(scaler, poly, rf)

score = np.mean(h.cv(pl_rfc, X_best, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

In [None]:
import pandas as pd
rf = pl_rfc.named_steps['randomforestclassifier']

df_imps = pd.DataFrame({
    'feature': poly.get_feature_names_out(X_best.columns), 
    'importance': rf.feature_importances_
})
df_imps = df_imps.sort_values(by='importance', ascending=False)
df_imps["feature"] = df_imps["feature"].str.replace(r"^([^ ^]+)\^[^ ^]+$", r"\1 \1", regex=True)
df_imps[["feat1", "feat2"]] = df_imps["feature"].str.split(" ", n=1, expand=True)
df_imps = df_imps.drop(columns=['feature'])
df_imps

In [None]:
scaler = StandardScaler()
poly = PolynomialFeatures(degree=(2, 2), include_bias=False)
rf = RFC(max_depth=5, min_samples_leaf=10, n_jobs=-1, random_state=cfg.SEED)
pl_rf = make_pipeline(scaler, poly, rf)

score = np.mean(h.cv(pl_rf, X_best, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

In [27]:
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

pl_lr = make_pipeline(
    RobustScaler(quantile_range=(5, 95)),
    PolynomialFeatures(degree=(2, 2), include_bias=False),
    LogisticRegression(C=10.0, max_iter=1000), 
)
score = np.mean(h.cv(pl_lr, X_best, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

20:41:38: [1] Mean PI: 0.025
20:41:42: [2] Mean PI: 0.029
20:41:47: [3] Mean PI: 0.029
20:41:54: [4] Mean PI: 0.024
20:41:57: [5] Mean PI: 0.026
20:42:01: [6] Mean PI: 0.030
20:42:06: [7] Mean PI: 0.028
20:42:09: [8] Mean PI: 0.024
20:42:12: [9] Mean PI: 0.025
20:42:16: [10] Mean PI: 0.030
20:42:16: Mean PI: 2.69e-02


In [3]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

def fidx(feats, names):
    return [list(names).index(f) for f in feats]

def window_interactions(groups, names, k=1):
    def fn(X, y=None):
        W = [np.mean(X[:, fidx(g, names)], 
                     axis=-1, keepdims=True) for g in groups]
        W = np.concat(W, axis=-1)
        idx = np.triu_indices(W.shape[1], k=k)
        M = np.einsum('ij,ik->ijk', W, W)[:, *idx].reshape(W.shape[0], -1)
        D = ((W[:, :, None] - W[:, None, :])**2)[:, *idx].reshape(W.shape[0], -1)
        S = ((W[:, :, None] + W[:, None, :])**2)[:, *idx].reshape(W.shape[0], -1)
        return np.concat([M, D, S], axis=1)
    return fn
        
pl_lr = make_pipeline(
    RobustScaler(quantile_range=(5, 95)),
    FunctionTransformer(window_interactions(
        [["1069", "1070", "1071"], ["187", "188"]], names=X.columns
    )),
    LogisticRegression(C=10.0, max_iter=1000), 
)

score = np.mean(h.cv(pl_lr, X, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

NameError: name 'LogisticRegression' is not defined

In [26]:
from sklearn.neural_network import MLPClassifier

pl_mlp = make_pipeline(
    RobustScaler(),
    PolynomialFeatures(degree=(2, 2), include_bias=False),
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(32, 32),
        activation='tanh', 
        solver='adam',
        alpha=0.001, 
        learning_rate_init=1e-3,
        max_iter=5000,
        early_stopping=True
    )
)

score = np.mean(h.cv(pl_mlp, X_best, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

20:37:10: [1] Mean PI: 0.018
20:37:17: [2] Mean PI: 0.028
20:37:22: [3] Mean PI: 0.026
20:37:30: [4] Mean PI: 0.024
20:37:42: [5] Mean PI: 0.015
20:37:51: [6] Mean PI: 0.023
20:38:00: [7] Mean PI: 0.019
20:38:06: [8] Mean PI: 0.025
20:38:12: [9] Mean PI: 0.027
20:38:22: [10] Mean PI: 0.020
20:38:22: Mean PI: 2.25e-02


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import RandomForestClassifier as RFC

pl_lr_2 = make_pipeline(
    RobustScaler(quantile_range=(5, 95)),
    FunctionTransformer(window_interactions(
        [["1069", "1070", "1071"], ["187"], ["65"]], names=X.columns, k=1
    )),
    LogisticRegression(C=0.5, max_iter=1000), 
)

score = np.mean(h.cv(pl_lr_2, X, y, pts, ks, seed=cfg.SEED))
logging.info(f"Mean PI: {score:.2e}")

01:10:26: [1] Mean PI: 0.030
01:10:34: [2] Mean PI: 0.031
01:10:42: [3] Mean PI: 0.031
01:10:50: [4] Mean PI: 0.030
01:10:58: [5] Mean PI: 0.030
01:11:06: [6] Mean PI: 0.031
01:11:14: [7] Mean PI: 0.029
01:11:23: [8] Mean PI: 0.029
01:11:31: [9] Mean PI: 0.030
01:11:39: [10] Mean PI: 0.031
01:11:39: Mean PI: 3.02e-02
