In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from predictables.core.src._UnivariateAnalysis import UnivariateAnalysis

# Load the breast cancer dataset. The integer target is relabelled with class
# names so the stratified splits below operate on readable labels; it is mapped
# back to 0/1 before being attached to the output frames.
bcancer = load_breast_cancer()
cancery = pd.Series(bcancer.target, name="y").map({0: "malignant", 1: "benign"})
cancerdf = pd.DataFrame(
    bcancer.data, columns=[c.replace(" ", "_") for c in bcancer.feature_names]
)

# Standardize the data
# NOTE(review): the scaler is fitted on the full dataset *before* splitting, so
# validation/test rows influence the scaling applied to the training rows.
# Consider fitting on X_train only and calling transform() on the other splits.
scaler = StandardScaler()
cancerdf = pd.DataFrame(scaler.fit_transform(cancerdf), columns=cancerdf.columns)

# Split the data: 20% is held out as test, then 25% of the remaining 80% becomes
# validation, i.e. a 60/20/20 train/val/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    cancerdf, cancery, test_size=0.2, random_state=42, stratify=cancery
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Perform PCA: fit the two components on the training split only, then project
# the validation and test splits with the fitted transform. The original row
# index is kept so the concat below aligns rows correctly.
pca = PCA(n_components=2)
X_train_pca = pd.DataFrame(
    pca.fit_transform(X_train), columns=["PC1", "PC2"], index=X_train.index
)
X_val_pca = pd.DataFrame(
    pca.transform(X_val), columns=["PC1", "PC2"], index=X_val.index
)
X_test_pca = pd.DataFrame(
    pca.transform(X_test), columns=["PC1", "PC2"], index=X_test.index
)

# Combine the standardized features with their principal components
df_train = pd.concat([X_train, X_train_pca], axis=1)
df_val = pd.concat([X_val, X_val_pca], axis=1)
df_test = pd.concat([X_test, X_test_pca], axis=1)

# Add the target variable, encoded back to 0 (malignant) / 1 (benign)
df_train["y"] = y_train.map({"malignant": 0, "benign": 1}).values
df_val["y"] = y_val.map({"malignant": 0, "benign": 1}).values
df_test["y"] = y_test.map({"malignant": 0, "benign": 1}).values

# Randomly sort training data into 5 cross-validation folds (labels 1..5).
# A seeded generator is used so the fold assignment -- and every cross-validated
# metric derived from it -- is identical on each Restart & Run All, matching the
# fixed random_state used for the splits above.
fold_rng = np.random.default_rng(42)
df_train["fold"] = fold_rng.integers(1, 6, size=df_train.shape[0])

# Persist the three splits for reuse outside this notebook
df_train.to_parquet("cancer_train.parquet")
df_val.to_parquet("cancer_val.parquet")
df_test.to_parquet("cancer_test.parquet")

df_train.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,PC1,PC2,y,fold
194,0.2081,0.912292,0.347273,0.046959,0.57215,1.774977,1.015706,1.02817,-0.272428,0.55604,...,-0.033692,1.339296,0.895753,0.884571,0.160555,0.169804,2.601883,1.664731,0,1
46,-1.684571,-0.57005,-1.658278,-1.288347,-0.737294,-0.85113,-0.9155,-1.109197,-0.155598,0.316465,...,-0.11698,-0.754239,-0.975761,-1.354653,0.330422,-0.546168,-4.564517,1.565109,1,4
332,-0.825712,0.132725,-0.825,-0.761051,0.643316,-0.692695,-1.052023,-1.066224,0.468713,-0.356897,...,0.439736,-1.002397,-1.241784,-1.437181,0.632947,-1.037706,-3.127716,1.357112,1,3
76,-0.169639,-1.943019,-0.167192,-0.27215,2.329937,0.006804,-0.251467,0.429234,2.1591,0.512094,...,0.558093,-0.740244,-0.89617,-0.617229,-0.308601,-0.666975,-0.807603,2.071959,1,3
124,-0.215082,-0.674768,-0.241747,-0.288361,-1.794101,-0.58922,-0.098925,-0.539588,-1.422476,-0.647506,...,-1.309316,-0.007411,0.28119,-0.378019,-1.379572,-0.424808,-2.212381,-0.936765,1,1


In [2]:
# Build the univariate analysis over every column of the training frame except
# the target ("y") and the cross-validation fold label ("fold").
ua = UnivariateAnalysis(
    model_name="Cancer Model",
    # data
    df_train=df_train,
    df_val=df_val,
    # column roles
    target_column_name="y",
    cv_column_name="fold",
    feature_column_names=[
        column for column in df_train.columns if column not in ("y", "fold")
    ],
    # rows are independent observations, not an ordered series
    has_time_series_structure=False,
)

Performing univariate analysis on 32 features:   0%|          | 0/32 [00:00<?, ?it/s]

In [3]:
# Show the per-fold fit statistics for the PC1 feature (coefficient, p-value,
# standard error, confidence bounds, and train/test classification metrics).
# The displayed frame also carries "mean" and "std" rows after the fold rows.
# NOTE(review): agg_results is presumably a lazy polars frame, hence .collect().
ua.pc1.agg_results.collect()

fold,coef,pvalues,se,lower_ci,upper_ci,acc_train,acc_test,auc_train,auc_test,f1_train,f1_test,precision_train,precision_test,recall_train,recall_test,mcc_train,mcc_test,logloss_train,logloss_test
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Fold-1""",-1.105948,2.3399e-17,0.130487,-1.361698,-0.850198,0.904059,0.957143,0.904019,0.945883,0.920732,0.968421,0.937888,0.958333,0.904192,0.978723,0.800198,0.902261,3.458063,1.544728
"""Fold-2""",-1.382352,1.9e-15,0.173931,-1.72325,-1.041454,0.919118,0.84058,0.918121,0.843434,0.937143,0.835821,0.953488,0.903226,0.921348,0.777778,0.824787,0.689778,2.915295,5.74609
"""Fold-3""",-1.372079,7.9436e-15,0.176621,-1.71825,-1.025907,0.925373,0.876712,0.922215,0.883962,0.937888,0.910891,0.937888,0.958333,0.937888,0.867925,0.84443,0.72174,2.689825,4.443738
"""Fold-4""",-1.158268,1.1734e-17,0.135383,-1.423613,-0.892923,0.906367,0.932432,0.908888,0.919923,0.924012,0.946237,0.95,0.916667,0.899408,0.977778,0.804338,0.858812,3.374874,2.435382
"""Fold-5""",-1.134671,2.1417e-18,0.129683,-1.388845,-0.880497,0.909091,0.945455,0.906183,0.94697,0.927374,0.953846,0.937853,0.96875,0.917127,0.939394,0.806279,0.887846,3.276696,1.966017
"""mean""",-1.213653,9.158899999999999e-21,0.129867,-1.468187,-0.959118,0.914956,0.921053,0.913036,0.918277,0.931442,0.93617,0.942584,0.942857,0.920561,0.929577,0.819911,0.832879,3.065296,2.845552
"""std""",0.135108,3.4346e-15,0.023904,0.181739,0.08871,0.009077,0.049772,0.007889,0.044279,0.00775,0.053154,0.007695,0.029112,0.015196,0.085761,0.018478,0.098965,0.327155,1.793951


In [4]:
# Display the formatted results table for PC1: one row per statistic, one
# column per CV fold plus "mean" and "std" columns.
ua.pc1.get_results()

CV Fold,Fold-1,Fold-2,Fold-3,Fold-4,Fold-5,mean,std
Fitted Coef.,-1.11,-1.38,-1.37,-1.16,-1.13,-1.21,0.14
Fitted p-Value,2.3e-17,1.9e-15,7.9e-15,1.2e-17,2.1e-18,9.2e-21,3.4e-15
Fitted Std. Err.,0.130,0.174,0.177,0.135,0.130,0.130,0.024
Conf. Int. Lower,-1.36,-1.72,-1.72,-1.42,-1.39,-1.47,0.18
Conf. Int. Upper,-0.850,-1.041,-1.026,-0.893,-0.880,-0.959,0.089
Train Accuracy,90.4%,91.9%,92.5%,90.6%,90.9%,91.5%,0.9%
Val Accuracy,95.7%,84.1%,87.7%,93.2%,94.5%,92.1%,5.0%
Train AUC,90.4%,91.8%,92.2%,90.9%,90.6%,91.3%,0.8%
Val AUC,94.6%,84.3%,88.4%,92.0%,94.7%,91.8%,4.4%
Train F1,92.1%,93.7%,93.8%,92.4%,92.7%,93.1%,0.8%


In [5]:
# Summarize the `*_test` classification metrics recorded for the PC1 feature
# and add "Ave.", the unweighted mean of the six metrics, as a single score.
# `pl` (polars) is imported with the other dependencies in the first cell.
ua.pc1.results.select(
    [
        pl.col("feature").alias("Feature"),
        pl.col("acc_test").alias("Accuracy"),
        pl.col("precision_test").alias("Precision"),
        pl.col("recall_test").alias("Recall"),
        pl.col("auc_test").alias("AUC"),
        pl.col("f1_test").alias("F1"),
        pl.col("mcc_test").alias("MCC"),
        # Arithmetic mean of the six metrics above
        (
            pl.col("acc_test")
            + pl.col("precision_test")
            + pl.col("recall_test")
            + pl.col("auc_test")
            + pl.col("f1_test")
            + pl.col("mcc_test")
        )
        .truediv(6.0)
        .alias("Ave."),
    ]
).collect().to_pandas().set_index("Feature")

Unnamed: 0_level_0,Accuracy,Precision,Recall,AUC,F1,MCC,Ave.
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PC1,0.921053,0.942857,0.929577,0.918277,0.93617,0.832879,0.913469


In [6]:
# Display the metric summary for all analysed features; the rendered table is
# indexed by feature and its rows appear in descending order of "Ave.".
# NOTE(review): this calls an underscore-prefixed (private) method of
# UnivariateAnalysis -- confirm whether a public equivalent exists.
ua._sort_features_by_ua()

Unnamed: 0_level_0,Accuracy,Precision,Recall,AUC,F1,MCC,Ave.
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
worst_area,0.929825,0.909091,0.985915,0.911562,0.945946,0.852085,0.922404
PC1,0.921053,0.942857,0.929577,0.918277,0.93617,0.832879,0.913469
worst_radius,0.903509,0.916667,0.929577,0.895021,0.923077,0.793822,0.893612
worst_perimeter,0.894737,0.915493,0.915493,0.887979,0.915493,0.775958,0.884192
mean_concave_points,0.885965,0.939394,0.873239,0.890108,0.905109,0.765924,0.876623
mean_area,0.885965,0.881579,0.943662,0.86718,0.911565,0.755046,0.874166
mean_concavity,0.877193,0.913043,0.887324,0.873895,0.9,0.741501,0.865493
mean_radius,0.868421,0.9,0.887324,0.862267,0.893617,0.721351,0.855497
mean_perimeter,0.868421,0.9,0.887324,0.862267,0.893617,0.721351,0.855497
area_error,0.868421,0.858974,0.943662,0.843924,0.899329,0.717228,0.855256


In [7]:
# Build the univariate analysis reports starting from the "cancer.pdf" filename;
# per the progress message, reports are packaged in increments of
# max_per_file (20 here).
ua.build_report("cancer.pdf", max_per_file=20)

Building 32 univariate analysis reports,and packaging in increments of 20:   0%|          | 0/32 [00:00<?, ?it…