In [1]:
import pandas as pd
import numpy as np
from predictables.core.src._UnivariateAnalysis import UnivariateAnalysis
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset
bcancer = load_breast_cancer()
cancery = pd.Series(bcancer.target, name="y").map({0: "malignant", 1: "benign"})
cancerdf = pd.DataFrame(
    bcancer.data, columns=[c.replace(" ", "_") for c in bcancer.feature_names]
)

# Standardize the data
scaler = StandardScaler()
cancerdf = pd.DataFrame(scaler.fit_transform(cancerdf), columns=cancerdf.columns)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    cancerdf, cancery, test_size=0.2, random_state=42, stratify=cancery
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Perform PCA
pca = PCA(n_components=2)
X_train_pca = pd.DataFrame(
    pca.fit_transform(X_train), columns=["PC1", "PC2"], index=X_train.index
)
X_val_pca = pd.DataFrame(
    pca.transform(X_val), columns=["PC1", "PC2"], index=X_val.index
)
X_test_pca = pd.DataFrame(
    pca.transform(X_test), columns=["PC1", "PC2"], index=X_test.index
)

# Combine the data
df_train = pd.concat([X_train, X_train_pca], axis=1)
df_val = pd.concat([X_val, X_val_pca], axis=1)
df_test = pd.concat([X_test, X_test_pca], axis=1)

# Add the target variable
df_train["y"] = y_train.map({"malignant": 0, "benign": 1}).values
df_val["y"] = y_val.map({"malignant": 0, "benign": 1}).values
df_test["y"] = y_test.map({"malignant": 0, "benign": 1}).values

# Randomly sort training data into 5 cross-validation folds
df_train["fold"] = np.random.choice(range(5), size=df_train.shape[0]) + 1

df_train.to_parquet("cancer_train.parquet")
df_val.to_parquet("cancer_val.parquet")
df_test.to_parquet("cancer_test.parquet")

df_train.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,PC1,PC2,y,fold
194,0.2081,0.912292,0.347273,0.046959,0.57215,1.774977,1.015706,1.02817,-0.272428,0.55604,...,-0.033692,1.339296,0.895753,0.884571,0.160555,0.169804,2.601883,1.664731,0,4
46,-1.684571,-0.57005,-1.658278,-1.288347,-0.737294,-0.85113,-0.9155,-1.109197,-0.155598,0.316465,...,-0.11698,-0.754239,-0.975761,-1.354653,0.330422,-0.546168,-4.564517,1.565109,1,3
332,-0.825712,0.132725,-0.825,-0.761051,0.643316,-0.692695,-1.052023,-1.066224,0.468713,-0.356897,...,0.439736,-1.002397,-1.241784,-1.437181,0.632947,-1.037706,-3.127716,1.357112,1,3
76,-0.169639,-1.943019,-0.167192,-0.27215,2.329937,0.006804,-0.251467,0.429234,2.1591,0.512094,...,0.558093,-0.740244,-0.89617,-0.617229,-0.308601,-0.666975,-0.807603,2.071959,1,1
124,-0.215082,-0.674768,-0.241747,-0.288361,-1.794101,-0.58922,-0.098925,-0.539588,-1.422476,-0.647506,...,-1.309316,-0.007411,0.28119,-0.378019,-1.379572,-0.424808,-2.212381,-0.936765,1,4


In [2]:
ua = UnivariateAnalysis(
    model_name="Cancer Model",
    df_train=df_train,
    df_val=df_val,
    target_column_name="y",
    feature_column_names=df_train.drop(columns=["y", "fold"]).columns.tolist(),
    cv_column_name="fold",
    has_time_series_structure=False,
)

Performing univariate analysis on 32 features:   0%|          | 0/32 [00:00<?, ?it/s]

In [3]:
# dir(ua.pc1)
ua.pc1.agg_results.collect()

fold,coef,pvalues,se,lower_ci,upper_ci,acc_train,acc_test,auc_train,auc_test,f1_train,f1_test,precision_train,precision_test,recall_train,recall_test,mcc_train,mcc_test,logloss_train,logloss_test
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Fold-1""",-1.208444,6.6749e-18,0.140185,-1.483202,-0.933686,0.904594,0.913793,0.902675,0.921717,0.923077,0.927536,0.936416,0.969697,0.910112,0.888889,0.798124,0.826394,3.438794,3.107211
"""Fold-2""",-1.269488,3.8417e-16,0.155891,-1.575028,-0.963948,0.916031,0.886076,0.917679,0.886305,0.934132,0.894118,0.957055,0.904762,0.912281,0.883721,0.820313,0.771117,3.026566,4.106239
"""Fold-3""",-1.242545,4.1053e-16,0.152733,-1.541896,-0.943194,0.917603,0.905405,0.916677,0.8975,0.932099,0.929293,0.94375,0.938776,0.920732,0.92,0.827785,0.786846,2.969889,3.409535
"""Fold-4""",-1.274209,3.2332e-16,0.156071,-1.580102,-0.968315,0.919708,0.895522,0.916097,0.900476,0.935673,0.91358,0.941176,0.948718,0.930233,0.880952,0.828986,0.785398,2.894016,3.765755
"""Fold-5""",-1.101328,4.9335e-18,0.127249,-1.350732,-0.851924,0.902878,0.968254,0.901815,0.963372,0.919881,0.976744,0.933735,0.976744,0.906433,0.976744,0.797228,0.926744,3.500643,1.144243
"""mean""",-1.213653,9.158899999999999e-21,0.129867,-1.468187,-0.959118,0.914956,0.921053,0.913036,0.918277,0.931442,0.93617,0.942584,0.942857,0.920561,0.929577,0.819911,0.832879,3.065296,2.845552
"""std""",0.070904,2.0342e-16,0.012541,0.095095,0.047116,0.007826,0.032165,0.008007,0.030493,0.007048,0.030551,0.009069,0.028512,0.009552,0.040419,0.015706,0.06348,0.282081,1.159351


In [4]:
ua.pc1.get_results()

CV Fold,Fold-1,Fold-2,Fold-3,Fold-4,Fold-5,mean,std
Fitted Coef.,-1.21,-1.27,-1.24,-1.27,-1.10,-1.21,0.07
Fitted p-Value,6.7e-18,3.8e-16,4.1e-16,3.2e-16,4.9e-18,9.2e-21,2.0e-16
Fitted Std. Err.,0.140,0.156,0.153,0.156,0.127,0.130,0.013
Conf. Int. Lower,-1.48,-1.58,-1.54,-1.58,-1.35,-1.47,0.10
Conf. Int. Upper,-0.934,-0.964,-0.943,-0.968,-0.852,-0.959,0.047
Train Accuracy,90.5%,91.6%,91.8%,92.0%,90.3%,91.5%,0.8%
Val Accuracy,91.4%,88.6%,90.5%,89.6%,96.8%,92.1%,3.2%
Train AUC,90.3%,91.8%,91.7%,91.6%,90.2%,91.3%,0.8%
Val AUC,92.2%,88.6%,89.8%,90.0%,96.3%,91.8%,3.0%
Train F1,92.3%,93.4%,93.2%,93.6%,92.0%,93.1%,0.7%


In [5]:
import polars as pl

getattr(ua, "pc1").results.select(
    [
        pl.col("feature").alias("Feature"),
        pl.col("acc_test").alias("Accuracy"),
        pl.col("precision_test").alias("Precision"),
        pl.col("recall_test").alias("Recall"),
        pl.col("auc_test").alias("AUC"),
        pl.col("f1_test").alias("F1"),
        pl.col("mcc_test").alias("MCC"),
        (
            pl.col("acc_test")
            + pl.col("precision_test")
            + pl.col("recall_test")
            + pl.col("auc_test")
            + pl.col("f1_test")
            + pl.col("mcc_test")
        )
        .truediv(6.0)
        .alias("Ave."),
    ]
).collect().to_pandas().set_index("Feature")

Unnamed: 0_level_0,Accuracy,Precision,Recall,AUC,F1,MCC,Ave.
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PC1,0.921053,0.942857,0.929577,0.918277,0.93617,0.832879,0.913469


In [8]:
ua._sort_features_by_ua()

Unnamed: 0_level_0,Accuracy,Precision,Recall,AUC,F1,MCC,Ave.
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
worst_area,0.929825,0.909091,0.985915,0.911562,0.945946,0.852085,0.922404
PC1,0.921053,0.942857,0.929577,0.918277,0.93617,0.832879,0.913469
worst_radius,0.903509,0.916667,0.929577,0.895021,0.923077,0.793822,0.893612
worst_perimeter,0.894737,0.915493,0.915493,0.887979,0.915493,0.775958,0.884192
mean_concave_points,0.885965,0.939394,0.873239,0.890108,0.905109,0.765924,0.876623
mean_area,0.885965,0.881579,0.943662,0.86718,0.911565,0.755046,0.874166
mean_concavity,0.877193,0.913043,0.887324,0.873895,0.9,0.741501,0.865493
mean_radius,0.868421,0.9,0.887324,0.862267,0.893617,0.721351,0.855497
mean_perimeter,0.868421,0.9,0.887324,0.862267,0.893617,0.721351,0.855497
area_error,0.868421,0.858974,0.943662,0.843924,0.899329,0.717228,0.855256


In [7]:
ua.build_report("cancer.pdf", max_per_file=20)

Building 32 univariate analysis reports,and packaging in increments of 20:   0%|          | 0/32 [00:00<?, ?it…