In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

from PredicTables.util import to_pl_lf
from PredicTables.univariate import Univariate

# load data
df = load_breast_cancer(as_frame=True).frame
df.columns = [c.replace(' ', '_') for c in df.columns]
df['cv'] = np.random.randint(1, 6, df.shape[0])
df = to_pl_lf(df)

df.head().collect()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target,cv
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,i32
17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0,4
20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0,4
19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0,1
11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0,2
20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0,5


In [2]:
df.collect().shape

(569, 32)

### fit each fold to a model

In [3]:
X, y = df.collect().to_pandas().drop(columns='target'), df.select('target').collect().to_pandas()['target']
X.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,cv
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,4
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,4
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,5


In [4]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pd.DataFrame(pca.transform(X), columns=[f'comp_{i}' for i in range(1, 3)]).assign(cv=X['cv'])
X_pca

Unnamed: 0,comp_1,comp_2,cv
0,1160.142487,-293.917386,4
1,1269.122356,15.630334,4
2,995.794053,39.156475,1
3,-407.180717,-67.380464,2
4,930.341010,189.341029,5
...,...,...,...
564,1414.126596,110.222648,4
565,1045.018684,77.057878,5
566,314.501588,47.553806,5
567,1124.858195,34.129097,2


In [5]:
m = Univariate(df=pd.concat([X_pca,y], axis=1), fold_col='cv', feature_col='comp_1', target_col='target')
m

<Model(df)>

In [8]:
m.results.round(2)

Unnamed: 0_level_0,coef,pvalues,se,lower_ci,upper_ci,acc_train,acc_test,auc_train,auc_test,f1_train,f1_test,precision_train,precision_test,recall_train,recall_test,mcc_train,mcc_test,logloss_train,logloss_test
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,-0.01,0.0,0.0,-0.01,-0.01,0.91,0.9,0.89,0.89,0.93,0.91,0.9,0.88,0.97,0.95,0.8,0.79,3.27,3.74
2,-0.01,0.0,0.0,-0.01,-0.01,0.9,0.92,0.88,0.91,0.93,0.93,0.89,0.91,0.97,0.96,0.79,0.83,3.47,2.95
3,-0.01,0.0,0.0,-0.01,-0.01,0.9,0.94,0.88,0.91,0.92,0.96,0.88,0.93,0.96,0.99,0.78,0.86,3.69,2.1
4,-0.01,0.0,0.0,-0.01,-0.01,0.91,0.88,0.9,0.85,0.93,0.91,0.9,0.86,0.97,0.96,0.82,0.74,3.09,4.43
5,-0.01,0.0,0.0,-0.01,-0.01,0.91,0.9,0.89,0.86,0.93,0.93,0.9,0.88,0.97,0.97,0.81,0.77,3.28,3.71
mean,-0.01,0.0,0.0,-0.01,-0.01,0.91,0.91,0.89,0.89,0.93,0.93,0.89,0.89,0.97,0.97,0.8,0.8,3.36,3.36
std,0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.01,0.03,0.01,0.02,0.01,0.03,0.0,0.01,0.01,0.05,0.23,0.89


In [9]:
m.yhat_test

0      0.000217
1      0.000098
2      0.000715
3      0.950789
4      0.001151
         ...   
564    0.000034
565    0.000500
566    0.092190
567    0.000280
568    0.996355
Length: 569, dtype: float64

In [10]:
m.yhat_test.round(0).astype(int).sum()

386

In [11]:
m.yhat_train.round(0).astype(int).sum()

386

In [12]:
m.yhat_train

0      0.000217
1      0.000098
2      0.000715
3      0.950789
4      0.001151
         ...   
564    0.000034
565    0.000500
566    0.092190
567    0.000280
568    0.996355
Length: 569, dtype: float64

In [13]:
m.y_train

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int32

In [16]:
m.mcc_test

0.8001232702095267

In [18]:
m.f1_train

0.9286675639300135

In [19]:
m.recall_train

0.9663865546218487

In [20]:
m.mcc_train

0.8001232702095267