In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, roc_curve, precision_recall_curve
from dataclasses import dataclass
from PredicTables.util import get_column_dtype
import pandas as pd
import numpy as np
import polars as pl

import statsmodels.api as sm
from statsmodels.formula.api import ols, glm

from PredicTables.util import to_pl_lf

# load data
df = load_breast_cancer(as_frame=True).frame
df.columns = [c.replace(' ', '_') for c in df.columns]
df['cv'] = np.random.randint(1, 6, df.shape[0])
df = to_pl_lf(df)

df.head().collect()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target,cv
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,i32
17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0,1
20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0,4
19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0,3
11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0,3
20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0,1


In [2]:
df.collect().shape

(569, 32)

In [3]:
from PredicTables.univariate.BaseModel import Model

In [4]:
class Univariate(Model):
    def __init__(self, df: pl.LazyFrame, fold_col: str = 'cv', feature_col: str = None, target_col: str = None):
        super().__init__(df, fold_col=fold_col, feature_col=feature_col, target_col=target_col)
        self.unique_folds = self.df.select(self.fold_col).unique().collect().to_pandas()[self.fold_col].values
        self.cv = {}
        for fold in self.unique_folds:
            self.cv[fold] = Model(self.df, fold=fold, fold_col=self.fold_col, feature_col=self.feature_col, target_col=self.target_col)


In [5]:
def get_col(self, col):
    attributes = [getattr(self.cv[fold], col) for fold in self.unique_folds]
    sd = pd.Series(attributes).std()
        
    out = attributes + [getattr(self, col)] + [sd]
    return out


class UnivariateResults(Univariate):
    def __init__(self, df: pl.LazyFrame, fold_col: str = 'cv', feature_col: str = None, target_col: str = None):
        super().__init__(df, fold_col=fold_col, feature_col=feature_col, target_col=target_col)
        self.results = pd.DataFrame(index=sorted(self.unique_folds.tolist()) + ['mean', 'std'])
        self.results.index.name = 'fold'
        for attribute in ['coef', 'pvalues', 'se'
        # , 'lower_ci', 'upper_ci'
        , 'n', 'k']:
            self.results[attribute] = get_col(self, attribute)

### fit each fold to a model

In [6]:
df_train = df.select(['mean_radius', 'target', 'cv']).filter(pl.col('cv') != 1).with_columns([((pl.col('mean_radius') - pl.col('mean_radius').mean()) / pl.col('mean_radius').std()).alias('mean_radius')])
df_test = df.select(['mean_radius', 'target', 'cv']).filter(pl.col('cv') == 1).with_columns([((pl.col('mean_radius') - pl.col('mean_radius').mean()) / pl.col('mean_radius').std()).alias('mean_radius')])

X_train = df_train.select('mean_radius').collect().to_pandas()
y_train = df_train.select('target').collect().to_pandas()
X_test = df_test.select('mean_radius').collect().to_pandas()
y_test = df_test.select('target').collect().to_pandas()

df_train.head().collect()

mean_radius,target,cv
f64,i32,i32
1.768956,0,4
1.522224,0,3
-0.796494,0,3
-0.507705,0,5
1.118481,0,2


In [7]:
X, y = df.collect().to_pandas().drop(columns='target cv'.split()), df.select('target').collect().to_pandas()['target']
X

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [8]:
sm.GLM(y, X).fit().summary()

0,1,2,3
Dep. Variable:,target,No. Observations:,569.0
Model:,GLM,Df Residuals:,539.0
Model Family:,Gaussian,Df Model:,29.0
Link Function:,Identity,Scale:,0.060851
Method:,IRLS,Log-Likelihood:,4.4418
Date:,"Wed, 10 Jan 2024",Deviance:,32.799
Time:,13:33:17,Pearson chi2:,32.8
No. Iterations:,3,Pseudo R-squ. (CS):,0.9447
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
mean_radius,0.4200,0.179,2.350,0.019,0.070,0.770
mean_texture,-0.0026,0.008,-0.316,0.752,-0.019,0.014
mean_perimeter,-0.0239,0.026,-0.912,0.362,-0.075,0.027
mean_area,-0.0019,0.000,-3.718,0.000,-0.003,-0.001
mean_smoothness,-0.3506,2.106,-0.166,0.868,-4.479,3.778
mean_compactness,0.1224,1.254,0.098,0.922,-2.335,2.580
mean_concavity,-0.8504,1.089,-0.781,0.435,-2.985,1.284
mean_concave_points,-3.4663,2.058,-1.685,0.092,-7.499,0.567
mean_symmetry,0.7007,0.767,0.914,0.361,-0.802,2.203


In [9]:
lr = LogisticRegression(fit_intercept=False).fit(X[['mean_radius']], y)
x = lr.coef_.ravel()[0]
x
# (1 / (1+np.exp(-x)))

0.010541960356462444

In [10]:
sm.GLM.fit?

[1;31mSignature:[0m
[0msm[0m[1;33m.[0m[0mGLM[0m[1;33m.[0m[0mfit[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0mstart_params[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmaxiter[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [0mmethod[0m[1;33m=[0m[1;34m'IRLS'[0m[1;33m,[0m[1;33m
[0m    [0mtol[0m[1;33m=[0m[1;36m1e-08[0m[1;33m,[0m[1;33m
[0m    [0mscale[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcov_type[0m[1;33m=[0m[1;34m'nonrobust'[0m[1;33m,[0m[1;33m
[0m    [0mcov_kwds[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0muse_t[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfull_output[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mdisp[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mmax_start_irls[0m[1;33m=[0m[1;36m3[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m

In [11]:
m = Model(df=df_train, fold=1, fold_col='cv', feature_col='mean_radius', target_col='target')

In [12]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression

california_housing = fetch_california_housing()
X, y = (
    pd.DataFrame(california_housing.data, columns=california_housing.feature_names).iloc[:, :1],
    pd.Series(california_housing.target, name="target"),
)

In [13]:
lr = LinearRegression(fit_intercept=False).fit(X, y)
lr.coef_

array([0.51180604])

In [14]:
u = UnivariateResults(df, feature_col='mean_radius', target_col='target')
u.results

Unnamed: 0_level_0,coef,pvalues,se,n,k
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.010213,0.110377,0.006397,462.0,1.0
2,0.014581,0.023515,0.006438,465.0,1.0
3,0.008273,0.200566,0.006464,455.0,1.0
4,0.012145,0.06454,0.00657,448.0,1.0
5,0.007508,0.24481,0.006455,446.0,1.0
mean,0.010542,0.068185,0.00578,569.0,1.0
std,0.002887,0.092415,6.4e-05,8.348653,0.0


In [None]:
from sklearn.linear_model import LogisticRegression
X = np.array([[
    -1, -2, -3, 1, 2, 3
]]).T
y = np.array([
    0, 0, 0, 1, 1, 1
])

lr = LogisticRegression(fit_intercept=False).fit(X, y)
lr.coef_

array([[1.10440413]])

In [None]:
u.cv[1].y_train

0      0
1      0
2      0
3      0
4      0
      ..
443    0
444    0
445    0
446    0
447    1
Name: target, Length: 448, dtype: int32