In [135]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, roc_curve, precision_recall_curve
from dataclasses import dataclass
from PredicTables.util import get_column_dtype
import pandas as pd
import numpy as np
import polars as pl

import statsmodels.api as sm
from statsmodels.formula.api import ols, glm

from PredicTables.util import to_pl_lf

# load data
df = load_breast_cancer(as_frame=True).frame
df.columns = [c.replace(' ', '_') for c in df.columns]
df['cv'] = np.random.randint(1, 6, df.shape[0])
df = to_pl_lf(df)

df.head().collect()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target,cv
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0,3
20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0,2
19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0,3
11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0,2
20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0,4


In [158]:
df.collect().shape

(569, 32)

In [224]:
m = Model(df, 1, 'cv', 'mean_radius', 'target')
m.__dict__.keys()

dict_keys(['df', 'fold', 'fold_col', 'feature_col', 'target_col', 'df_train', 'df_test', 'X_train', 'y_train', 'X_test', 'y_test', 'target_type', 'model', 'fitted', 'yhat_train', 'yhat_test', 'coef', 'pvalues', 'aic', 'se', 'lower_ci', 'upper_ci', 'conf_int', 'df_model', 'df_resid', 'H_diag', 'n', 'k', 't_test', 'summary', 'sk_model', 'sk_coef', 'auc', 'prc'])

In [229]:
m.t_test()

TypeError: LikelihoodModelResults.t_test() missing 1 required positional argument: 'r_matrix'

In [223]:
@dataclass
class Model:
    df: pl.LazyFrame
    fold: int = None
    fold_col: str = 'cv'
    feature_col: str = None
    target_col: str = None

    def __post_init__(self, fold=None):
        if fold is not None:
            self.fold = fold
            
        if self.fold is None:
            self.df_train = self.df.select([self.feature_col, self.target_col])
            self.df_test = self.df.select([self.feature_col, self.target_col])
        else:
            self.df_train = self.df.select([self.feature_col, self.target_col, self.fold_col]).filter(pl.col(self.fold_col) != self.fold)
            self.df_test = self.df.select([self.feature_col, self.target_col, self.fold_col]).filter(pl.col(self.fold_col) == self.fold)
            
        self.X_train = self.df_train.select(self.feature_col).collect().to_pandas()
        self.y_train = self.df_train.select(self.target_col).collect().to_pandas()
        self.X_test = self.df_test.select(self.feature_col).collect().to_pandas()
        self.y_test = self.df_test.select(self.target_col).collect().to_pandas()

        self.target_type = get_column_dtype(self.y_train[self.target_col])

        if self.target_type in ['categorical', 'binary']:
            self.model = sm.GLM(self.y_train.values.ravel(), self.X_train.values, family=sm.families.Binomial()).fit()
        else:
            self.model = sm.OLS(self.y_train, self.X_train).fit()

        self.fitted = self.model.fittedvalues
        self.yhat_train = self.model.predict(self.X_train)
        self.yhat_test = self.model.predict(self.X_test)
        
        
        self.coef = self.model.params[0]
        self.pvalues = self.model.pvalues[0]
        self.aic = self.model.aic
        self.se = self.model.bse[0]
        self.lower_ci = self.model.conf_int()[0][0]
        self.upper_ci = self.model.conf_int()[0][1]
        self.n = self.model.nobs
        self.k = self.model.params.shape[0]

        self.summary = self.model.summary()

        # add a sklearn model to get its metrics\
        if self.target_type in ['categorical', 'binary']:
            self.sk_model = LogisticRegression(fit_intercept=False)
        else:
            self.sk_model = LinearRegression(fit_intercept=False)

        self.sk_model.fit(self.X_train, self.y_train.values.ravel())
        self.sk_coef = self.sk_model.coef_
        
        if self.target_type in ['categorical', 'binary']:
            self.auc = roc_curve(self.y_test, self.yhat_test)
            self.prc = precision_recall_curve(self.y_test, self.yhat_test)  


In [234]:
@dataclass
class Univariate(Model):
    df: pl.LazyFrame
    fold_col: str = 'cv'
    feature_col: str = None
    target_col: str = None

    def __post_init__(self):
        super().__post_init__()
        self.unique_folds = self.df.select(self.fold_col).unique().collect().to_pandas()[self.fold_col].values
        self.cv = {}
        for fold in self.unique_folds:
            self.cv[fold] = Model(self.df, fold=fold, fold_col=self.fold_col, feature_col=self.feature_col, target_col=self.target_col)


In [249]:
def get_col(self, col):
    attributes = [getattr(self.cv[fold], col) for fold in self.unique_folds]
    sd = pd.Series(attributes).std()
        
    out = attributes + [getattr(self, col)] + [sd]
    return out


@dataclass
class UnivariateResults(Univariate):
    df: pl.LazyFrame
    fold_col: str = 'cv'
    feature_col: str = None
    target_col: str = None

    def __post_init__(self):
        super().__post_init__()
        self.results = pd.DataFrame(index=sorted(self.unique_folds.tolist()) + ['mean', 'std'])
        self.results.index.name = 'fold'
        for attribute in ['coef', 'pvalues', 'se', 'lower_ci', 'upper_ci', 'n', 'k']:
            self.results[attribute] = get_col(self, attribute)

### fit each fold to a model

In [152]:
df_train = df.select(['mean_radius', 'target', 'cv']).filter(pl.col('cv') != 1).with_columns([((pl.col('mean_radius') - pl.col('mean_radius').mean()) / pl.col('mean_radius').std()).alias('mean_radius')])
df_test = df.select(['mean_radius', 'target', 'cv']).filter(pl.col('cv') == 1).with_columns([((pl.col('mean_radius') - pl.col('mean_radius').mean()) / pl.col('mean_radius').std()).alias('mean_radius')])

X_train = df_train.select('mean_radius').collect().to_pandas()
y_train = df_train.select('target').collect().to_pandas()
X_test = df_test.select('mean_radius').collect().to_pandas()
y_test = df_test.select('target').collect().to_pandas()

df_train.head().collect()

mean_radius,target,cv
f64,i64,i64
1.025038,0,3
1.742913,0,2
1.498057,0,3
-0.803038,0,2
1.665004,0,4


In [253]:
u = UnivariateResults(df, feature_col='mean_radius', target_col='target')
u.results

Unnamed: 0_level_0,coef,pvalues,se,lower_ci,upper_ci,n,k
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.009598,0.136696,0.006449,-0.003042,0.022238,453.0,1.0
2,0.011553,0.070477,0.006387,-0.000965,0.024071,468.0,1.0
3,0.013235,0.042316,0.006518,0.000459,0.02601,459.0,1.0
4,0.013233,0.044185,0.006576,0.000344,0.026121,446.0,1.0
5,0.00529,0.408327,0.006398,-0.00725,0.017831,450.0,1.0
mean,0.010542,0.068185,0.00578,-0.000787,0.021872,569.0,1.0
std,0.003316,0.154568,8e-05,0.00321,0.003425,8.58487,0.0
