In [23]:
import datetime
import polars as pl

df = (
    pl.scan_csv(
        "/home/aweaver/work/predictables/predictables/encoding/tests/rolling_date_example.csv"
    )
    .drop(
        [
            "month_prior",
            "year_prior",
            "rolling_sum",
            "60_days_prior",
            "360_days_prior",
        ]
    )
    .with_columns(
        [
            pl.col("date").str.to_date("%m/%d/%Y").alias("date"),
            (pl.col("390_days_prior").str.to_date("%m/%d/%Y")).alias("390_days_prior"),
        ]
    )
    .with_columns(
        [
            pl.date_ranges(
                start=pl.col("390_days_prior") + datetime.timedelta(days=1),
                end=pl.col("30_days_prior").str.to_date("%m/%d/%Y"),
                interval="1d",
            ).alias("date_list"),
        ]
    )
)

value_map = {
    d: v for d, v in df.select([pl.col("date"), pl.col("incr_value")]).collect().rows()
}

df = df.with_columns(
    [
        pl.col("date_list")
        .list.eval(pl.element().is_in(list(value_map.keys())).cast(pl.Int32))
        .list.sum()
        .alias("value_list")
    ]
)

# value_map
df.collect().with_columns(
    [(pl.col("sum2") - pl.col("value_list")).alias("diff")]
).with_columns(pl.col("date_list")).filter(pl.col("diff") > 0)

date,incr_value,cat1,cat2,30_days_prior,390_days_prior,sum2,date_list,value_list,diff
date,i64,str,str,str,date,i64,list[date],i32,i64


In [1]:
import pandas as pd
import numpy as np
import polars as pl
from predictables.core.src._UnivariateAnalysis import UnivariateAnalysis
from predictables.util import to_pl_lf
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset
bcancer = load_breast_cancer()
cancery = pd.Series(bcancer.target, name="y").map({0: "malignant", 1: "benign"})
cancerdf = pd.DataFrame(
    bcancer.data, columns=[c.replace(" ", "_") for c in bcancer.feature_names]
)

# Standardize the data
scaler = StandardScaler()
cancerdf = pd.DataFrame(scaler.fit_transform(cancerdf), columns=cancerdf.columns)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    cancerdf, cancery, test_size=0.2, random_state=42, stratify=cancery
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Perform PCA
pca = PCA(n_components=2)
X_train_pca = pd.DataFrame(
    pca.fit_transform(X_train), columns=["pc1", "pc2"], index=X_train.index
)
X_val_pca = pd.DataFrame(
    pca.transform(X_val), columns=["pc1", "pc2"], index=X_val.index
)
X_test_pca = pd.DataFrame(
    pca.transform(X_test), columns=["pc1", "pc2"], index=X_test.index
)

# Combine the data
df_train = pd.concat([X_train, X_train_pca], axis=1)
df_val = pd.concat([X_val, X_val_pca], axis=1)
df_test = pd.concat([X_test, X_test_pca], axis=1)

# Add the target variable
df_train["y"] = y_train.map({"malignant": 0, "benign": 1}).values
df_val["y"] = y_val.map({"malignant": 0, "benign": 1}).values
df_test["y"] = y_test.map({"malignant": 0, "benign": 1}).values

# Randomly sort training data into 5 cross-validation folds
df_train["fold"] = np.random.choice(range(5), size=df_train.shape[0]) + 1

df_train.to_parquet("cancer_train.parquet")
df_val.to_parquet("cancer_val.parquet")
df_test.to_parquet("cancer_test.parquet")


def quintile(x: str) -> pl.Expr:
    return (
        pl.when(pl.col(x) < pl.col(x).quantile(0.2))
        .then(1)
        .otherwise(
            pl.when(pl.col(x) < pl.col(x).quantile(0.4))
            .then(2)
            .otherwise(
                pl.when(pl.col(x) < pl.col(x).quantile(0.6))
                .then(3)
                .otherwise(
                    pl.when(pl.col(x) < pl.col(x).quantile(0.8)).then(4).otherwise(5)
                )
            )
        )
        .cast(pl.Utf8)
        .cast(pl.Categorical)
        .alias(f"{x}_quintile")
    )


cols = [
    "mean_radius",
    "mean_texture",
    "mean_perimeter",
    "mean_area",
    "mean_smoothness",
    "mean_compactness",
    "mean_concavity",
    "mean_concave_points",
    "mean_symmetry",
    "mean_fractal_dimension",
    "radius_error",
    "texture_error",
    "perimeter_error",
    "area_error",
    "smoothness_error",
    "compactness_error",
    "concavity_error",
    "concave_points_error",
    "symmetry_error",
    "fractal_dimension_error",
    "worst_radius",
    "worst_texture",
    "worst_perimeter",
    "worst_area",
    "worst_smoothness",
    "worst_compactness",
    "worst_concavity",
    "worst_concave_points",
    "worst_symmetry",
    "worst_fractal_dimension",
    "pc1",
    "pc2",
]

quintile_expr = [quintile(c) for c in cols]

df_train = to_pl_lf(df_train).with_columns(quintile_expr)
df_val = to_pl_lf(df_val).with_columns(quintile_expr)
df_test = to_pl_lf(df_test).with_columns(quintile_expr)

df_train.head().collect()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,pc1,pc2,y,fold,mean_radius_quintile,mean_texture_quintile,mean_perimeter_quintile,mean_area_quintile,mean_smoothness_quintile,mean_compactness_quintile,mean_concavity_quintile,mean_concave_points_quintile,mean_symmetry_quintile,mean_fractal_dimension_quintile,radius_error_quintile,texture_error_quintile,perimeter_error_quintile,area_error_quintile,smoothness_error_quintile,compactness_error_quintile,concavity_error_quintile,concave_points_error_quintile,symmetry_error_quintile,fractal_dimension_error_quintile,worst_radius_quintile,worst_texture_quintile,worst_perimeter_quintile,worst_area_quintile,worst_smoothness_quintile,worst_compactness_quintile,worst_concavity_quintile,worst_concave_points_quintile,worst_symmetry_quintile,worst_fractal_dimension_quintile,pc1_quintile,pc2_quintile
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat
0.2081,0.912292,0.347273,0.046959,0.57215,1.774977,1.015706,1.02817,-0.272428,0.55604,-0.453216,-0.462029,0.358868,-0.333042,0.346688,1.438701,0.783046,1.140961,0.594217,0.195592,-0.039178,0.342424,0.337735,-0.168554,-0.033692,1.339296,0.895753,0.884571,0.160555,0.169804,2.601883,1.664731,0,3,"""4""","""5""","""4""","""4""","""4""","""5""","""5""","""5""","""3""","""4""","""2""","""2""","""4""","""3""","""4""","""5""","""5""","""5""","""4""","""4""","""4""","""4""","""4""","""4""","""3""","""5""","""5""","""4""","""4""","""4""","""4""","""5"""
-1.684571,-0.57005,-1.658278,-1.288347,-0.737294,-0.85113,-0.9155,-1.109197,-0.155598,0.316465,-0.898232,-0.472008,-0.877224,-0.706961,0.642366,-0.50402,-0.530967,-0.953653,0.62933,-0.458783,-1.512777,-0.605327,-1.489328,-1.122222,-0.11698,-0.754239,-0.975761,-1.354653,0.330422,-0.546168,-4.564517,1.565109,1,5,"""1""","""2""","""1""","""1""","""2""","""2""","""1""","""1""","""3""","""4""","""1""","""2""","""1""","""1""","""5""","""2""","""2""","""1""","""5""","""2""","""1""","""2""","""1""","""1""","""3""","""2""","""1""","""1""","""4""","""2""","""1""","""4"""
-0.825712,0.132725,-0.825,-0.761051,0.643316,-0.692695,-1.052023,-1.066224,0.468713,-0.356897,-0.38825,1.35921,-0.449022,-0.455811,1.949753,-0.806941,-0.948182,-1.107752,2.65013,-0.69292,-0.888216,0.016737,-0.904036,-0.781363,0.439736,-1.002397,-1.241784,-1.437181,0.632947,-1.037706,-3.127716,1.357112,1,4,"""1""","""3""","""1""","""1""","""4""","""2""","""1""","""1""","""4""","""3""","""3""","""5""","""2""","""2""","""5""","""1""","""1""","""1""","""5""","""1""","""1""","""3""","""1""","""1""","""4""","""1""","""1""","""1""","""5""","""1""","""1""","""4"""
-0.169639,-1.943019,-0.167192,-0.27215,2.329937,0.006804,-0.251467,0.429234,2.1591,0.512094,0.017786,-0.368046,-0.105966,-0.169129,2.11976,0.162743,-0.672216,-0.577002,0.626908,0.896114,-0.453343,-2.147457,-0.473631,-0.483572,0.558093,-0.740244,-0.89617,-0.617229,-0.308601,-0.666975,-0.807603,2.071959,1,5,"""3""","""1""","""3""","""3""","""5""","""3""","""3""","""4""","""5""","""4""","""4""","""3""","""4""","""4""","""5""","""4""","""1""","""2""","""5""","""5""","""3""","""1""","""3""","""3""","""4""","""2""","""1""","""2""","""3""","""2""","""3""","""5"""
-0.215082,-0.674768,-0.241747,-0.288361,-1.794101,-0.58922,-0.098925,-0.539588,-1.422476,-0.647506,-0.870802,-0.139439,-0.813365,-0.56494,-0.374008,0.403626,0.586093,-0.229711,-1.024602,0.106325,-0.416068,-0.47668,-0.454866,-0.436812,-1.309316,-0.007411,0.28119,-0.378019,-1.379572,-0.424808,-2.212381,-0.936765,1,1,"""3""","""2""","""3""","""3""","""1""","""2""","""3""","""3""","""1""","""2""","""1""","""3""","""1""","""1""","""3""","""4""","""5""","""3""","""1""","""4""","""3""","""2""","""3""","""3""","""1""","""4""","""4""","""3""","""1""","""2""","""2""","""2"""


In [2]:
df_train.head().collect()

mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,pc1,pc2,y,fold,mean_radius_quintile,mean_texture_quintile,mean_perimeter_quintile,mean_area_quintile,mean_smoothness_quintile,mean_compactness_quintile,mean_concavity_quintile,mean_concave_points_quintile,mean_symmetry_quintile,mean_fractal_dimension_quintile,radius_error_quintile,texture_error_quintile,perimeter_error_quintile,area_error_quintile,smoothness_error_quintile,compactness_error_quintile,concavity_error_quintile,concave_points_error_quintile,symmetry_error_quintile,fractal_dimension_error_quintile,worst_radius_quintile,worst_texture_quintile,worst_perimeter_quintile,worst_area_quintile,worst_smoothness_quintile,worst_compactness_quintile,worst_concavity_quintile,worst_concave_points_quintile,worst_symmetry_quintile,worst_fractal_dimension_quintile,pc1_quintile,pc2_quintile
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat
0.2081,0.912292,0.347273,0.046959,0.57215,1.774977,1.015706,1.02817,-0.272428,0.55604,-0.453216,-0.462029,0.358868,-0.333042,0.346688,1.438701,0.783046,1.140961,0.594217,0.195592,-0.039178,0.342424,0.337735,-0.168554,-0.033692,1.339296,0.895753,0.884571,0.160555,0.169804,2.601883,1.664731,0,3,"""4""","""5""","""4""","""4""","""4""","""5""","""5""","""5""","""3""","""4""","""2""","""2""","""4""","""3""","""4""","""5""","""5""","""5""","""4""","""4""","""4""","""4""","""4""","""4""","""3""","""5""","""5""","""4""","""4""","""4""","""4""","""5"""
-1.684571,-0.57005,-1.658278,-1.288347,-0.737294,-0.85113,-0.9155,-1.109197,-0.155598,0.316465,-0.898232,-0.472008,-0.877224,-0.706961,0.642366,-0.50402,-0.530967,-0.953653,0.62933,-0.458783,-1.512777,-0.605327,-1.489328,-1.122222,-0.11698,-0.754239,-0.975761,-1.354653,0.330422,-0.546168,-4.564517,1.565109,1,5,"""1""","""2""","""1""","""1""","""2""","""2""","""1""","""1""","""3""","""4""","""1""","""2""","""1""","""1""","""5""","""2""","""2""","""1""","""5""","""2""","""1""","""2""","""1""","""1""","""3""","""2""","""1""","""1""","""4""","""2""","""1""","""4"""
-0.825712,0.132725,-0.825,-0.761051,0.643316,-0.692695,-1.052023,-1.066224,0.468713,-0.356897,-0.38825,1.35921,-0.449022,-0.455811,1.949753,-0.806941,-0.948182,-1.107752,2.65013,-0.69292,-0.888216,0.016737,-0.904036,-0.781363,0.439736,-1.002397,-1.241784,-1.437181,0.632947,-1.037706,-3.127716,1.357112,1,4,"""1""","""3""","""1""","""1""","""4""","""2""","""1""","""1""","""4""","""3""","""3""","""5""","""2""","""2""","""5""","""1""","""1""","""1""","""5""","""1""","""1""","""3""","""1""","""1""","""4""","""1""","""1""","""1""","""5""","""1""","""1""","""4"""
-0.169639,-1.943019,-0.167192,-0.27215,2.329937,0.006804,-0.251467,0.429234,2.1591,0.512094,0.017786,-0.368046,-0.105966,-0.169129,2.11976,0.162743,-0.672216,-0.577002,0.626908,0.896114,-0.453343,-2.147457,-0.473631,-0.483572,0.558093,-0.740244,-0.89617,-0.617229,-0.308601,-0.666975,-0.807603,2.071959,1,5,"""3""","""1""","""3""","""3""","""5""","""3""","""3""","""4""","""5""","""4""","""4""","""3""","""4""","""4""","""5""","""4""","""1""","""2""","""5""","""5""","""3""","""1""","""3""","""3""","""4""","""2""","""1""","""2""","""3""","""2""","""3""","""5"""
-0.215082,-0.674768,-0.241747,-0.288361,-1.794101,-0.58922,-0.098925,-0.539588,-1.422476,-0.647506,-0.870802,-0.139439,-0.813365,-0.56494,-0.374008,0.403626,0.586093,-0.229711,-1.024602,0.106325,-0.416068,-0.47668,-0.454866,-0.436812,-1.309316,-0.007411,0.28119,-0.378019,-1.379572,-0.424808,-2.212381,-0.936765,1,1,"""3""","""2""","""3""","""3""","""1""","""2""","""3""","""3""","""1""","""2""","""1""","""3""","""1""","""1""","""3""","""4""","""5""","""3""","""1""","""4""","""3""","""2""","""3""","""3""","""1""","""4""","""4""","""3""","""1""","""2""","""2""","""2"""


In [3]:
ua = UnivariateAnalysis(
    model_name="Cancer Model",
    df_train=df_train,
    df_val=df_val,
    target_column_name="y",
    feature_column_names=[c for c in df_train.columns if c != "y" and c != "fold"],
    cv_column_name="fold",
    time_series_validation=False,
)

Performing univariate analysis on 64 features:   0%|          | 0/64 [00:00<?, ?it/s]

TypeError: s must be a pandas or polars series. Got <class 'pandas.core.frame.DataFrame'>.

In [None]:
ua.build_report("cancer.pdf", max_per_file=15)

Building 8 univariate analysis reports:   0%|          | 0/8 [00:00<?, ?it/s]

date,value,category,date2
date,f64,cat,date
2019-01-01,1.0,"""A""",2021-12-31
2019-01-02,1.0,"""A""",2021-12-30
2019-01-03,1.0,"""A""",2021-12-29
2019-01-04,1.0,"""A""",2021-12-28
2019-01-05,1.0,"""A""",2021-12-27


category,rolling_sum,date
cat,f64,date
"""A""",1.0,2019-01-01
"""A""",2.0,2019-01-02
"""A""",3.0,2019-01-03
"""A""",4.0,2019-01-04
"""A""",5.0,2019-01-05
"""A""",6.0,2019-01-06
"""A""",7.0,2019-01-07
"""A""",8.0,2019-01-08
"""A""",9.0,2019-01-09
"""A""",9.0,2019-01-10


In [None]:
df.sort("date", "category").group_by_dynamic(
    "date",
    every="1d",
    period="1y",
    offset="-13mo",
    by="category",
    check_sorted=False,
    #  "date",
    # every="1d",
    # period="1y",
    # offset="-13mo",
    # by="category",
).agg(
    [
        pl.col("value").sum().alias("rolling_sum"),
    ]
).collect().filter(pl.col("date") >= datetime(2019, 1, 1))

category,date,rolling_sum
cat,date,f64
"""A""",2019-01-01,184.0
"""A""",2019-01-02,183.0
"""A""",2019-01-03,182.0
"""A""",2019-01-04,182.0
"""A""",2019-01-05,182.0
"""A""",2019-01-06,182.0
"""A""",2019-01-07,181.0
"""A""",2019-01-08,180.0
"""A""",2019-01-09,180.0
"""A""",2019-01-10,180.0
