In [1]:
import polars as pl
import polars.selectors as cs

from predictables.encoding.src.lagged_mean_encoding.credibility.vhm import vhm
from predictables.encoding.src.lagged_mean_encoding import (
    DynamicRollingSum,
    DynamicRollingCount,
    DynamicRollingMean,
    LaplaceSmoothedMean,
)

In [2]:
# Source parquet file, named once so the path is easy to retarget.
DATA_PATH = "/app/ts_testing.parquet"

# Columns removed from the frame right after loading. Each name is listed
# exactly once (the original list repeated two of the `cat_*_laplace(1)_smoothed`
# entries).
DROPPED_COLS = [
    "trans_id",
    "prob",
    "30_days_prior",
    "60_days_prior",
    "90_days_prior",
    "total_30_30_sum",
    "total_30_60_sum",
    "total_60_30_sum",
    "total_30_30_average",
    "total_30_60_average",
    "total_60_30_average",
    "cat_30_30_average",
    "cat_30_60_average",
    "cat_60_30_average",
    "subcat_30_30_average",
    "subcat_30_60_average",
    "subcat_60_30_average",
    "code_30_30_average",
    "code_30_60_average",
    "code_60_30_average",
    "cat_30_30_average_laplace(1)_smoothed",
    "cat_30_60_average_laplace(1)_smoothed",
    "cat_60_30_average_laplace(1)_smoothed",
    "subcat_30_30_average_laplace(1)_smoothed",
    "subcat_30_60_average_laplace(1)_smoothed",
    "subcat_60_30_average_laplace(1)_smoothed",
    "total_30_60_average_laplace(1)_smoothed",
    "total_60_30_average_laplace(1)_smoothed",
    "code_30_60_average_laplace(1)_smoothed",
    "code_60_30_average_laplace(1)_smoothed",
    "cred_wtd_30_60_average",
    "cred_wtd_60_30_average",
    "cred_wtd_30_60_Z",
    "cred_wtd_60_30_Z",
    "cred_wtd_30_60_n",
    "cred_wtd_60_30_n",
]

# Lazily scan the file and discard the unwanted columns.
lf = pl.scan_parquet(DATA_PATH).drop(DROPPED_COLS)

# Cast every string column to Categorical. Applying the selector directly keeps
# the plan lazy: the schema does not have to be resolved just to enumerate the
# string column names, and `cast` already preserves each column's name.
lf = lf.with_columns(cs.string().cast(pl.Categorical))

# Preview the first rows of the prepared frame.
lf.head().collect()

index,date,product_cat,product_subcat,product_code,hit,total_30_30_average_laplace(1)_smoothed,code_30_30_average_laplace(1)_smoothed,cred_wtd_30_30_average,cred_wtd_30_30_Z,cred_wtd_30_30_n
u32,date,cat,cat,cat,i64,f64,f64,f64,f64,i64
0,2020-01-01,"""D""","""f""","""D-f""",1,0.5,0.5,0.5,0.0,0
1,2020-01-02,"""B""","""f""","""B-f""",1,0.5,0.5,0.5,0.0,0
2,2020-01-03,"""A""","""d""","""A-d""",0,0.5,0.5,0.5,0.0,0
3,2020-01-03,"""A""","""e""","""A-e""",1,0.5,0.5,0.5,0.0,0
4,2020-01-04,"""D""","""c""","""D-c""",1,0.5,0.5,0.5,0.0,0


In [3]:
# Append the ROLLING_MEAN of `hit` (offset 30, window 30 — shown as
# `lag:30/win:30` in the output column names) in two passes: first with
# `product_cat` as the category column, then with no category column, which the
# output labels `hit[ALL]`.
# NOTE(review): the frame loaded above shows no `count` column, yet one appears
# in this cell's output — presumably DynamicRollingMean creates it; confirm.
for group_col in ("product_cat", None):
    rolling_mean = (
        DynamicRollingMean()
        .lf(lf)
        .date_col("date")
        .numerator_col("hit")
        .denominator_col("count")
    )
    # The category column is only configured for the per-category pass.
    if group_col is not None:
        rolling_mean = rolling_mean.cat_col(group_col)
    lf = (
        rolling_mean.index_col("index")
        .offset(30)
        .window(30)
        .rejoin(True)
        .op("ROLLING_MEAN")
        .run()
    )

# Preview the first rows with the two new columns attached.
lf.head().collect()

index,date,product_cat,product_subcat,product_code,hit,total_30_30_average_laplace(1)_smoothed,code_30_30_average_laplace(1)_smoothed,cred_wtd_30_30_average,cred_wtd_30_30_Z,cred_wtd_30_30_n,count,ROLLING_MEAN(hit[product_cat])[lag:30/win:30],ROLLING_MEAN(hit[ALL])[lag:30/win:30]
i64,date,cat,cat,cat,i64,f64,f64,f64,f64,i64,i32,f64,f64
0,2020-01-01,"""D""","""f""","""D-f""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0
1,2020-01-02,"""B""","""f""","""B-f""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0
2,2020-01-03,"""A""","""d""","""A-d""",0,0.5,0.5,0.5,0.0,0,1,0.0,0.0
3,2020-01-03,"""A""","""e""","""A-e""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0
4,2020-01-04,"""D""","""c""","""D-c""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0


In [5]:
# Append the Laplace-smoothed mean of `hit` (alpha 2, offset 30, window 30) in
# two passes: first with `product_cat` as the category column, then with no
# category column, which the output labels `hit[ALL]`.
for group_col in ("product_cat", None):
    smoothed_mean = (
        LaplaceSmoothedMean()
        .lf(lf)
        .date_col("date")
        .numerator_col("hit")
        .denominator_col("count")
    )
    # The category column is only configured for the per-category pass.
    if group_col is not None:
        smoothed_mean = smoothed_mean.cat_col(group_col)
    lf = (
        smoothed_mean.index_col("index")
        .offset(30)
        .window(30)
        .rejoin(True)
        .laplace_alpha(2)
        .run()
    )

# Materialise the full frame so both the first and last rows are displayed.
lf.collect()

index,date,product_cat,product_subcat,product_code,hit,total_30_30_average_laplace(1)_smoothed,code_30_30_average_laplace(1)_smoothed,cred_wtd_30_30_average,cred_wtd_30_30_Z,cred_wtd_30_30_n,count,ROLLING_MEAN(hit[product_cat])[lag:30/win:30],ROLLING_MEAN(hit[ALL])[lag:30/win:30],SMOOTHED_MEAN(hit[product_cat])[lag:30/win:30],SMOOTHED_MEAN(hit[ALL])[lag:30/win:30]
i64,date,cat,cat,cat,i64,f64,f64,f64,f64,i64,i32,f64,f64,f64,f64
0,2020-01-01,"""D""","""f""","""D-f""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0,0.0,0.0
1,2020-01-02,"""B""","""f""","""B-f""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0,0.0,0.0
2,2020-01-03,"""A""","""d""","""A-d""",0,0.5,0.5,0.5,0.0,0,1,0.0,0.0,0.0,0.0
3,2020-01-03,"""A""","""e""","""A-e""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0,0.0,0.0
4,2020-01-04,"""D""","""c""","""D-c""",1,0.5,0.5,0.5,0.0,0,1,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2554,2023-08-27,"""A""","""c""","""A-c""",0,0.25,0.2,0.20238,0.44444,4,1,,0.237288,0.2,0.262295
2555,2023-08-28,"""B""","""c""","""B-c""",0,0.25424,0.16667,0.21046,0.5,5,1,,0.241379,0.210526,0.266667
2556,2023-08-28,"""C""","""d""","""C-d""",0,0.25424,0.6,0.28013,0.44444,4,1,,0.241379,0.352941,0.266667
2557,2023-08-28,"""C""","""c""","""C-c""",0,0.25424,0.5,0.26395,0.16667,1,1,,0.241379,0.352941,0.266667
