In [None]:
import polars as pl
import polars_ds as pds
import numpy as np
import time 

In [None]:
# Load the sample data from `test_dep.csv` (path is relative to the working
# directory) and preview the first rows as the cell output.
df = pl.read_csv("test_dep.csv")
df.head()
# Superseded draft (the next cell builds `Detector(df)` instead): detector = pds.diagnosis.Detector()

In [None]:
from polars_ds.diagnosis import Detector

# Wrap the loaded frame in a Detector; the following cells call its
# infer_corr / infer_dependency / plot_dependency methods.
detector = Detector(df)

In [None]:
# Run the detector's correlation inference; the return value is shown as the cell output.
detector.infer_corr()

In [None]:
# Run the detector's dependency inference; the return value is shown as the cell output.
detector.infer_dependency()

In [None]:
# Presumably renders the dependencies inferred by the detector — confirm against the polars_ds docs.
detector.plot_dependency()

In [None]:
# Replace `df` with polars_ds's generated sample data (default arguments) and
# preview it. The KS-test cell below reads "feature_1" and "feature_2" from it.
# NOTE(review): no seed is passed, so values presumably differ between runs — confirm.
df = pds.random_data()
df.head()

In [None]:
# Two-sample KS test between feature_1 and feature_2; the struct result
# (named "feature_1") is unnested into separate columns for display.
ks_struct = df.select(pds.query_ks_2samp("feature_1", "feature_2"))
ks_struct.unnest("feature_1")

In [None]:
# Tiny three-row frame used by the kNN-entropy cells below.
df = pl.DataFrame({"x": [1, 2, 10], "y": [2, 5, 10]})
df

In [None]:
# Evaluate the private `_knn_entropy` expression (k=1, dist="l2") on x and y,
# then pull the single scalar out of the 1x1 result.
knn_entropy_l2 = df.select(
    pl.col("x").num._knn_entropy(pl.col("y"), k=1, dist="l2")
)
knn_entropy_l2.item(0, 0)

In [None]:
# Side-by-side kNN entropy under the two distance options; the "l2" result is
# passed through sqrt() before being reported.
entropy_l2 = pds.query_knn_entropy("x", "y", k=1, dist="l2").sqrt().alias("l2")
entropy_inf = pds.query_knn_entropy("x", "y", k=1, dist="inf").alias("inf")
df.select(entropy_l2, entropy_inf)

In [None]:
def test_knn_entropy():
    """Check the k=1 kNN entropy of a fixed 3-point sample against 5.67 (+/- 0.01)."""
    # `pytest` is not imported in any visible cell, so the original
    # `pytest.approx(5.67, abs=0.01)` raised NameError. math.isclose with only
    # an absolute tolerance expresses the same check using the stdlib.
    import math

    df = pl.DataFrame(dict(x=[1, 2, 10], y=[2, 5, 10]))
    # NOTE(review): other cells call `.num._knn_entropy` / `pds.query_knn_entropy`;
    # confirm `.num.knn_entropy` exists in the installed polars_ds.
    ent = df.select(pl.col.x.num.knn_entropy(pl.col('y'), k=1))
    value = ent.item(0, 0)
    assert math.isclose(value, 5.67, rel_tol=0.0, abs_tol=0.01), value

In [None]:
# Synthetic 10,000-row frame: A = 0..9999, B = A + 1, CC = constant 1, plus a row index.
n_rows = 10000
df = pl.DataFrame(
    {
        "A": pl.int_range(n_rows, eager=True),
        "B": pl.int_range(n_rows, eager=True) + 1,
        "CC": [1] * n_rows,
    }
).with_row_index()

# Add the dummy-encoded columns for B, then a window-2 rolling mean of A as C.
df = df.with_columns(df.to_dummies("B")).with_columns(
    pl.col("A").rolling_mean(2).alias("C")
)
print(df)

In [None]:
from typing import List

def residual_multiple(cols: List[pl.Series], add_constant: bool) -> pl.Series:
    """Return the least-squares residuals of the first column regressed on the rest.

    Parameters
    ----------
    cols
        Input Series. After struct-typed Series are expanded into their fields,
        the first column is the target ``y`` and the remaining ones are the
        regressors.
    add_constant
        If True, a column of ones (intercept term) is appended to the regressors.

    Returns
    -------
    pl.Series
        ``y - y_hat`` aligned with the input rows. Rows in which any column is
        NaN after conversion to NumPy are excluded from the fit and come back
        as null. If no row is usable, every entry is null.
    """
    # Expand struct columns into their field Series; keep plain Series as-is.
    fields = [
        s
        for c in cols
        for s in (list(c.struct) if isinstance(c.dtype, pl.Struct) else [c])
    ]
    # Force float64: with all-integer inputs the stacked matrix (and the output
    # buffer derived from it) used to be integer-typed, which truncated the
    # residuals and could not hold NaN.
    arrays = [np.asarray(s.to_numpy(), dtype=np.float64) for s in fields]
    if add_constant:
        arrays.append(np.ones_like(arrays[0]))
    yx = np.vstack(arrays).T

    # Rows containing NaN in any column are skipped by the fit.
    valid = ~np.isnan(yx).any(axis=1)

    # Start from all-NaN so skipped rows need no separate refill step.
    out = np.full(yx.shape[0], np.nan, dtype=np.float64)
    if valid.any():
        y = yx[valid, 0]
        x = yx[valid, 1:]
        coef = np.linalg.lstsq(x, y, rcond=None)[0]
        out[valid] = y - x @ coef
    return pl.Series(out, nan_to_null=True)


def cs_neutralize_residual_multiple(y: pl.Expr, *more_x: pl.Expr, add_constant: bool = False) -> pl.Expr:
    """Build an expression holding the residuals of ``y`` regressed on ``more_x``.

    ``y`` and the regressors are passed together through ``pl.map_batches`` to
    ``residual_multiple``; ``add_constant`` is forwarded to it unchanged.
    """

    def _residuals(batch):
        return residual_multiple(batch, add_constant)

    exprs = [y] + list(more_x)
    return pl.map_batches(exprs, _residuals)

# Put the NumPy-based residuals (resid1) next to the residuals from the
# polars_ds lstsq expression (resid2) so they can be compared.
resid_reference = cs_neutralize_residual_multiple(pl.col("A"), pl.col("C")).alias("resid1")
resid_builtin = (
    pl.col("A")
    .num.lstsq(pl.col("C"), return_pred=True, skip_null=True)
    .struct.field("resid")
    .alias("resid2")
)
x = df.with_columns([resid_reference, resid_builtin])
print(x)

In [None]:
# Keep four feature columns under the B_*/A names used by the least-squares cells below.
# NOTE(review): the last visible assignment to `df` above builds A/B/CC/dummy
# columns, not feature_*; this cell appears to expect the `pds.random_data()`
# frame — confirm the intended execution order.
renames = {
    "feature_1": "B_1",
    "feature_2": "B_2",
    "feature_3": "B_3",
    "feature_4": "A",
}
df = df.select([pl.col(old).alias(new) for old, new in renames.items()])
df.head()

In [None]:
# Least squares with target A and regressors B_1..B_3, with add_bias=True and return_pred=True.
regressors = [pl.col(name) for name in ("B_1", "B_2", "B_3")]
df.select(
    pds.query_lstsq(*regressors, target=pl.col("A"), return_pred=True, add_bias=True)
)

In [None]:
# Regress A on every column whose name starts with "B_" and attach the
# "resid" field of the result as a column.
b_columns = [pl.col(name) for name in df.columns if name.startswith("B_")]
lstsq_resid = pl.col("A").num.lstsq(*b_columns, return_pred=True).struct.field("resid")
df.with_columns(lstsq_resid)

In [None]:
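# Regex-selector form of the residual expression; the next cell runs it
# without the trailing `.struct.field('resid')`:
# pl.col('A').num.lstsq(pl.col('^B_.*$'), return_pred=True).struct.field('resid')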

In [None]:
print(df.columns)
# Same regression, but the regressors are selected with a regex; the whole
# result struct is kept instead of extracting a single field.
lstsq_struct = pl.col("A").num.lstsq(pl.col("^B_.*$"), return_pred=True)  # .struct.field('resid')
df.with_columns(lstsq_struct).head()

In [None]:
# Radius query per row over (feature_1, feature_2, feature_3), reporting the
# matching row_num values as a list column plus its length minus one.
neighbors = pds.query_radius_ptwise(
    pl.col("feature_1"), pl.col("feature_2"), pl.col("feature_3"),  # Columns used as the coordinates in n-d space
    index=pl.col("row_num"),
    r=0.1,
    dist="l2",  # actually this is squared l2
    parallel=True,
).alias("best friends")
# -1 to remove the point itself
neighbor_count = (pl.col("best friends").list.len() - 1).alias("best friends count")

print(
    df.select(pl.col("row_num"), neighbors).with_columns(neighbor_count).head()
)

In [None]:
        # NOTE(review): orphaned argument list with no enclosing call — as code it
        # is a syntax error and stops a top-to-bottom run. It resembles the
        # arguments of the `pds.query_radius_ptwise` call in the previous cell.
        # Kept as comments until the intended call is restored or the cell is removed.
        # pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        # r = 0.1,
        # dist = "l2", # actually this is squared l2
        # parallel = True

In [None]:
import scipy

In [None]:
# Apply the `num.rfft` expression to feature_1 (presumably a real-input FFT — confirm).
# NOTE(review): an earlier cell re-selects `df` into B_*/A columns, so
# "feature_1" may not exist at this point — confirm execution order.
df.select(pl.col("feature_1").num.rfft())

In [None]:
# Apply the `num.rfft2` expression to feature_1.
# NOTE(review): confirm that `rfft2` exists in the installed polars_ds and how
# it differs from `rfft` in the previous cell.
df.select(pl.col("feature_1").num.rfft2())

In [None]:
import polars as pl
import polars_ds as pld  # noqa
from pandas._testing import makeTimeDataFrame

In [None]:
# Build the pandas test time-series frame, rename B/C/D to B_1/B_2/B_3 and
# convert it to polars, keeping the pandas index as a column.
df = pl.from_pandas(
    makeTimeDataFrame().rename(columns={"B": "B_1", "C": "B_2", "D": "B_3"}),
    include_index=True,
)