In [1]:
import polars as pl

import polars_ts as pts  # noqa

# Create sample dataframe with columns `unique_id`, `ds`, and `y`.
df = (
    pl.scan_parquet("https://datasets-nixtla.s3.amazonaws.com/m4-hourly.parquet")
    # Get only the first 100 timeseries
    .filter(pl.col("unique_id").str.strip_chars_start("H").cast(pl.Int64) < 100)
    # normalize values per time series
    .with_columns((pl.col("y") - pl.mean("y")) / pl.std("y").over("unique_id"))
    .collect()
)
df

unique_id,ds,y
str,i64,f64
"""H1""",1,-125.26119
"""H1""",2,-125.382489
"""H1""",3,-125.382489
"""H1""",4,-125.55486
"""H1""",5,-125.861299
…,…,…
"""H99""",744,1.012409
"""H99""",745,0.722225
"""H99""",746,0.52868
"""H99""",747,0.31602


### Compute pairwise DTW distances between time series in two DataFrames, using extensive parallelism.

**Highlights:**
- **Less Complex Space:** This version uses O(m) memory instead of allocating the full (n+1)×(m+1) matrix.
- **Avoid Self-comparison:** The code does not compute the distance for the same time series `(i,i)`.
- **Avoid Duplicate Pairs:** If a pair `(i,j)` is computed, the symmetric pair `(j,i)` is not recomputed.
- **Support for Mixed `unique_id` Types:** The algorithm works for unique IDs that are either strings or integers. It also works, if the unique_ids in both dataframes are of different dtype (one df str, the other int).


In [2]:
pts.compute_pairwise_dtw(df, df).sort("id_1", "id_2")

id_1,id_2,dtw
str,str,f64
"""H1""","""H10""",323593.054822
"""H1""","""H11""",95024.421833
"""H1""","""H12""",88756.652224
"""H1""","""H13""",26698.268043
"""H1""","""H14""",395201.77239
…,…,…
"""H96""","""H98""",34955.942545
"""H96""","""H99""",12529.282701
"""H97""","""H98""",50947.917265
"""H97""","""H99""",3011.452282


In [3]:
# Cast the unique_ids of the dataframes to Int.
df_casted = df.with_columns(pl.col("unique_id").str.strip_chars_start("H").cast(pl.Int16))
pts.compute_pairwise_dtw(df_casted, df_casted).sort("id_1", "id_2")

id_1,id_2,dtw
i16,i16,f64
1,2,73422.324601
1,3,41605.126607
1,4,80803.394294
1,5,74254.6651
1,6,80182.83111
…,…,…
96,98,34955.942545
96,99,12529.282701
97,98,50947.917265
97,99,3011.452282


In [4]:
# It also works for mixed types, i.e. one dataframe has str, the other has int for unique_id.
# Now we don't have duplicate calculations to skip, that is why we have more combinations to compute.
pts.compute_pairwise_dtw(df, df_casted).sort("id_1", "id_2")

id_1,id_2,dtw
str,i16,f64
"""H1""",1,0.0
"""H1""",2,73422.324601
"""H1""",3,41605.126607
"""H1""",4,80803.394294
"""H1""",5,74254.6651
…,…,…
"""H99""",95,3135.595726
"""H99""",96,12529.282701
"""H99""",97,3011.452282
"""H99""",98,47485.225245
