In [None]:
import polars as pl
from datetime import datetime, timedelta, date

In [None]:
import polars as pl
from datetime import datetime, timedelta

# Build one datetime per calendar day of January 2022 (both endpoints included).
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 1, 31)
dates = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# Synthetic price table: every ticker starts at a base value and grows by a
# fixed step per row. "Date" holds the str() form of each datetime, i.e. text.
stock_data = pl.DataFrame({
    "Date": list(map(str, dates)),
    "AAPL": [100 + step * 0.5 for step in range(len(dates))],
    "GOOG": [500 + step * 2 for step in range(len(dates))],
    "MSFT": [200 + step * 1 for step in range(len(dates))],
    "AMZN": [300 + step * 1.5 for step in range(len(dates))],
})

Time data

In [None]:
# Parse the text "Date" column into a temporal column. `str.to_date` is the
# parser called here, so the column becomes a date type (not a datetime).
# NOTE: `stock_data` is rebound to the parsed frame, so this cell is meant to
# run once after the data-creation cell; a second run would presumably fail
# because "Date" is then no longer a string column.
stock_data = stock_data.with_columns(
    Date=pl.col("Date").str.to_date("%Y-%m-%d %H:%M:%S")
)
print(stock_data.head())


In [None]:
# Derive a "year" column from "Date" through the `dt` namespace; the original
# columns are kept and the new column is appended.
df_with_year = stock_data.with_columns(year=pl.col("Date").dt.year())
print(df_with_year)

Filtering

In [None]:
# single date: keep only the rows whose "Date" equals 1 January 2022
filtered_df = stock_data.filter(pl.col("Date").eq(datetime(2022, 1, 1)))
print(filtered_df)

In [None]:
# date range: keep the rows whose "Date" falls between 1 and 10 January 2022,
# with both bounds included
filtered_range_df = stock_data.filter(
    pl.col("Date").is_between(
        lower_bound=datetime(2022, 1, 1),
        upper_bound=datetime(2022, 1, 10),
        closed="both",
    )
)
print(filtered_range_df)

Grouping

In [None]:
# Group by a fixed one-week window and average each ticker within the window.
# The frame is sorted on "Date" first because the dynamic grouping needs the
# index column in order. A "Week" column is then derived from each window's
# "Date" value.
df = (
    stock_data.sort("Date")
    .group_by_dynamic("Date", every="1w")
    .agg(pl.col("AAPL", "GOOG", "MSFT", "AMZN").mean())
    .with_columns(Week=pl.col("Date").dt.week())
)
print(df)

In [None]:
# expressions in a group_by_dynamic aggregation
# Start from a single "time" column holding every calendar day of 2021.
df = pl.DataFrame(
    {
        "time": pl.date_range(
            start=date(2021, 1, 1),
            end=date(2021, 12, 31),
            interval="1d",
            eager=True,
        )
    }
)

# One window per month (left-closed, one month long). Per window:
#   - "day/eom": the running row count, reversed, first three entries
#   - "days_in_month": day difference between the last and first row, plus one
out = df.group_by_dynamic(
    "time",
    every="1mo",
    period="1mo",
    closed="left",
).agg(
    pl.col("time").cum_count().reverse().head(3).alias("day/eom"),
    (
        (pl.col("time") - pl.col("time").first()).last().dt.total_days() + 1
    ).alias("days_in_month"),
)
print(out)

Resampling

In [None]:
# Sample frame for the resampling examples: a "time" column stepping by
# 30 minutes from 00:00 to 03:00 on 2021-12-16, plus a label column and a
# numeric column of the same length (seven rows).
df = pl.DataFrame(
    {
        "time": pl.datetime_range(
            datetime(2021, 12, 16),
            datetime(2021, 12, 16, 3),
            interval="30m",
            eager=True,
        ),
        "groups": ["a", "a", "a", "b", "b", "a", "a"],
        "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    }
)
print(df)

In [None]:
# Re-grid the 30-minute series onto a 15-minute step. Both examples below start
# from this same upsampled frame and differ only in how its gaps are filled.
upsampled = df.upsample(time_column="time", every="15m")

# upsample with forward fill: each null takes the last preceding value
out1 = upsampled.fill_null(strategy="forward")
print(out1)

# upsample with linear interpolation, followed by a forward fill for whatever
# is still null afterwards (presumably the non-numeric "groups" column)
out2 = upsampled.interpolate().fill_null(strategy="forward")
print(out2)