In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
from datetime import datetime, timedelta

import altair as alt
import numpy as np
import pandas as pd
import polars as pl

from centimators.feature_transformers import (
    LagTransformer,
    LogReturnTransformer,
    MovingAverageTransformer,
    RankTransformer,
)

## Mock price data

In [3]:
dates = [datetime.now() - timedelta(days=i) for i in range(90)]
dates.reverse()

# Generate 5 tickers
tickers = [f"Ticker{i}" for i in range(1, 21)]

# Generate random OHLCV data
data = {
    "ticker": [],
    "date": [],
    "open": [],
    "high": [],
    "low": [],
    "close": [],
    "volume": [],
}

for ticker in tickers:
    # Start with random base price between 10 and 1000
    base_price = np.random.uniform(10, 1000)
    for date in dates:
        # Generate daily price movements
        daily_return = np.random.normal(0.005, 0.03)  # Mean 0.5%, std 3%
        close = base_price * (1 + daily_return)
        high = close * (1 + abs(np.random.normal(0, 0.01)))
        low = close * (1 - abs(np.random.normal(0, 0.01)))
        open_price = close * (1 + np.random.normal(-0.005, 0.005))
        volume = int(np.random.lognormal(10, 1))

        data["ticker"].append(ticker)
        data["date"].append(date)
        data["open"].append(round(open_price, 2))
        data["high"].append(round(high, 2))
        data["low"].append(round(low, 2))
        data["close"].append(round(close, 2))
        data["volume"].append(volume)

        base_price = close  # Use today's close as tomorrow's base price

df_pandas = pd.DataFrame(data)
# Build the Polars frame from the same `data` dict used for `df_pandas`.
df_polars = pl.DataFrame(data)

In [None]:
# Closing price over time, one colored line per ticker.
price_lines = df_polars.plot.line(x="date", y="close", color="ticker")
price_lines.properties(title="Stock Prices Over Time", width=600, height=400)

## Instantiate transformers

In [None]:
# Window sizes live in named lists because later cells reuse them, both to
# rebuild the transformers and to derive the generated feature column names.
lag_windows = [0, 2, 4, 6, 8]
ma_windows = [5, 10, 20, 40]

log_return_transformer = LogReturnTransformer()
ranker = RankTransformer()
lagger = LagTransformer(windows=lag_windows)
ma_transformer = MovingAverageTransformer(windows=ma_windows)

# Only the last expression of a cell is rendered automatically, so show all
# four estimators explicitly.
display(log_return_transformer, ranker, lagger, ma_transformer)

### Use individually (dataframe-agnostic)

In [None]:
# Compare Pandas vs Polars performance.
# Each backend is timed on a single fit_transform call, so the numbers are
# noisy and should be read as a rough indication only.

# perf_counter() is the clock intended for measuring short intervals; it is
# monotonic and uses the highest resolution available, unlike time.time().

# Test with Pandas
start_time = time.perf_counter()
result_pd = ranker.fit_transform(df_pandas, date_series=df_pandas["date"])
pandas_time = time.perf_counter() - start_time

# Test with Polars
start_time = time.perf_counter()
result_pl = ranker.fit_transform(df_polars, date_series=df_polars["date"])
polars_time = time.perf_counter() - start_time

print(f"Pandas execution time: {pandas_time:.4f} seconds")
print(f"Polars execution time: {polars_time:.4f} seconds")
# Guard the ratio: an elapsed time of exactly zero would raise ZeroDivisionError.
if polars_time > 0:
    print(f"Polars Speedup: {pandas_time/polars_time:.2f}x")
else:
    print("Polars Speedup: n/a (elapsed time below timer resolution)")

# Display sample of results
print("\nSample of Pandas result:")
display(result_pd.head())
print("\nSample of Polars result:")
display(result_pl.head())

# Verify results are equivalent by comparing both outputs as pandas frames
pd_result = result_pd
pl_result = result_pl.to_pandas()
assert pd_result.equals(pl_result), "Results should be identical!"

### Or chain them together in a pipeline

In [7]:
from sklearn import set_config
from sklearn.pipeline import make_pipeline

# Metadata routing is opt-in in scikit-learn. Enabling it globally lets a
# pipeline pass the extra `date_series` / `ticker_series` arguments through to
# the steps that request them via `set_transform_request`.
set_config(enable_metadata_routing=True)

In [None]:
# Each step opts into the metadata it needs through scikit-learn's metadata
# routing API (set_transform_request). Steps are declared in the same order
# in which the pipeline applies them.
log_return_transformer = LogReturnTransformer().set_transform_request(
    ticker_series=True
)
ranker = RankTransformer().set_transform_request(date_series=True)
lagger = LagTransformer(windows=lag_windows).set_transform_request(
    ticker_series=True
)
ma_transformer = MovingAverageTransformer(windows=ma_windows).set_transform_request(
    ticker_series=True
)

pipeline_steps = (log_return_transformer, ranker, lagger, ma_transformer)
lagged_ranker = make_pipeline(*pipeline_steps)
display(lagged_ranker)

In [None]:
# Only these raw columns are fed through the pipeline; the date and ticker
# columns travel separately as routed metadata.
feature_names = ["open", "close", "volume"]
feature_frame = df_polars[feature_names]

transformed_df = lagged_ranker.fit_transform(
    feature_frame,
    ticker_series=df_polars["ticker"],
    date_series=df_polars["date"],
)
transformed_df

In [None]:
# Show the raw input next to one engineered feature.
# Column-wise concat puts the original columns and the pipeline output in one frame.
chart_df = pl.concat([df_polars, transformed_df], how="horizontal")

# Both panels share the same size.
panel_size = {"width": 300, "height": 300}

original_chart = chart_df.plot.line(x="date", y="close", color="ticker")
original_chart = original_chart.properties(
    title="Input: Raw Stock Prices Over Time", **panel_size
)

transformed_chart = chart_df.plot.line(
    x="date", y="close_logreturn_rank_lag0_ma20", color="ticker"
)
transformed_chart = transformed_chart.properties(
    title="Pipeline Output: Normalized/Smoothed Features", **panel_size
)
# Pin the feature panel's y-axis to the [0, 1] range.
transformed_chart.encoding.y.scale = alt.Scale(domain=[0, 1])

# `|` places the two charts side by side.
chart = original_chart | transformed_chart
chart.interactive()

In [None]:
def create_feature_visualization(df, columns, title, width=300, height=300):
    """Overlay several feature columns of one frame as lines over time.

    Args:
        df: Frame with a ``date`` column that supports ``unpivot`` (a Polars
            DataFrame in this notebook).
        columns: Names of the feature columns to draw, one line each.
        title: Title shown on the chart.
        width: Chart width forwarded to ``properties``.
        height: Chart height forwarded to ``properties``.

    Returns:
        The line chart, with its y-axis scale domain fixed to [0, 1].
    """
    # Wide -> long: one row per (date, feature) so the feature name can drive
    # the line color.
    long_df = df.unpivot(
        on=columns,
        index=["date"],
        variable_name="variable",
        value_name="value",
    )
    line_chart = long_df.plot.line(x="date", y="value", color="variable")
    line_chart = line_chart.properties(title=title, width=width, height=height)
    # The plotted features are normalized ranks, so pin the y-axis to [0, 1].
    line_chart.encoding.y.scale = alt.Scale(domain=[0, 1])
    return line_chart

# Look at a single ticker so the individual feature lines stay readable.
ticker = "Ticker1"
filtered_df = chart_df.filter(pl.col("ticker") == ticker)

# Left panel varies the moving-average window at lag 0;
# right panel varies the lag at a fixed 5-period moving average.
ma_columns = [f"close_logreturn_rank_lag0_ma{window}" for window in ma_windows]
lag_columns = [f"close_logreturn_rank_lag{lag}_ma5" for lag in lag_windows]

moving_average_chart = create_feature_visualization(
    df=filtered_df,
    columns=ma_columns,
    title=f"Different Moving Average Windows for {ticker}",
)
lagged_chart = create_feature_visualization(
    df=filtered_df,
    columns=lag_columns,
    title=f"Different Lag Periods for {ticker}",
)

side_by_side = moving_average_chart | lagged_chart
side_by_side.interactive()
