In [3]:
!pip install "polars[all]"

Collecting deltalake>=1.0.0 (from polars[all])
  Downloading deltalake-1.1.4-cp39-abi3-win_amd64.whl.metadata (4.5 kB)
Collecting pyiceberg>=0.7.1 (from polars[all])
  Downloading pyiceberg-0.10.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting gevent (from polars[all])
  Downloading gevent-25.9.1-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting altair>=5.4.0 (from polars[all])
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting great-tables>=0.8.0 (from polars[all])
  Downloading great_tables-0.18.0-py3-none-any.whl.metadata (12 kB)
Collecting narwhals>=1.14.2 (from altair>=5.4.0->polars[all])
  Downloading narwhals-2.6.0-py3-none-any.whl.metadata (11 kB)
Collecting arro3-core>=0.5.0 (from deltalake>=1.0.0->polars[all])
  Downloading arro3_core-0.6.3-cp312-cp312-win_amd64.whl.metadata (363 bytes)
Collecting deprecated>=1.2.18 (from deltalake>=1.0.0->polars[all])
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting commonmark

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.37.1 requires cachetools<6,>=4.0, but you have cachetools 6.2.0 which is incompatible.


In [None]:
import os, datetime
import numpy as np
import polars as pl
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
import pandas as pd

# Settings
work_dir = r"C:\Users\Naifu\Desktop\Finance Hackathon 2025\Hackathon"
ret_var = "stock_ret"
start_date = datetime.date(2005, 1, 1)
end_date   = datetime.date(2026, 1, 1)

print("Imports and settings loaded at", datetime.datetime.now())


Imports and settings loaded at 2025-09-29 10:55:12.518315


In [5]:
fac_path = os.path.join(work_dir, "factor_char_list.csv")
stock_vars = pl.read_csv(fac_path)["variable"].to_list()
print("Loaded predictors:", len(stock_vars))
print(stock_vars[:10])  # preview first 10


Loaded predictors: 147
['age', 'aliq_at', 'aliq_mat', 'ami_126d', 'at_be', 'at_gr1', 'at_me', 'at_turnover', 'be_gr1a', 'be_me']


In [7]:
csv_path = os.path.join(work_dir, "ret_sample.csv")
raw = pl.read_csv(
    csv_path,
    try_parse_dates=True,
    schema_overrides={
        **{v: pl.Float32 for v in stock_vars},  # predictors as floats
        "gvkey": pl.Utf8, "iid": pl.Utf8, "id": pl.Utf8,
    },
    low_memory=True,
    rechunk=True,
)

# Add predictor date (char_date as YYYYMMDD -> pl.Date)
raw = raw.with_columns(
    pl.col("char_date").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d").alias("date")
)

raw = raw.filter(pl.col(ret_var).is_not_null())
print("Raw shape:", raw.shape)
print(raw.select(["date", "ret_eom", ret_var]).head(5))


Raw shape: (6401414, 159)
shape: (5, 3)
┌────────────┬──────────┬───────────┐
│ date       ┆ ret_eom  ┆ stock_ret │
│ ---        ┆ ---      ┆ ---       │
│ date       ┆ i64      ┆ f64       │
╞════════════╪══════════╪═══════════╡
│ 2005-01-31 ┆ 20050228 ┆ -0.143457 │
│ 2005-01-31 ┆ 20050228 ┆ 0.028077  │
│ 2005-01-31 ┆ 20050228 ┆ -0.168627 │
│ 2005-01-31 ┆ 20050228 ┆ 0.086271  │
│ 2005-01-31 ┆ 20050228 ┆ 0.149056  │
└────────────┴──────────┴───────────┘


In [8]:
def rank_to_unit(df: pl.DataFrame) -> pl.DataFrame:
    out = df
    for var in stock_vars:
        median = out[var].median()
        out = out.with_columns(
            pl.when(pl.col(var).is_null()).then(pl.lit(median, dtype=pl.Float32)).otherwise(pl.col(var)).alias(var)
        )
        out = out.with_columns((pl.col(var).rank("dense") - 1).cast(pl.Float32).alias(var))
        maxv = out[var].max()
        if maxv is None or maxv == 0:
            out = out.with_columns(pl.lit(0.0, dtype=pl.Float32).alias(var))
        else:
            out = out.with_columns(((pl.col(var) / pl.lit(maxv, dtype=pl.Float32)) * 2.0 - 1.0).alias(var))
    return out

data = raw.group_by("date", maintain_order=True).map_groups(rank_to_unit)
print("Rank-transformed shape:", data.shape)
print(data.head(5))


Rank-transformed shape: (6401414, 159)
shape: (5, 159)
┌────────────┬────────────┬──────────┬────────┬───┬───────────┬───────────┬────────────┬───────────┐
│ id         ┆ date       ┆ ret_eom  ┆ gvkey  ┆ … ┆ qmj       ┆ qmj_prof  ┆ qmj_growth ┆ qmj_safet │
│ ---        ┆ ---        ┆ ---      ┆ ---    ┆   ┆ ---       ┆ ---       ┆ ---        ┆ y         │
│ str        ┆ date       ┆ i64      ┆ str    ┆   ┆ f32       ┆ f32       ┆ f32        ┆ ---       │
│            ┆            ┆          ┆        ┆   ┆           ┆           ┆            ┆ f32       │
╞════════════╪════════════╪══════════╪════════╪═══╪═══════════╪═══════════╪════════════╪═══════════╡
│ comp_00108 ┆ 2005-01-31 ┆ 20050228 ┆ 001081 ┆ … ┆ -0.875554 ┆ -0.582499 ┆ -0.484126  ┆ -0.595185 │
│ 1_01C      ┆            ┆          ┆        ┆   ┆           ┆           ┆            ┆           │
│ comp_00109 ┆ 2005-01-31 ┆ 20050228 ┆ 001096 ┆ … ┆ -0.416403 ┆ -0.155202 ┆ -0.094292  ┆ -0.286774 │
│ 6_01C      ┆            ┆         

In [9]:
pred_frames = []
counter = 0

max_data_date = data.select(pl.col("date").max()).item()
safe_end = min(end_date, (max_data_date - datetime.timedelta(days=365)))

while (start_date + datetime.timedelta(days=365*(11 + counter))) <= safe_end:
    cutoff = [
        start_date,
        start_date + datetime.timedelta(days=365*(8  + counter)),
        start_date + datetime.timedelta(days=365*(10 + counter)),
        start_date + datetime.timedelta(days=365*(11 + counter)),
    ]
    print("Window", counter, cutoff)

    train = data.filter((pl.col("date") >= cutoff[0]) & (pl.col("date") < cutoff[1]))
    val   = data.filter((pl.col("date") >= cutoff[1]) & (pl.col("date") < cutoff[2]))
    test  = data.filter((pl.col("date") >= cutoff[2]) & (pl.col("date") < cutoff[3]))

    if train.height == 0 or val.height == 0 or test.height == 0:
        print("Empty window, skipping")
        counter += 1
        continue

    # Convert to numpy
    X_train = train.select(stock_vars).to_numpy()
    Y_train = train.select(ret_var).to_numpy().ravel()
    X_val   = val.select(stock_vars).to_numpy()
    Y_val   = val.select(ret_var).to_numpy().ravel()
    X_test  = test.select(stock_vars).to_numpy()
    Y_test  = test.select(ret_var).to_numpy().ravel()

    # Standardize
    scaler = StandardScaler().fit(X_train)
    X_train, X_val, X_test = scaler.transform(X_train), scaler.transform(X_val), scaler.transform(X_test)

    Y_mean = Y_train.mean()
    Y_dm   = Y_train - Y_mean

    reg_pred = test.select(["year", "month", "ret_eom", "id", ret_var]).to_pandas()

    # Models
    reg = LinearRegression(fit_intercept=False).fit(X_train, Y_dm)
    reg_pred["ols"] = reg.predict(X_test) + Y_mean

    # lambdas = np.arange(-4, 4.1, 0.1)
    # val_mse = [mean_squared_error(Y_val, Lasso(alpha=10**p, max_iter=1_000_000, fit_intercept=False).fit(X_train, Y_dm).predict(X_val) + Y_mean) for p in lambdas]
    # best = lambdas[int(np.argmin(val_mse))]
    # reg_pred["lasso"] = Lasso(alpha=10**best, max_iter=1_000_000, fit_intercept=False).fit(X_train, Y_dm).predict(X_test) + Y_mean

    lambdas = np.arange(-1, 8.1, 0.1)
    val_mse = [mean_squared_error(Y_val, Ridge(alpha=(10**p)*0.5, fit_intercept=False).fit(X_train, Y_dm).predict(X_val) + Y_mean) for p in lambdas]
    best = lambdas[int(np.argmin(val_mse))]
    reg_pred["ridge"] = Ridge(alpha=(10**best)*0.5, fit_intercept=False).fit(X_train, Y_dm).predict(X_test) + Y_mean

    # lambdas = np.arange(-4, 4.1, 0.1)
    # val_mse = [mean_squared_error(Y_val, ElasticNet(alpha=10**p, max_iter=1_000_000, fit_intercept=False).fit(X_train, Y_dm).predict(X_val) + Y_mean) for p in lambdas]
    # best = lambdas[int(np.argmin(val_mse))]
    # reg_pred["en"] = ElasticNet(alpha=10**best, max_iter=1_000_000, fit_intercept=False).fit(X_train, Y_dm).predict(X_test) + Y_mean

    pred_frames.append(reg_pred)
    counter += 1

print("Finished backtest loop with", counter, "windows")


Window 0 [datetime.date(2005, 1, 1), datetime.date(2012, 12, 30), datetime.date(2014, 12, 30), datetime.date(2015, 12, 30)]
Window 1 [datetime.date(2005, 1, 1), datetime.date(2013, 12, 30), datetime.date(2015, 12, 30), datetime.date(2016, 12, 29)]
Window 2 [datetime.date(2005, 1, 1), datetime.date(2014, 12, 30), datetime.date(2016, 12, 29), datetime.date(2017, 12, 29)]
Window 3 [datetime.date(2005, 1, 1), datetime.date(2015, 12, 30), datetime.date(2017, 12, 29), datetime.date(2018, 12, 29)]
Window 4 [datetime.date(2005, 1, 1), datetime.date(2016, 12, 29), datetime.date(2018, 12, 29), datetime.date(2019, 12, 29)]
Window 5 [datetime.date(2005, 1, 1), datetime.date(2017, 12, 29), datetime.date(2019, 12, 29), datetime.date(2020, 12, 28)]
Window 6 [datetime.date(2005, 1, 1), datetime.date(2018, 12, 29), datetime.date(2020, 12, 28), datetime.date(2021, 12, 28)]
Window 7 [datetime.date(2005, 1, 1), datetime.date(2019, 12, 29), datetime.date(2021, 12, 28), datetime.date(2022, 12, 28)]
Window 8

In [None]:
print(test.columns)


Index(['id', 'date', 'ret_eom', 'gvkey', 'iid', 'excntry', 'stock_ret', 'year',
       'month', 'char_date',
       ...
       'betadown_252d', 'prc_highprc_252d', 'corr_1260d', 'betabab_1260d',
       'rmax5_rvol_21d', 'age', 'qmj', 'qmj_prof', 'qmj_growth', 'qmj_safety'],
      dtype='object', length=159)


In [10]:
pred_out = pd.concat(pred_frames, ignore_index=True)
pred_out_pl = pl.from_pandas(pred_out)
pred_out_pl.write_csv(os.path.join(work_dir, "output.csv"))
print("Saved to output.csv")

yreal = pred_out[ret_var].to_numpy()
ols_and_ridge = ["ols", "ridge"]
all = ["ols", "lasso", "ridge", "en"]
for name in ols_and_ridge:
    ypred = pred_out[name].to_numpy()
    r2 = 1 - np.sum((yreal - ypred)**2) / np.sum(yreal**2)
    print(name, r2)

print(datetime.datetime.now())


Saved to output.csv
ols 4.046042676120187e-06
ridge 2.91401647589673e-06
2025-09-29 21:55:56.130209
