In [None]:
# copy of penalized_linear_hackathon
import datetime
import pandas as pd
import numpy as np
import polars as pl
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error


In [None]:

# Display current time (for timing purposes)
print(datetime.datetime.now())

# Set working directory. Set the HACKATHON_DIR environment variable to point at
# another location; when it is unset the original local path is used.
work_dir = os.environ.get("HACKATHON_DIR", "C:\\Users\\Naifu\\Desktop\\Hackathon")


# Read predictors: the names in the "variable" column are the factor columns
file_path = os.path.join(work_dir, "factor_char_list.csv")
stock_vars = pl.read_csv(file_path)["variable"].to_list()

# Read sample data with Polars
# NOTE(review): newer Polars releases call the `dtypes` argument
# `schema_overrides` - confirm against the installed version.
file_path = os.path.join(work_dir, "ret_sample.csv")
raw = pl.read_csv(
    file_path,
    try_parse_dates=True,    # automatically parse "ret_eom" as date
    dtypes={var: pl.Float32 for var in stock_vars},  # enforce float32 for factors
    low_memory=True,
    rechunk=True
)

# Add a predictor date column.
# char_date holds values such as 20061031, which the CSV reader infers as an
# integer column; the .str namespace only works on strings, so cast first.
raw = raw.with_columns([
    pl.col("char_date").cast(pl.Utf8).str.strptime(pl.Date, "%Y%m%d").alias("date")
])

# Drop missing stock_ret early
raw = raw.filter(pl.col("stock_ret").is_not_null())




2025-09-27 14:06:41.197054


ValueError: Integer column has NA values in column 3

In [None]:
# Define the return variable
ret_var = "stock_ret"

# The loading cell builds `raw` with Polars, while this cell and the ones after
# it use the pandas API (boolean-mask indexing, groupby, fillna, rank).
# Convert once here; a frame that is already pandas is used unchanged.
# NOTE(review): Polars' to_pandas() needs pyarrow installed - confirm.
raw_pd = raw.to_pandas() if isinstance(raw, pl.DataFrame) else raw

# Filter out missing returns; .copy() gives an independent frame so the column
# assignment below does not write into a slice of raw_pd.
new_set = raw_pd[raw_pd[ret_var].notna()].copy()
del raw_pd  # drop the extra reference so a converted frame can be freed

# Use char_date (already in your CSV) as the predictor date
new_set["date"] = pd.to_datetime(new_set["char_date"], format="%Y%m%d")
print(new_set[["date", "ret_eom", ret_var]].head(10))
print("Date range:", new_set["date"].min(), "to", new_set["date"].max())

# One group per predictor date; consumed by the monthly rank transform
monthly = new_set.groupby("date")

MemoryError: Unable to allocate 6.55 GiB for an array with shape (149, 5901414) and data type float64

In [None]:

# Rank-transform each stock variable monthly.
# Within every date group each variable is median-filled, dense-ranked and
# rescaled to the interval [-1, 1]. The work is done column-wise on the whole
# block of stock_vars instead of one column at a time, and the monthly frames
# are collected in a list and concatenated once (concatenating inside the loop
# re-copies everything accumulated so far on every iteration).
scaled_months = []
for date, monthly_raw in monthly:
    chars = monthly_raw[stock_vars]
    chars = chars.fillna(chars.median(skipna=True))  # Fill missing values with median
    ranks = chars.rank(method="dense") - 1            # dense ranks, starting at 0
    rank_max = ranks.max()
    scaled = (ranks / rank_max) * 2 - 1               # map [0, max] onto [-1, 1]

    # A maximum that is not > 0 means the column was constant (max rank 0) or
    # entirely missing (max is NaN); such columns are set to zero.
    for var in rank_max.index[~(rank_max > 0)]:
        scaled[var] = 0.0  # Handle all missing values
        print(f"Warning: {date} {var} set to zero.")

    # Replace the raw characteristics with the scaled ones, keeping every
    # other column and the column order of the original group.
    scaled_months.append(
        monthly_raw.assign(**{var: scaled[var] for var in stock_vars})
    )

# pd.concat raises on an empty list, so fall back to an empty frame
data = pd.concat(scaled_months, ignore_index=True) if scaled_months else pd.DataFrame()

# Set initial training start date
starting = pd.to_datetime("2005-01-01")
pred_out = pd.DataFrame()




In [None]:

# Convert the "date" column with pd.to_datetime; the backtest cell compares it
# against pandas timestamps.
data["date"] = data["date"].pipe(pd.to_datetime)


In [None]:
# Quick sanity check of the transformed frame: column dtypes, first rows,
# and the span of dates it covers.
for summary in (data.dtypes, data.head()):
    print(summary)
date_column = data["date"]
print(date_column.min(), date_column.max())


id                    object
date          datetime64[ns]
ret_eom       datetime64[ns]
gvkey                float64
iid                   object
                   ...      
age                  float64
qmj                  float64
qmj_prof             float64
qmj_growth           float64
qmj_safety           float64
Length: 159, dtype: object
                id       date    ret_eom     gvkey  iid excntry  stock_ret  \
0  comp_257918_01W 2006-11-30 2006-11-30  257918.0  01W     KOR   0.139202   
1  comp_257919_01W 2006-11-30 2006-11-30  257919.0  01W     KOR   0.238227   
2  comp_257920_01W 2006-11-30 2006-11-30  257920.0  01W     KOR   0.192876   
3  comp_257921_01W 2006-11-30 2006-11-30  257921.0  01W     SGP  -0.032908   
4  comp_257922_01W 2006-11-30 2006-11-30  257922.0  01W     KOR   0.156564   

   year  month  char_date  ...  betadown_252d  prc_highprc_252d  corr_1260d  \
0  2006     11   20061031  ...      -0.265839         -0.145427   -0.043652   
1  2006     11   20061031  

In [None]:
print("Starting training")


def _fit_best_on_validation(make_model, log_lambdas, X_train, Y_train_dm, X_val, Y_val, Y_mean):
    """Fit one model per penalty and return the one with the lowest validation MSE.

    Parameters:
        make_model: callable taking a log10 penalty and returning an unfitted estimator.
        log_lambdas: sequence of log10 penalties to try, in order.
        X_train, Y_train_dm: training features and demeaned training target.
        X_val, Y_val: validation features and (not demeaned) validation target.
        Y_mean: training-target mean, added back to predictions before scoring.

    Returns:
        The fitted estimator with the smallest validation MSE; ties go to the
        earliest penalty (same choice as np.argmin). None if log_lambdas is empty.
    """
    best_model, best_mse = None, np.inf
    for log_lambda in log_lambdas:
        model = make_model(log_lambda)
        model.fit(X_train, Y_train_dm)
        mse = mean_squared_error(Y_val, model.predict(X_val) + Y_mean)
        if mse < best_mse:
            best_model, best_mse = model, mse
    return best_model


# Penalized models as (output column, log10 penalty grid, estimator factory).
# They do not depend on the window, so they are defined once.
penalized_models = [
    ("lasso", np.arange(-4, 4.1, 0.1),
     lambda i: Lasso(alpha=(10**i), max_iter=1000000, fit_intercept=False)),
    ("ridge", np.arange(-1, 8.1, 0.1),
     lambda i: Ridge(alpha=(10**i * 0.5), fit_intercept=False)),
    ("en", np.arange(-4, 4.1, 0.1),
     lambda i: ElasticNet(alpha=(10**i), max_iter=1000000, fit_intercept=False)),
]

# Per-window prediction frames. Collected in a list and concatenated once
# after the loop; starting from an empty list also means that re-running this
# cell does not append duplicate rows to an existing pred_out.
window_preds = []
end_date = pd.to_datetime("2026-01-01")

# Expanding window backtest loop
counter = 0
while (starting + pd.DateOffset(years=11 + counter)) <= end_date:
    print("Train "+str(counter))
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),  # 8 years for training
        starting + pd.DateOffset(years=10 + counter),  # 2 years for validation
        starting + pd.DateOffset(years=11 + counter),  # 1 year for testing
    ]

    # Split the dataset into training, validation, and test sets
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    # Skip if any set is empty
    if len(train) == 0 or len(validate) == 0 or len(test) == 0:
        print("Empty")
        counter += 1
        continue

    # Standardize the data with statistics from the training set only.
    # The scaled values go straight into arrays instead of being written back
    # into train/validate/test, which are filtered slices of `data`.
    scaler = StandardScaler().fit(train[stock_vars])

    # Prepare training, validation, and test sets
    X_train = scaler.transform(train[stock_vars])
    Y_train = train[ret_var].values
    X_val = scaler.transform(validate[stock_vars])
    Y_val = validate[ret_var].values
    X_test = scaler.transform(test[stock_vars])
    Y_test = test[ret_var].values

    # Demean the returns; models are fit without intercept and the mean is
    # added back to every prediction.
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    # Linear regression prediction
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred = test[["year", "month", "ret_eom", "id", ret_var]].copy()
    reg_pred["ols"] = reg.predict(X_test) + Y_mean

    # Lasso, Ridge and ElasticNet: choose the penalty on the validation set,
    # then predict the test set with the selected model.
    for column, log_lambdas, make_model in penalized_models:
        reg = _fit_best_on_validation(
            make_model, log_lambdas, X_train, Y_train_dm, X_val, Y_val, Y_mean
        )
        reg_pred[column] = reg.predict(X_test) + Y_mean

    # Keep this window's predictions
    window_preds.append(reg_pred)

    # Move to the next year
    counter += 1

    print("reg_pred columns:", reg_pred.columns.tolist())

# Build pred_out once; stays an empty frame when every window was skipped
pred_out = pd.concat(window_preds, ignore_index=True) if window_preds else pd.DataFrame()
if not window_preds:
    print(
        "No window had train, validation and test data. Data dates span",
        data["date"].min(), "to", data["date"].max(),
    )
 


Starting training
Train 0
Empty
Train 1
Empty
Train 2
Empty
Train 3
Empty
Train 4
Empty
Train 5
Empty
Train 6
Empty
Train 7
Empty
Train 8
Empty
Train 9
Empty
Train 10
Empty


In [None]:
# Show the column index of the test split left over from the last backtest window
test_columns = test.columns
print(test_columns)


Index(['id', 'date', 'ret_eom', 'gvkey', 'iid', 'excntry', 'stock_ret', 'year',
       'month', 'char_date',
       ...
       'betadown_252d', 'prc_highprc_252d', 'corr_1260d', 'betabab_1260d',
       'rmax5_rvol_21d', 'age', 'qmj', 'qmj_prof', 'qmj_growth', 'qmj_safety'],
      dtype='object', length=159)


In [None]:
# Output the predicted values and report out-of-sample R2 per model.
# When no backtest window produced predictions, pred_out is an empty frame
# without the return column: skip the export (so output.csv is not replaced
# by an empty file) and say so instead of failing on a missing column.
if pred_out.empty or ret_var not in pred_out.columns:
    print("No predictions available: pred_out is empty, nothing written to output.csv.")
else:
    pred_out.to_csv("output.csv", index=False)

    # Print OOS R2 (the denominator is the sum of squared realized returns, not demeaned)
    yreal = pred_out[ret_var].values
    for model_name in ["ols", "lasso", "ridge", "en"]:
        ypred = pred_out[model_name].values
        r2 = 1 - np.sum(np.square((yreal - ypred))) / np.sum(np.square(yreal))
        print(model_name, r2)

print(datetime.datetime.now())

KeyError: 'stock_ret'