In [20]:
# import libraries
import numpy as np
import pandas as pd

from warnings import filterwarnings
filterwarnings("ignore")

from pathlib import Path

In [None]:
# Dataset folder, given relative to the notebook's working directory.
data_path = Path("01_data")

# Load the two CSV files found under data_path: train first, then test.
print("\n🔧 Loading CSV files...")
df_train = pd.read_csv(data_path.joinpath("train.csv"))
df_test = pd.read_csv(data_path.joinpath("test.csv"))

# Report each frame's (rows, columns) shape as loaded.
print(f"   - train.csv: {df_train.shape}")
print(f"   - test.csv: {df_test.shape}")

# Snapshot the header names exactly as read, before they are normalised below,
# so the original spelling stays available for reference.
original_train_cols = list(df_train.columns)
original_test_cols = list(df_test.columns)

# Normalise header names on both frames: lower-case first, then trim any
# surrounding whitespace.
df_train.columns = df_train.columns.map(lambda label: label.lower().strip())
df_test.columns = df_test.columns.map(lambda label: label.lower().strip())


🔧 Loading CSV files...
   - train.csv: (8990, 98)
   - test.csv: (10, 99)
   ✅ df_train columns cleaned
   ✅ df_test columns cleaned


In [22]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,date_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,v3,v4,v5,v6,v7,v8,v9,forward_returns,risk_free_rate,market_forward_excess_returns
0,0,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.002421,0.000301,-0.003038
1,1,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.008495,0.000303,-0.009114
2,2,0,0,0,1,0,0,0,0,1,...,,,,,,,,-0.009624,0.000301,-0.010243
3,3,0,0,0,1,0,0,0,0,0,...,,,,,,,,0.004662,0.000299,0.004046
4,4,0,0,0,1,0,0,0,0,0,...,,,,,,,,-0.011686,0.000299,-0.012301


In [23]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

Unnamed: 0,date_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,v4,v5,v6,v7,v8,v9,is_scored,lagged_forward_returns,lagged_risk_free_rate,lagged_market_forward_excess_returns
0,8980,0,0,0,0,1,0,0,1,0,...,0.828042,0.999172,0.759921,-0.803127,0.170966,-0.751909,True,0.003541,0.000161,0.003068
1,8981,0,0,0,0,1,0,0,1,0,...,0.831349,1.120336,0.556217,-0.686192,0.141865,-0.660326,True,-0.005964,0.000162,-0.006437
2,8982,0,0,0,0,1,0,0,0,1,...,0.832672,1.088992,0.665344,-0.459367,0.199405,-0.510979,True,-0.00741,0.00016,-0.007882
3,8983,0,0,0,0,1,0,0,0,1,...,0.835979,1.040988,0.594577,-0.561643,0.161706,-0.575997,True,0.00542,0.00016,0.004949
4,8984,0,0,0,0,0,0,1,0,1,...,0.839947,0.944593,0.715608,-0.692649,0.124669,-0.654045,True,0.008357,0.000159,0.007887


In [24]:
"""
train.csv Historic market data. 
The coverage stretches back decades; expect to see extensive missing values early on.

date_id - An identifier for a single trading day.
M* - Market Dynamics/Technical features.
E* - Macro Economic features.
I* - Interest Rate features.
P* - Price/Valuation features.
V* - Volatility features.
S* - Sentiment features.
MOM* - Momentum features.
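D* - Dummy/Binary features.
Note: this notebook lower-cases all column names after loading, so these prefixes appear as m*, e*, i*, p*, v*, s*, mom*, d* in df_train and df_test.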

forward_returns - The returns from buying the S&P 500 and selling it a day later. Train set only.

risk_free_rate - The federal funds rate. Train set only.

market_forward_excess_returns - Forward returns relative to expectations. 
Computed by subtracting the rolling five-year mean forward returns and winsorizing 
the result using a median absolute deviation (MAD) with a criterion of 4. Train set only.
""";

In [25]:
"""
test.csv 
A mock test set representing the structure of the unseen test set. 
The test set used for the public leaderboard is a copy of the last 180 date IDs in the train set. 
As a result, the public leaderboard scores are not meaningful. 
The unseen copy of this file served by the evaluation API may be updated during the model training phase.

date_id - An identifier for a single trading day.
[feature_name] - The feature columns are the same as in train.csv.
is_scored - Whether this row is included in the evaluation metric calculation. 
During the model training phase this will be true for the first 180 rows only. Test set only.

lagged_forward_returns - The returns from buying the S&P 500 and selling it a day later, provided with a lag of one day.

lagged_risk_free_rate - The federal funds rate, provided with a lag of one day.

lagged_market_forward_excess_returns - Forward returns relative to expectations. 
Computed by subtracting the rolling five-year mean forward returns and winsorizing the result 
using a median absolute deviation (MAD) with a criterion of 4, provided with a lag of one day.

""";