In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the first two training partitions from the local parquet dataset.
train_0 = pd.read_parquet(path='dataset/train_0.parquet')
train_1 = pd.read_parquet(path='dataset/train_1.parquet')

In [8]:
# The third partition is used as the held-out evaluation set. Columns that
# contain any missing value are removed right after loading.
test_df = (
    pd.read_parquet('dataset/train_2.parquet')
    .dropna(axis='columns', how='any')
)

In [4]:
# Keep only the columns of partition 1 that have no missing values.
train_1_na = train_1.dropna(axis='columns', how='any')

In [5]:
# Keep only the columns of partition 0 that have no missing values.
train_0_na = train_0.dropna(axis='columns', how='any')

In [6]:
# Stack the two cleaned partitions row-wise.
# Each partition dropped its *own* set of incomplete columns, so a column may
# survive in one partition but not the other. With the default outer join such
# a column would come back filled with NaN for every row of the other
# partition, undoing the column-wise dropna. join='inner' keeps only the
# columns that are complete in both partitions.
train_df = pd.concat([train_0_na, train_1_na], axis=0, join='inner')

In [7]:
# The concatenated frame still carries each partition's own row labels;
# replace them with a fresh 0..n-1 index and discard the old labels.
train_df = train_df.reset_index(drop=True)

In [11]:
# Preview the first five rows of the cleaned evaluation set.
test_df.head(n=5)

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,...,feature_78,target_9,target_3,target_4,target_5,target_6,target_7,target_1,target_8,target_2
0,170,0,0,2.112212,1.06033,1.515157,0.352634,-0.447763,11,7,...,-0.421823,-0.293646,-0.061842,-0.305413,-0.419151,-0.111796,-0.535104,-0.044332,-0.039061,-0.744789
1,170,0,1,2.760715,0.482468,1.184037,0.171099,-0.247298,11,7,...,3.111076,-0.075267,-0.35936,-1.270054,-0.018332,-0.040286,-1.417509,0.08584,0.487232,-0.124533
2,170,0,2,1.813596,1.020798,1.318752,0.398088,-0.247506,81,2,...,0.458474,-5.0,-5.0,0.194658,-5.0,-5.0,-5.0,1.5834,0.018712,-1.055035
3,170,0,3,0.926893,0.510098,0.645825,0.198428,-0.129691,4,3,...,17.805511,3.336086,2.051951,2.400644,0.96273,-0.939277,1.84587,-2.372452,-1.663179,-4.585349
4,170,0,7,1.665231,0.547458,1.009267,0.178444,-0.172451,11,7,...,-0.249322,-0.707027,-0.344866,-1.248052,-0.129645,-3.145927,-0.452708,0.300044,0.489202,0.242737


In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white
from sklearn.metrics import r2_score  # For R² calculation

# --- Build aligned design matrices, targets and weights ---------------------

# Use only the feature columns present in BOTH frames, so the train and test
# design matrices end up with the same columns in the same order.
feature_cols = [
    col for col in train_df.columns
    if col.startswith("feature_") and col in test_df.columns
]
if not feature_cols:
    raise ValueError("train_df and test_df share no 'feature_*' columns")

# Treat +/-inf like a missing value and drop every row that contains one.
X_train = train_df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()
X_test = test_df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()

# Restrict the source frames to the surviving rows so that the targets and
# weights taken from them below line up with X_train / X_test.
# Re-running this cell is harmless: already-filtered rows pass the filter again.
train_df = train_df.loc[X_train.index]
test_df = test_df.loc[X_test.index]

# Add the intercept column exactly once.
# has_constant="add" forces the column to be added even when some feature
# happens to be constant within one split. The default ("skip") would silently
# omit the intercept for that split only and leave train and test with
# different columns.
X_train = sm.add_constant(X_train, has_constant="add")
X_test = sm.add_constant(X_test, has_constant="add")

# Targets
y_train_1 = train_df["target_1"]
y_train_2 = train_df["target_2"]
y_test_1 = test_df["target_1"]
y_test_2 = test_df["target_2"]

# Per-row weights
weights_train = train_df["weight"]
weights_test = test_df["weight"]

# Function to fit OLS, check heteroscedasticity, apply WLS if needed, and compute metrics
def fit_and_evaluate(y_train, y_test, weights_train, weights_test, target_name,
                     white_max_rows=100_000, white_seed=0):
    """Fit a linear model for one target and score it on the test split.

    The function reads the module-level design matrices ``X_train`` and
    ``X_test``; they must already contain the intercept column and be
    row-aligned with the targets and weights passed in.

    Steps:
      1. Fit OLS and print its summary.
      2. Run White's heteroscedasticity test on the OLS residuals.
      3. If the test rejects at the 5% level, refit with WLS and use that
         model; otherwise keep the OLS model.
      4. Predict on ``X_test`` and report a weighted MSE and weighted R².

    Parameters
    ----------
    y_train, y_test : array-like
        Target values for the train / test rows.
    weights_train, weights_test : array-like
        Per-row weights for the train / test rows.
        NOTE(review): these are treated as importance weights (a larger value
        means the row matters more), which is how the evaluation below uses
        them -- confirm against the dataset description.
    target_name : str
        Label used in the printed messages.
    white_max_rows : int or None, default 100_000
        Upper bound on the number of rows passed to White's test. The test
        regresses on all squares and cross-products of the regressors, so its
        cost grows quickly with both rows and columns. Pass ``None`` to use
        every row.
    white_seed : int, default 0
        Seed for the row subsample used by White's test, so the reported
        p-value is reproducible.

    Returns
    -------
    tuple
        ``(final_model, mse, r2)``: the fitted statsmodels results object, the
        weighted mean squared error and the weighted R² on the test split.

    Raises
    ------
    ValueError
        If the test weights do not sum to a positive number.
    """
    print(f"\nTraining model for {target_name}...\n")

    # Fit OLS
    ols_model = sm.OLS(y_train, X_train).fit()
    print(f"OLS Summary for {target_name}:\n", ols_model.summary())

    # White's test for heteroscedasticity, on a reproducible row subsample
    # when the training set is larger than white_max_rows.
    resid = np.asarray(ols_model.resid)
    exog = np.asarray(X_train)
    n_rows = len(resid)
    if white_max_rows is not None and n_rows > white_max_rows:
        rows = np.random.default_rng(white_seed).choice(
            n_rows, size=white_max_rows, replace=False
        )
        rows.sort()
        resid = resid[rows]
        exog = exog[rows]
    white_test = het_white(resid, exog)
    p_value = white_test[1]  # Second value is the LM-test p-value
    print(f"White's Test p-value for {target_name}: {p_value}")

    if p_value < 0.05:
        print(f"Heteroscedasticity detected in {target_name}. Applying WLS...")
        # WLS minimises sum(weights * residual**2). Passing the row weights
        # directly makes the training objective match the weighted error
        # reported below. The previous 1/weight inverted that emphasis and
        # divided by zero for zero-weight rows.
        wls_model = sm.WLS(y_train, X_train, weights=weights_train).fit()
        print(f"WLS Summary for {target_name}:\n", wls_model.summary())
        final_model = wls_model
    else:
        print(f"No heteroscedasticity detected in {target_name}. Using OLS.")
        final_model = ols_model

    # Predictions
    y_pred = final_model.predict(X_test)

    total_weight = np.sum(weights_test)
    if not total_weight > 0:
        raise ValueError("weights_test must sum to a positive value")

    # Weighted MSE: weighted squared errors normalised by the total weight,
    # so the value does not scale with the average magnitude of the weights.
    mse = np.sum(weights_test * (y_test - y_pred) ** 2) / total_weight
    print(f"Weighted MSE for {target_name}: {mse:.4f}")

    # Weighted R²: sample_weight weights both the residual and the total sum
    # of squares and centres the latter on the weighted mean. Scaling y by
    # sqrt(weight) and calling r2_score unweighted centres on the mean of the
    # scaled values instead, which is a different quantity.
    r2 = r2_score(y_test, y_pred, sample_weight=weights_test)
    print(f"R² on test set for {target_name}: {r2:.4f}")

    return final_model, mse, r2

# Fit and score the model for target_1.
model_1, mse_1, r2_1 = fit_and_evaluate(
    y_train=y_train_1,
    y_test=y_test_1,
    weights_train=weights_train,
    weights_test=weights_test,
    target_name="target_1",
)

# Fit and score the model for target_2 (same features and weights).
model_2, mse_2, r2_2 = fit_and_evaluate(
    y_train=y_train_2,
    y_test=y_test_2,
    weights_train=weights_train,
    weights_test=weights_test,
    target_name="target_2",
)



Training model for target_1...

OLS Summary for target_1:
                             OLS Regression Results                            
Dep. Variable:               target_1   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     409.1
Date:                Wed, 12 Feb 2025   Prob (F-statistic):               0.00
Time:                        21:54:58   Log-Likelihood:            -1.2820e+06
No. Observations:              972105   AIC:                         2.564e+06
Df Residuals:                  972068   BIC:                         2.564e+06
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

KeyboardInterrupt: 

In [None]:
# Select feature columns present in both train_df and test_df
feature_cols = [
    col for col in train_df.columns
    if col.startswith("feature_") and col in test_df.columns
]
if not feature_cols:
    raise ValueError("train_df and test_df share no 'feature_*' columns")

# Select features from both datasets; treat +/-inf like a missing value and
# drop every row that contains one.
X_train = train_df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()
X_test = test_df[feature_cols].replace([np.inf, -np.inf], np.nan).dropna()

# Ensure targets and weights match the filtered design matrices
train_df = train_df.loc[X_train.index]  # Keep only valid rows
test_df = test_df.loc[X_test.index]

# Add constant term for the intercept.
# has_constant="add" forces the column to be added even when a feature is
# constant within one split. The default ("skip") would silently omit the
# intercept for that split only and leave train and test with different
# columns.
X_train = sm.add_constant(X_train, has_constant="add")
X_test = sm.add_constant(X_test, has_constant="add")

# Targets
y_train_1 = train_df["target_1"]
y_train_2 = train_df["target_2"]
y_test_1 = test_df["target_1"]
y_test_2 = test_df["target_2"]

# Weights column -- the column is named "weight" (singular), as shown in the
# test_df.head() output; "weights" raised a KeyError.
weights_train = train_df["weight"]
weights_test = test_df["weight"]
