In [4]:
# Cell 1: Imports and Load Data

import pandas as pd
import numpy as np
import os
from pathlib import Path
import datetime as dt
from datetime import timezone, timedelta # <<<<<<<<<<<< ADD timezone HERE
import logging
import json # For saving feature_columns_list
import joblib # For saving the model and scaler

from sklearn.model_selection import train_test_split # We'll do a chronological split manually
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler # For feature scaling
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Logging Setup ---
# The logger name carries the wall-clock second at which this cell ran, so every log line
# can be traced back to a specific execution of the notebook.
logger_name = "model_training_" + dt.datetime.now().strftime('%Y%m%d_%H%M%S')
logger = logging.getLogger(logger_name)

# The level is applied unconditionally; only the handler attachment needs a guard.
logger.setLevel(logging.INFO)

if len(logger.handlers) == 0:
    # First time this logger name is seen: attach a single stream handler.
    # Skipped when a handler is already present so re-running the cell within the same
    # second does not duplicate every message.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s.%(funcName)s:%(lineno)d - %(message)s')
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)

# --- Configuration ---
# Paths are derived from the notebook's working directory:
#   <cwd>/../features        -> input feature CSVs (kalshi_btc_features_target_v1_*.csv)
#   <cwd>/../trained_models  -> output directory for the scaler / model artifacts
current_notebook_dir = Path.cwd()
FEATURES_DIR = current_notebook_dir.parent / "features"
logger.info(f"Attempting to find feature files in: {FEATURES_DIR.resolve()}")

try:
    if not FEATURES_DIR.exists():
        raise FileNotFoundError(f"The directory {FEATURES_DIR.resolve()} does not exist. Please check the path.")
    # Newest file first, ordered by content-modification time (st_mtime).
    # os.path.getctime is deliberately NOT used: on POSIX systems it reports the last
    # inode/metadata change, so renaming, moving or chmod-ing an old CSV would make it
    # look like the most recent one. The file name breaks ties deterministically.
    feature_files = sorted(
        FEATURES_DIR.glob("kalshi_btc_features_target_v1_*.csv"),
        key=lambda csv_path: (csv_path.stat().st_mtime, csv_path.name),
        reverse=True,
    )
    if not feature_files:
        raise FileNotFoundError(f"No feature CSV files found in {FEATURES_DIR.resolve()} matching pattern 'kalshi_btc_features_target_v1_*.csv'")
    FEATURES_CSV_PATH = feature_files[0]
    logger.info(f"Using features CSV: {FEATURES_CSV_PATH.resolve()}")
except FileNotFoundError as e:
    # Missing directory / no matching file: log and leave the path unset so the
    # loading step can report the problem instead of crashing here.
    logger.critical(str(e))
    FEATURES_CSV_PATH = None
except Exception as e:
    # Any other filesystem problem (permissions, broken links, ...) is handled the same way.
    logger.critical(f"Error finding features CSV: {e}")
    FEATURES_CSV_PATH = None

# Output directory for trained artifacts; created on demand (including parents).
MODEL_OUTPUT_DIR = current_notebook_dir.parent / "trained_models"
MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
logger.info(f"Trained models will be saved in: {MODEL_OUTPUT_DIR.resolve()}")


# --- Load the Features DataFrame ---
# df_model_data starts out empty and is only replaced when the CSV is read successfully,
# so later cells can always test `df_model_data.empty`.
df_model_data = pd.DataFrame()

if not FEATURES_CSV_PATH:
    # The discovery step above could not determine a file.
    logger.critical("FEATURES_CSV_PATH was not set (likely due to an error finding the file). Cannot load data.")
elif not FEATURES_CSV_PATH.exists():
    # A path was chosen but the file is no longer there.
    logger.critical(f"Features CSV file not found at the specified path: {FEATURES_CSV_PATH.resolve()}")
else:
    try:
        df_model_data = pd.read_csv(FEATURES_CSV_PATH)
        logger.info(f"Successfully loaded features data from: {FEATURES_CSV_PATH.resolve()}")
        logger.info(f"Shape of loaded data: {df_model_data.shape}")

        # Quick look at the loaded frame: first rows, dtypes / non-null counts, numeric summary.
        print("--- Data Head ---")
        print(df_model_data.head())
        print("\n--- Data Info ---")
        df_model_data.info()
        print("\n--- Data Description (Numerical) ---")
        print(df_model_data.describe().to_string())
    except Exception as load_error:
        logger.critical(f"Error loading features CSV {FEATURES_CSV_PATH.resolve()}: {load_error}")

if df_model_data.empty:
    logger.warning("DataFrame df_model_data is empty. Subsequent cells might fail.")

2025-05-19 12:26:10,265 - INFO - model_training_20250519_122610.<module>:35 - Attempting to find feature files in: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/features
2025-05-19 12:26:10,267 - INFO - model_training_20250519_122610.<module>:44 - Using features CSV: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/features/kalshi_btc_features_target_v1_20250519_121558.csv
2025-05-19 12:26:10,267 - INFO - model_training_20250519_122610.<module>:54 - Trained models will be saved in: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/trained_models
2025-05-19 12:26:12,299 - INFO - model_training_20250519_122610.<module>:64 - Successfully loaded features data from: /Users/omarabul-hassan/Desktop/projects/kalshi/notebooks/features/kalshi_btc_features_target_v1_20250519_121558.csv
2025-05-19 12:26:12,299 - INFO - model_training_20250519_122610.<module>:65 - Shape of loaded data: (1294800, 29)


--- Data Head ---
          kalshi_market_ticker  decision_point_ts_utc  kalshi_strike_price  \
0  KXBTCD-25MAY1522-T106249.99             1747357200            106249.99   
1  KXBTCD-25MAY1522-T106249.99             1747357260            106249.99   
2  KXBTCD-25MAY1522-T106249.99             1747357320            106249.99   
3  KXBTCD-25MAY1522-T106249.99             1747357380            106249.99   
4  KXBTCD-25MAY1522-T106249.99             1747357440            106249.99   

   btc_price_t_minus_1  btc_mom_5m  btc_mom_10m  btc_mom_15m  btc_mom_30m  \
0            103764.81       73.29       -69.79      -182.17        22.80   
1            103709.10       17.67      -111.60      -246.90       -68.90   
2            103785.66       83.21         7.89      -114.10      -109.26   
3            103691.25      -34.40         5.91      -227.89      -285.74   
4            103629.36     -110.19       -98.42      -303.61      -299.95   

   btc_vol_15m  btc_sma_10m  ...  TARGET_btc_diff_

In [5]:
# Cell 2: Data Preprocessing, Feature Selection, and Splitting
#
# Reads:   df_model_data, logger, MODEL_OUTPUT_DIR (all defined in Cell 1)
# Writes:  df_model_data (sorted, imputed and NaN rows dropped, in place), feature_columns,
#          X, y, X_train, y_train, X_test, y_test, X_train_scaled_df, X_test_scaled_df, scaler
# Files:   feature_scaler_v1.joblib and feature_columns_v1.json inside MODEL_OUTPUT_DIR

# Reset every output first. If this run stops early, Cell 3 then sees empty objects and
# refuses to train instead of silently reusing results left over from an earlier execution.
X, X_train, X_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
X_train_scaled_df, X_test_scaled_df = pd.DataFrame(), pd.DataFrame()
y, y_train, y_test = pd.Series(dtype=float), pd.Series(dtype=float), pd.Series(dtype=float)
scaler = None

if df_model_data.empty:
    logger.error("df_model_data is empty. Cannot proceed with preprocessing and splitting. Please ensure Cell 1 ran correctly and loaded data.")
else:
    logger.info(f"Starting preprocessing for df_model_data with shape: {df_model_data.shape}")

    # --- 1. Ensure Chronological Order ---
    # The train/test split below is positional, so the frame must be ordered by time.
    df_model_data.sort_values(by='decision_point_ts_utc', inplace=True)
    df_model_data.reset_index(drop=True, inplace=True)
    logger.info("Data sorted by 'decision_point_ts_utc'.")

    # --- 2. Handle Missing Values (NaNs) ---
    # Everything that is neither an identifier nor the target is treated as a feature.
    identifier_cols = ['kalshi_market_ticker', 'decision_point_ts_utc', 'kalshi_strike_price']
    target_col = 'TARGET_btc_diff_from_strike'
    feature_columns = [col for col in df_model_data.columns if col not in identifier_cols + [target_col]]
    logger.info(f"Potential feature columns ({len(feature_columns)}): {feature_columns}")

    # Fraction of the (chronologically earliest) rows used for training. Defined here
    # because the median imputation below must only look at the training portion.
    split_ratio = 0.8

    # Rows without a target can be neither fitted nor scored; remove them before any
    # statistic is derived from the data.
    rows_before_target_drop = len(df_model_data)
    df_model_data.dropna(subset=[target_col], inplace=True)
    rows_missing_target = rows_before_target_drop - len(df_model_data)
    if rows_missing_target:
        logger.warning(f"Dropped {rows_missing_target} rows with a missing '{target_col}'.")

    nan_summary = df_model_data[feature_columns].isnull().sum()
    nan_summary = nan_summary[nan_summary > 0].sort_values(ascending=False)
    if nan_summary.empty:
        logger.info("No NaNs found in the selected feature columns.")
    else:
        logger.warning(f"NaN values found in feature columns:\n{nan_summary}")

        # --- Imputation rules, selected by column-name substring ---
        #   kalshi_mid_chg*, btc_mom*       -> 0   (treated as "no change")
        #   btc_vol*, btc_sma*, btc_ema*    -> median of the TRAINING period
        #   btc_rsi*                        -> 50  (neutral RSI)
        #   kalshi_yes_bid / kalshi_yes_ask -> 0 / 100 (no quote == widest possible spread)
        # Any feature without a rule causes the row to be dropped.
        cols_to_fill_zero = [
            col for col in feature_columns
            if 'kalshi_mid_chg' in col or 'btc_mom' in col
        ]
        cols_to_fill_median = [
            col for col in feature_columns
            if 'btc_vol' in col or 'btc_sma' in col or 'btc_ema' in col
        ]
        cols_to_fill_rsi_neutral = [col for col in feature_columns if 'btc_rsi' in col]

        # Missing quotes: bid -> 0 and ask -> 100, which makes spread 100 and mid 50.
        # This is a very basic strategy and might introduce bias or noise; a binary
        # "kalshi_quotes_available" feature would be a cleaner alternative.
        if 'kalshi_yes_bid' in df_model_data.columns:
            df_model_data['kalshi_yes_bid'] = df_model_data['kalshi_yes_bid'].fillna(0)
            logger.info("Filled NaNs in 'kalshi_yes_bid' with 0.")
        if 'kalshi_yes_ask' in df_model_data.columns:
            df_model_data['kalshi_yes_ask'] = df_model_data['kalshi_yes_ask'].fillna(100)
            logger.info("Filled NaNs in 'kalshi_yes_ask' with 100.")

        # Keep the derived quote columns consistent with the filled bid/ask values.
        if 'kalshi_yes_bid' in df_model_data.columns and 'kalshi_yes_ask' in df_model_data.columns:
            if 'kalshi_spread' in df_model_data.columns:
                df_model_data['kalshi_spread'] = df_model_data['kalshi_yes_ask'] - df_model_data['kalshi_yes_bid']
                logger.info("Recalculated 'kalshi_spread' after filling bid/ask.")
            if 'kalshi_mid_price' in df_model_data.columns:
                df_model_data['kalshi_mid_price'] = (df_model_data['kalshi_yes_bid'] + df_model_data['kalshi_yes_ask']) / 2
                logger.info("Recalculated 'kalshi_mid_price' after filling bid/ask.")

        for col in cols_to_fill_zero:
            df_model_data[col] = df_model_data[col].fillna(0)
            logger.info(f"Filled NaNs in '{col}' with 0.")

        for col in cols_to_fill_rsi_neutral:
            df_model_data[col] = df_model_data[col].fillna(50)
            logger.info(f"Filled NaNs in '{col}' with 50.")

        # Drop rows that still miss a feature for which there is no imputation rule
        # (e.g. volume / open interest). This happens BEFORE the median step so that the
        # row count, and therefore the train/test boundary, is already final when the
        # medians are computed.
        original_row_count = len(df_model_data)
        features_without_median_rule = [col for col in feature_columns if col not in cols_to_fill_median]
        df_model_data.dropna(subset=features_without_median_rule, inplace=True)

        # Median imputation uses the training portion only. Computing the median over the
        # whole frame would leak test-period statistics into the training features.
        train_row_count = int(len(df_model_data) * split_ratio)
        for col in cols_to_fill_median:
            median_val = df_model_data[col].iloc[:train_row_count].median()
            if pd.isna(median_val):
                # No observed value in the training period; affected rows are dropped below.
                logger.warning(f"No training-period median available for '{col}'; rows with NaNs in it will be dropped.")
                continue
            df_model_data[col] = df_model_data[col].fillna(median_val)
            logger.info(f"Filled NaNs in '{col}' with its training-period median ({median_val:.4f}).")

        # Safety net: only removes rows when a median could not be computed above.
        df_model_data.dropna(subset=feature_columns, inplace=True)
        logger.info(f"Dropped {original_row_count - len(df_model_data)} rows due to remaining NaNs in feature columns after imputation attempts.")
        logger.info("Successfully handled NaNs in feature columns.")

    # --- 3. Define Features (X) and Target (y) ---
    if df_model_data.empty:
        logger.error("df_model_data is empty after NaN handling. Cannot proceed to define X, y, or split.")
    else:
        X = df_model_data[feature_columns].copy()
        y = df_model_data[target_col].copy()
        logger.info(f"Defined X (features) with shape: {X.shape}")
        logger.info(f"Defined y (target) with shape: {y.shape}")

        # --- 4. Split Data (Chronological) ---
        # First `split_ratio` of the rows -> train, remainder -> test. The frame is sorted
        # by 'decision_point_ts_utc', so no test row is earlier than any training row.
        # NOTE(review): the cut is positional, so rows sharing the boundary timestamp can
        # land on both sides, and targets of late training rows may resolve inside the test
        # period. Consider splitting on market/resolution boundaries - confirm with the
        # feature-generation code.
        split_index = int(len(X) * split_ratio)

        if split_index == 0:
            logger.error(f"Only {len(X)} usable row(s) after preprocessing; not enough data for a train/test split.")
        else:
            X_train = X.iloc[:split_index]
            y_train = y.iloc[:split_index]
            X_test = X.iloc[split_index:]
            y_test = y.iloc[split_index:]

            logger.info(f"Data split chronologically:")
            logger.info(f"  X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            logger.info(f"  X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

            # Log the time range covered by each side of the split. fromtimestamp() is
            # given the raw column values, i.e. they are interpreted as epoch seconds.
            train_start_ts = df_model_data['decision_point_ts_utc'].iloc[0]
            train_end_ts = df_model_data['decision_point_ts_utc'].iloc[split_index - 1]
            test_start_ts = df_model_data['decision_point_ts_utc'].iloc[split_index]
            test_end_ts = df_model_data['decision_point_ts_utc'].iloc[-1]

            logger.info(f"  Training data from: {dt.datetime.fromtimestamp(train_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(train_end_ts, tz=timezone.utc).isoformat()}")
            logger.info(f"  Test data from:     {dt.datetime.fromtimestamp(test_start_ts, tz=timezone.utc).isoformat()} to {dt.datetime.fromtimestamp(test_end_ts, tz=timezone.utc).isoformat()}")

            # --- 5. Feature Scaling ---
            # The scaler is fitted on the training rows only and then applied to the test rows.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Back to DataFrames (same columns / index) for easier inspection downstream.
            X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
            X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

            logger.info("Features scaled using StandardScaler.")
            print("\nSample of scaled training features (X_train_scaled_df head):")
            print(X_train_scaled_df.head())

            # Persist the fitted scaler so the same transformation can be reapplied later.
            scaler_path = MODEL_OUTPUT_DIR / "feature_scaler_v1.joblib"
            joblib.dump(scaler, scaler_path)
            logger.info(f"Scaler saved to: {scaler_path}")

            # Persist the ordered feature list. This is CRITICAL for the backtester:
            # it must feed the same features, in the same order, to the scaler and model.
            feature_columns_list_path = MODEL_OUTPUT_DIR / "feature_columns_v1.json"
            with open(feature_columns_list_path, 'w') as f:
                json.dump(list(feature_columns), f)
            logger.info(f"List of feature columns saved to: {feature_columns_list_path}")

2025-05-19 12:27:20,651 - INFO - model_training_20250519_122610.<module>:8 - Starting preprocessing for df_model_data with shape: (1294800, 29)
2025-05-19 12:27:20,762 - INFO - model_training_20250519_122610.<module>:14 - Data sorted by 'decision_point_ts_utc'.
2025-05-19 12:27:20,763 - INFO - model_training_20250519_122610.<module>:28 - Potential feature columns (25): ['btc_price_t_minus_1', 'btc_mom_5m', 'btc_mom_10m', 'btc_mom_15m', 'btc_mom_30m', 'btc_vol_15m', 'btc_sma_10m', 'btc_sma_30m', 'btc_ema_12m', 'btc_ema_26m', 'btc_rsi', 'distance_to_strike', 'time_until_market_close_min', 'hour_of_day_utc', 'day_of_week_utc', 'hour_of_day_edt', 'kalshi_yes_bid', 'kalshi_yes_ask', 'kalshi_spread', 'kalshi_mid_price', 'kalshi_volume_t_minus_1', 'kalshi_open_interest_t_minus_1', 'kalshi_mid_chg_1m', 'kalshi_mid_chg_3m', 'kalshi_mid_chg_5m']
kalshi_mid_chg_5m                 500715
kalshi_mid_chg_3m                 485432
kalshi_mid_chg_1m                 467088
kalshi_yes_bid               


Sample of scaled training features (X_train_scaled_df head):
    btc_price_t_minus_1  btc_mom_5m  btc_mom_10m  btc_mom_15m  btc_mom_30m  \
18            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
19            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
20            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
21            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   
22            -1.880955   -0.267215    -0.671256    -0.499148      0.23473   

    btc_vol_15m  btc_sma_10m  btc_sma_30m  btc_ema_12m  btc_ema_26m  ...  \
18    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
19    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
20    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
21    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   
22    -0.365124     -1.86631    -1.870939    -1.867321    -1.868163  ...   

    hour_of_

In [6]:
# Cell 3: Model Training and Evaluation
#
# Reads:   X_train_scaled_df, X_test_scaled_df, y_train, y_test (Cell 2), logger, MODEL_OUTPUT_DIR
# Writes:  linear_model, y_pred_test, df_results, coefficients
# Files:   linear_regression_btc_predictor_v1.joblib and lr_model_params_v1.json
#          inside MODEL_OUTPUT_DIR

if 'X_train_scaled_df' not in globals() or X_train_scaled_df.empty:
    logger.error("Scaled training data (X_train_scaled_df) not found or is empty. Please ensure Cell 2 ran successfully.")
elif 'y_train' not in globals() or y_train.empty:
    logger.error("y_train is not available. Cannot train model.")
else:
    logger.info("--- Starting Model Training ---")

    # --- 1. Initialize and Train Linear Regression Model ---
    linear_model = LinearRegression()
    try:
        logger.info(f"Training LinearRegression model on {X_train_scaled_df.shape[0]} samples...")
        linear_model.fit(X_train_scaled_df, y_train)
        logger.info("LinearRegression model training complete.")

        # --- 2. Make Predictions on the Test Set ---
        logger.info(f"Making predictions on the test set ({X_test_scaled_df.shape[0]} samples)...")
        y_pred_test = linear_model.predict(X_test_scaled_df)

        # --- 3. Evaluate Model Performance (Regression Metrics) ---
        if 'y_test' not in globals() or y_test.empty:
            logger.error("y_test is not available. Cannot evaluate model.")
        else:
            mae = mean_absolute_error(y_test, y_pred_test)
            mse = mean_squared_error(y_test, y_pred_test)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred_test)

            logger.info("\n--- Regression Model Evaluation Metrics (Test Set) ---")
            logger.info(f"  Mean Absolute Error (MAE):      {mae:.4f}")
            logger.info(f"  Mean Squared Error (MSE):       {mse:.4f}")
            logger.info(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
            logger.info(f"  R-squared (R2 Score):           {r2:.4f}")

            # --- 4. Custom Evaluation (Trading-Oriented) ---
            # The target is BTC_price_at_resolution - kalshi_strike_price, so its sign says
            # which side of the strike BTC finished on:
            #   > 0 -> above strike (favors Kalshi YES), < 0 -> below strike (favors Kalshi NO).
            actual_outcome_sign = np.sign(y_test)
            predicted_outcome_sign = np.sign(y_pred_test)

            # Rows where BTC landed exactly on the strike (sign 0) are excluded from the
            # accuracy so the metric only covers outcomes with a clear side.
            has_clear_side = (actual_outcome_sign != 0).to_numpy()
            clear_side_count = int(has_clear_side.sum())
            if clear_side_count > 0:
                side_matches = actual_outcome_sign.to_numpy()[has_clear_side] == predicted_outcome_sign[has_clear_side]
                sign_accuracy = side_matches.sum() / clear_side_count
                logger.info(f"  Accuracy (Predicting Side of Strike): {sign_accuracy:.4f} (on {clear_side_count} non-zero outcome samples)")
            else:
                logger.info("  Accuracy (Predicting Side of Strike): N/A (no non-zero actual outcomes)")

            # Side-by-side frame of predictions vs actuals (indexed like y_test).
            df_results = pd.DataFrame({
                'actual_target': y_test,
                'predicted_target': y_pred_test,
                'actual_sign': actual_outcome_sign,
                'predicted_sign': predicted_outcome_sign
            })
            print("\n--- Sample of Test Set Predictions vs Actuals ---")
            print(df_results.head(10).to_string())

            # --- 5. Inspect Model Coefficients ---
            logger.info("\n--- Model Coefficients ---")
            logger.info(f"Intercept: {linear_model.intercept_:.4f}")

            # Take the names from the frame the model was actually fitted on. This is
            # guaranteed to match linear_model.coef_ in length and order, even if the
            # global feature list was changed after the split.
            trained_feature_names = list(X_train_scaled_df.columns)

            coefficients = pd.DataFrame({'feature': trained_feature_names, 'coefficient': linear_model.coef_})
            coefficients['abs_coefficient'] = np.abs(coefficients['coefficient'])
            coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

            print("\nTop Coefficients (by absolute value):")
            print(coefficients.head(20).to_string())

            # --- 6. Save the Trained Model ---
            model_path = MODEL_OUTPUT_DIR / "linear_regression_btc_predictor_v1.joblib"
            joblib.dump(linear_model, model_path)
            logger.info(f"Trained Linear Regression model saved to: {model_path}")

            # Plain-JSON export of the parameters, as an alternative to loading the joblib
            # model. The model was fitted on X_train_scaled_df, so these weights apply to
            # SCALED features: a consumer must scale its inputs the same way before taking
            # the dot product. Values are cast to built-in float so json.dump never has to
            # serialize a numpy scalar.
            model_params_for_backtest = {
                'intercept': float(linear_model.intercept_),
                'coefficients': {
                    name: float(weight)
                    for name, weight in zip(trained_feature_names, linear_model.coef_)
                },
                'feature_order': trained_feature_names  # order for a consistent dot product
            }
            params_path = MODEL_OUTPUT_DIR / "lr_model_params_v1.json"
            with open(params_path, 'w') as f:
                json.dump(model_params_for_backtest, f, indent=4)
            logger.info(f"Model parameters (intercept, coefs, feature_order) saved to: {params_path}")

    except Exception as e:
        # Keep the notebook running, but record the full traceback for diagnosis.
        logger.critical(f"An error occurred during model training or evaluation: {e}", exc_info=True)
        if hasattr(linear_model, 'coef_'):
            # coef_ only exists after a successful fit().
            logger.info("Model was fitted; the failure happened during evaluation or while saving artifacts.")
        else:
            logger.info("Model fitting did not complete.")

2025-05-19 12:45:29,800 - INFO - model_training_20250519_122610.<module>:8 - --- Starting Model Training ---
2025-05-19 12:45:29,801 - INFO - model_training_20250519_122610.<module>:12 - Training LinearRegression model on 669507 samples...
2025-05-19 12:45:29,956 - INFO - model_training_20250519_122610.<module>:20 - LinearRegression model training complete.
2025-05-19 12:45:29,956 - INFO - model_training_20250519_122610.<module>:23 - Making predictions on the test set (167377 samples)...
2025-05-19 12:45:29,964 - INFO - model_training_20250519_122610.<module>:35 - 
--- Regression Model Evaluation Metrics (Test Set) ---
2025-05-19 12:45:29,964 - INFO - model_training_20250519_122610.<module>:36 -   Mean Absolute Error (MAE):      628.1029
2025-05-19 12:45:29,964 - INFO - model_training_20250519_122610.<module>:37 -   Mean Squared Error (MSE):       612007.2330
2025-05-19 12:45:29,965 - INFO - model_training_20250519_122610.<module>:38 -   Root Mean Squared Error (RMSE): 782.3089
2025-05


--- Sample of Test Set Predictions vs Actuals ---
         actual_target  predicted_target  actual_sign  predicted_sign
1105142        -608.82       -692.555904         -1.0            -1.0
1105143       -2053.99       -708.475295         -1.0            -1.0
1105144        -858.82       -827.559232         -1.0            -1.0
1105145         -53.99        880.466277         -1.0             1.0
1105146        1446.01       1423.703398          1.0             1.0
1105147       -1858.82      -1058.190248         -1.0            -1.0
1105148         891.18       1294.841098          1.0             1.0
1105149        -608.82       -697.171473         -1.0            -1.0
1105150       -2108.82      -1120.463571         -1.0            -1.0
1105151         641.18       1237.183344          1.0             1.0

Top Coefficients (by absolute value):
                           feature  coefficient  abs_coefficient
0              btc_price_t_minus_1  3084.456147      3084.456147
9         