# README_pipeline_guide.md

# File 1: preprocess_data.py

In [None]:
# --- preprocess_data.py ---
import numpy as np
import pandas as pd
import math
import sklearn
import sklearn.preprocessing
import datetime
from scipy.stats.mstats import winsorize

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole script.
pd.options.mode.chained_assignment = None

# *** DEFINE HELPER FUNCTION HERE ***
def find_col(df, potential_names, default=None):
    """Return the first entry of ``potential_names`` that is a column of ``df``.

    Candidates are tried in the order given. If none of them is a column,
    a warning listing the candidates is printed and ``default`` is returned.
    """
    not_found = object()
    match = next(
        (candidate for candidate in potential_names if candidate in df.columns),
        not_found,
    )
    if match is not not_found:
        return match
    print(f"  Warning: Could not find column using names: {potential_names}")
    return default
# *** END HELPER FUNCTION DEFINITION ***

# --- Configuration ---
# Input/output paths and the names of the columns this script creates.
INPUT_FILE = "Cleaned_OSEFX_Market_Macro_Data.csv"
OUTPUT_FILE = "Cleaned_OSEFX_Market_Macro_Data_PREPROCESSED.csv"  # Output for the pipeline
TARGET_COL_NAME = "TargetReturn_t+1"             # Next-row excess return (model target)
NEXT_RAW_RET_COL_NAME = "NextMonthlyReturn_t+1"  # Next-row raw return for portfolio eval
MKT_CAP_ORIG_COL_NAME = "MarketCap_orig"         # Un-logged market cap for portfolio eval

# --- Load Data ---
# On failure the message is printed and the script stops with a non-zero exit
# status. SystemExit is raised directly instead of calling exit(): exit() is
# injected by the `site` module for interactive use and reports status 0.
print(f"Loading raw data from: {INPUT_FILE}")
try:
    df = pd.read_csv(INPUT_FILE)
    df["Date"] = pd.to_datetime(df["Date"])
    print(f"Raw data loaded. Shape: {df.shape}")
except FileNotFoundError:
    print(f"ERROR: File not found: {INPUT_FILE}")
    raise SystemExit(1) from None
except Exception as e:
    print(f"ERROR loading data: {e}")
    raise SystemExit(1) from None

# --- Data Preparation ---
print("Sorting and preparing columns...")
# Sort per instrument in time order; the per-instrument pct_change/shift calls
# further down depend on this row order.
df = df.sort_values(by=["Instrument", "Date"]).reset_index(drop=True)

# Normalise column names: every run of characters outside [A-Za-z0-9_] becomes
# a single "_", leading/trailing "_" are stripped, and "__" is collapsed to "_".
print("  Cleaning original column names...")
df.columns = (
    df.columns
    .str.replace("[^A-Za-z0-9_]+", "_", regex=True)
    .str.strip('_')
    .str.replace('__', '_', regex=False)
)

# Resolve the (possibly renamed) source columns needed below. A missing column
# is fatal: print the error and stop with a non-zero exit status (exit() would
# report status 0 and depends on the `site` module).
norges_bank_10y_col = find_col(df, ['NorgesBank10Y', 'norgesbank10y'])
if not norges_bank_10y_col:
    print("ERROR: Cannot find NorgesBank10Y column after cleaning.")
    raise SystemExit(1)

close_price_col = find_col(df, ['ClosePrice', 'closeprice'])
common_shares_col = find_col(df, ['CommonSharesOutstanding', 'commonsharesoutstanding'])
if not close_price_col or not common_shares_col:
    print("ERROR: Cannot find ClosePrice/CommonSharesOutstanding after cleaning.")
    raise SystemExit(1)

nibor3m_col = find_col(df, ['NIBOR3M', 'nibor3m'])
if not nibor3m_col:
    print("ERROR: Cannot find NIBOR3M column after cleaning.")
    raise SystemExit(1)
print("  Original column names cleaned.")

# --- Return calculation (uses the column names resolved above) ---
# Simple return per instrument from one row to the next.
print("Calculating returns...")
df["MonthlyReturn_t"] = df.groupby("Instrument")[close_price_col].pct_change()
# Replace +/-inf by NaN and then NaN by 0 (this also sets each instrument's
# first observation to 0). The cleaned Series is assigned back explicitly:
# calling replace/fillna with inplace=True on df["col"] acts on a temporary
# Series and leaves df untouched when pandas Copy-on-Write is active.
df["MonthlyReturn_t"] = (
    df["MonthlyReturn_t"].replace([np.inf, -np.inf], np.nan).fillna(0)
)
# Winsorize the lowest and highest 1% of the pooled returns (all instruments
# and dates together).
df["MonthlyReturn_t"] = winsorize(df["MonthlyReturn_t"].values, limits=[0.01, 0.01])
print("  MonthlyReturn_t calculated and winsorized.")

# Risk-free rate for row t: the 10Y column divided by 12 and by 100
# (presumably an annual percentage turned into a monthly decimal rate;
# confirm the units of the source column).
df.loc[:, "MonthlyRiskFreeRate_t"] = df[norges_bank_10y_col] / 12 / 100
print("  MonthlyRiskFreeRate_t calculated.")

# Excess return for row t = winsorized raw return minus the risk-free rate.
df["AdjustedReturn_t"] = df["MonthlyReturn_t"] - df["MonthlyRiskFreeRate_t"]
print("  AdjustedReturn_t (Excess Return t) calculated.")

# --- Create Target and Necessary Lead Variables ---
# shift(-1) within each instrument puts the NEXT row's value on the current
# row, so row t carries the return realised at t+1.
print("Calculating lead variables (Target and Next Raw Return)...")
df[TARGET_COL_NAME] = df.groupby("Instrument")["AdjustedReturn_t"].shift(-1)
print(f"  Target variable '{TARGET_COL_NAME}' created.")
df[NEXT_RAW_RET_COL_NAME] = df.groupby("Instrument")["MonthlyReturn_t"].shift(-1)
print(f"  Next raw return variable '{NEXT_RAW_RET_COL_NAME}' created.")

# Rows without a target (the last row of every instrument) cannot be used.
initial_rows = len(df)
df.dropna(subset=[TARGET_COL_NAME], inplace=True)
print(f"  Dropped {initial_rows - len(df)} rows with missing target '{TARGET_COL_NAME}'.")
if df.empty:
    print("ERROR: DataFrame empty after dropping missing target.")
    # Non-zero exit status so callers can detect the failure
    # (exit() would report 0 and relies on the `site` module).
    raise SystemExit(1)

# --- Feature Engineering ---
print("Performing feature engineering (Market Cap, Term Spread, Log Transforms)...")

# Market cap at t = close price x shares outstanding. Non-positive values are
# blanked out, and an untouched copy is kept under MKT_CAP_ORIG_COL_NAME.
df["MarketCap"] = df[close_price_col] * df[common_shares_col]
df.loc[df["MarketCap"] <= 0, "MarketCap"] = np.nan
df[MKT_CAP_ORIG_COL_NAME] = df["MarketCap"].copy()
print(f"  MarketCap calculated and original stored in '{MKT_CAP_ORIG_COL_NAME}'.")

# Term spread at t = 10Y column minus 3M NIBOR column.
df["TermSpread"] = df[norges_bank_10y_col] - df[nibor3m_col]
print("  TermSpread calculated.")

# Natural log of selected level variables, written to "log_<column>".
# Values that are missing or not above 1e-9 stay NaN in the log column.
vars_to_log = ["MarketCap", "BM", "ClosePrice", "Volume", "CommonSharesOutstanding"]
print(f"  Log-transforming: {vars_to_log}")
for requested in vars_to_log:
    # Accept the name as given, lower-cased, or with underscores removed.
    candidates = [requested, requested.lower(), requested.replace("_", "")]
    source_col = find_col(df, candidates)
    if not source_col:
        print(f"    - Warning: Column for '{requested}' not found for log transform (using {candidates}).")
        continue

    df[source_col] = pd.to_numeric(df[source_col], errors='coerce')
    target_col = f"log_{source_col}"
    usable = (df[source_col] > 1e-9) & ~df[source_col].isna()
    df[target_col] = np.nan
    df.loc[usable, target_col] = np.log(df.loc[usable, source_col])
    print(f"    - Logged {usable.sum()} positive values for {source_col} -> {target_col}.")


# --- Final Checks and Save ---
print("Final checks and saving preprocessed data...")
# Restore the standard ID/date column names if only lower-case variants exist.
if 'Instrument' not in df.columns and 'instrument' in df.columns:
    df = df.rename(columns={'instrument': 'Instrument'})
if 'Date' not in df.columns and 'date' in df.columns:
    df = df.rename(columns={'date': 'Date'})

# Refuse to write a file that lacks any column the pipeline treats as essential.
essential_final = ['Date', 'Instrument', TARGET_COL_NAME, NEXT_RAW_RET_COL_NAME, MKT_CAP_ORIG_COL_NAME]
missing = [c for c in essential_final if c not in df.columns]
if missing:
    print(f"ERROR: Essential columns missing before saving: {missing}")
    print(f"Available columns: {df.columns.tolist()}")
    # Non-zero exit status (exit() would report 0 and relies on `site`).
    raise SystemExit(1)

# Drop rows where original market cap is non-positive or NaN.
initial_rows = len(df)
df = df.dropna(subset=[MKT_CAP_ORIG_COL_NAME])
df = df[df[MKT_CAP_ORIG_COL_NAME] > 0]
rows_removed = initial_rows - len(df)
if rows_removed > 0:
    print(f"  Dropped {rows_removed} rows with missing or non-positive '{MKT_CAP_ORIG_COL_NAME}'.")

if df.empty:
    print("ERROR: DataFrame empty after final checks.")
    raise SystemExit(1)

# Store every numeric column as float32 before writing.
numeric_cols_final = df.select_dtypes(include=[np.number]).columns
df[numeric_cols_final] = df[numeric_cols_final].astype("float32")

df.to_csv(OUTPUT_FILE, index=False)
print(f"Preprocessing complete. Final shape: {df.shape}")
print(f"Preprocessed data saved to: {OUTPUT_FILE}")
print("\nFinal Data Info:")
df.info(verbose=True)

# File 2: config.py

In [None]:
# --- config.py ---
# Central configuration file for the ML Asset Pricing Pipeline.
# Edit the settings below to match your data, desired models, and analysis parameters.

import numpy as np
import os

# <<< FILE PATHS >>>
# --------------------------------------------------------------------------
# The pipeline reads the file written by preprocess_data.py.
DATA_FILE = "Cleaned_OSEFX_Market_Macro_Data_PREPROCESSED.csv"
FF_FACTOR_FILE = "Europe_4_Factors_Monthly.csv"  # Fama-French factor CSV (optional)
BENCHMARK_FILE = None
PORTFOLIO_DEFS_FILE = None
OUTPUT_DIR = "ML_Pipeline_Results_Yearly_Percentile_Preprocessed"

# <<< DATA PREPARATION CONFIG >>>
# --------------------------------------------------------------------------
# Names of the columns created by preprocess_data.py; they must match the
# headers of the preprocessed CSV exactly.
TARGET_VARIABLE = "TargetReturn_t+1"
NEXT_RETURN_VARIABLE = "NextMonthlyReturn_t+1"
MARKET_CAP_ORIG_VARIABLE = "MarketCap_orig"

# Candidate header names per logical column. Only date, ID and sector are
# mapped here; all other columns are expected to carry their final names.
COLUMN_CONFIG = dict(
    date=['Date', 'date'],
    id=['Instrument', 'instrument'],
    EconomicSector=['EconomicSector'],  # Keep if sector dummies are needed
)

# Log transforms and winsorizing of raw returns are done in preprocess_data.py;
# imputation of feature NaNs happens later in clean_data.
# Rows with a NaN in any of these columns are dropped by the pipeline.
ESSENTIAL_COLS_FOR_DROPNA = ['Date', 'Instrument'] + [
    TARGET_VARIABLE,
    NEXT_RETURN_VARIABLE,
    MARKET_CAP_ORIG_VARIABLE,
]

# <<< ROLLING WINDOW CONFIG >>>
# --------------------------------------------------------------------------
# Window lengths, in calendar years.
INITIAL_TRAIN_YEARS = 9
VALIDATION_YEARS = 6
TEST_YEARS_PER_WINDOW = 1

# <<< MODEL CONFIGURATION >>>
# --------------------------------------------------------------------------
# --- Model Selection ---
# True = run the model, False = skip it.
RUN_MODELS = {
    'OLS': True,
    'OLS3H': True,
    'PLS': True,
    'PCR': True,
    'ENET': True,
    'GLM_H': False,
    'RF': False,
    'GBRT_H': False,
    'NN1': False,
    'NN2': False,
    'NN3': False,
    'NN4': False,
    'NN5': False,
}

# --- Feature Sets ---
# These names must match column headers of the preprocessed CSV.
OLS3_FEATURE_NAMES = ["log_BM", "Momentum_12M", "log_MarketCap"]
# Feature set per model: OLS3H uses the three OLS3 features, every other
# model uses all numeric features.
MODEL_FEATURE_MAP = {
    model_name: ('ols3_features' if model_name == 'OLS3H' else 'all_numeric')
    for model_name in RUN_MODELS
}

# --- Model Hyperparameters ---
MODEL_PARAMS = {
    'OLS': {},
    'OLS3H': {'maxiter': 100, 'tol': 1e-6},
    'PLS': {'n_components_grid': [1, 3, 5, 8, 10, 15]},
    'PCR': {'n_components_grid': [1, 5, 10, 15, 20, 25]},
    'ENET': {
        'alphas': np.logspace(-6, 1, 8),
        'l1_ratio': [0.1, 0.5, 0.9, 0.99, 1.0],
        'cv_folds': 3,
        'max_iter': 1000,
        'tol': 0.001,
        'n_jobs': -1,
    },
    'GLM_H': {
        'param_grid': {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
            'epsilon': [1.1, 1.35, 1.5, 2.0],
        },
        'max_iter': 300,
    },
    'RF': {
        'param_grid': {
            'n_estimators': [100],
            'max_depth': [3, 6, 10],
            'min_samples_leaf': [50, 100],
            'max_features': ['sqrt', 0.33],
        },
        'n_jobs': -1,
        'random_state': 42,
    },
    'GBRT_H': {
        'param_grid': {
            'n_estimators': [100],
            'learning_rate': [0.1],
            'max_depth': [3, 5],
            'min_samples_leaf': [50, 100],
            'alpha': [0.9],
        },
        'loss': 'huber',
        'random_state': 42,
    },
    # Settings shared by all neural-network variants.
    'NN_SHARED': {
        'param_grid': {
            'lambda1': [1e-5, 1e-4, 1e-3],
            'learning_rate': [0.001, 0.01],
        },
        'epochs': 100,
        'batch_size': 10000,
        'patience': 5,
        'ensemble_size': 10,
        'random_seed_base': 42,
    },
    # NN1..NN5 differ only in their hidden-layer widths.
    **{
        nn_name: {'name': nn_name, 'hidden_units': layer_widths}
        for nn_name, layer_widths in (
            ('NN1', [32]),
            ('NN2', [64, 32]),
            ('NN3', [96, 64, 32]),
            ('NN4', [128, 96, 64, 32]),
            ('NN5', [128, 96, 64, 32, 16]),
        )
    },
}

# <<< ANALYSIS & REPORTING CONFIG >>>
# --------------------------------------------------------------------------
# Firm-size subsets to analyse and the percentile cut-offs defining them.
SUBSETS_TO_RUN = ['all', 'big', 'small']
BIG_FIRM_TOP_PERCENT = 30
SMALL_FIRM_BOTTOM_PERCENT = 30
# NOTE(review): the consumer of this flag is not visible in this file;
# the original comment refers to the market cap column saved for this purpose.
FILTER_SMALL_CAPS_PORTFOLIO = False
ANNUALIZATION_FACTOR = 12

# --- Variable Importance ---
CALCULATE_VI = True
VI_METHOD = 'permutation_zero'
VI_PLOT_TOP_N = 20
# Linear models are scored in every window, tree and NN models only in the last.
MODEL_VI_STRATEGY = {
    **dict.fromkeys(
        ('OLS', 'OLS3H', 'PLS', 'PCR', 'ENET', 'GLM_H'),
        'per_window',
    ),
    **dict.fromkeys(
        ('RF', 'GBRT_H', 'NN1', 'NN2', 'NN3', 'NN4', 'NN5'),
        'last_window',
    ),
}

# --- Complexity Plotting ---
# Names of the tuned parameters to plot for each model.
COMPLEXITY_PARAMS_TO_PLOT = {
    'PLS': ['optim_n_components'],
    'PCR': ['optim_n_components'],
    'ENET': ['optim_alpha', 'optim_l1_ratio'],
    'GLM_H': ['optim_alpha', 'optim_epsilon'],
    'RF': ['optim_max_depth'],
    'GBRT_H': ['optim_max_depth'],
    # A fresh list per network so the entries stay independent objects.
    **{
        nn_name: ['optim_lambda1', 'optim_learning_rate']
        for nn_name in ('NN1', 'NN2', 'NN3', 'NN4', 'NN5')
    },
}

# --- Seeds ---
GENERAL_SEED = 42
TF_SEED = 42

# --- Create Output Directory ---
# Runs at import time. exist_ok=True keeps the import from failing when another
# process creates the directory between the existence check and makedirs().
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Created output directory: {OUTPUT_DIR}")

print("Configuration loaded from config.py")

# File 3: pipeline_utils.py

In [None]:
# --- pipeline_utils.py ---
# Shared utility functions for the ML Asset Pricing Pipeline.
# Contains functions for data loading/prep, feature definition,
# standardization, cleaning, splitting, portfolio analysis, VI, plotting, saving.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from scipy.stats.mstats import winsorize # No longer needed here
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

import warnings
import os
import time
from collections import defaultdict
import random
import traceback
import re

# Import config AFTER it's defined
import config

# --- Warning filters ---
# Silence FutureWarnings everywhere and UserWarnings raised from sklearn.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
# RuntimeWarnings are silenced only when their message matches one of these patterns.
for _runtime_message in (
    "Mean of empty slice",
    "invalid value encountered in log",
    "Maximum number of iterations reached.*",
    "divide by zero encountered.*",
    "invalid value encountered.*divide",
):
    warnings.filterwarnings("ignore", category=RuntimeWarning, message=_runtime_message)
# Disable pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Seed Python's and NumPy's global random generators from the shared config value.
_general_seed = config.GENERAL_SEED
random.seed(_general_seed)
np.random.seed(_general_seed)


# === Stage 1: Data Loading and Preparation (SIMPLIFIED) ===
def find_col(df, potential_names, default=None):
    """Return the first name in ``potential_names`` that is a column of ``df``.

    Names are tried in order; ``default`` is returned when none matches.
    """
    columns = df.columns
    return next((name for name in potential_names if name in columns), default)

def load_prepare_data(file_path, column_config, target_var_name, next_ret_var_name, mkt_cap_orig_var_name):
    """Load the preprocessed CSV and run basic structural checks.

    No returns, targets or log transforms are computed here; those are
    expected to be present in the file already.

    Steps:
      1. Read the CSV.
      2. Locate the date and ID columns via ``column_config`` and rename them
         to ``'Date'`` and ``'Instrument'``.
      3. Verify that Date, Instrument, target, next-return and original
         market-cap columns exist.
      4. Convert ``'Date'`` to datetime and sort by Instrument, Date.
      5. If a sector column is configured and no ``Sector_*`` column exists
         yet, replace it with integer dummy columns prefixed ``Sector``.

    Args:
        file_path: Path of the preprocessed CSV.
        column_config: Dict mapping ``'date'``, ``'id'`` and optionally
            ``'EconomicSector'`` to lists of candidate column names.
        target_var_name: Name of the target column.
        next_ret_var_name: Name of the next-period raw return column.
        mkt_cap_orig_var_name: Name of the original market-cap column.

    Returns:
        The prepared DataFrame, or ``None`` if loading or any check fails
        (an error message is printed in that case).
    """
    print(f"\n--- 1. Laster Forhåndsbehandlet Data ---")
    print(f"Laster data fra: {file_path}")
    try:
        # Dates are converted further down, AFTER the date column has been
        # located and renamed. Passing parse_dates=['Date'] here would make
        # read_csv fail for files whose date column is named 'date'.
        df = pd.read_csv(file_path)
        print(f"Forhåndsbehandlet data lastet inn. Form: {df.shape}")
    except FileNotFoundError:
        print(f"FEIL: Fil '{file_path}' ikke funnet.")
        return None
    except Exception as e:
        print(f"FEIL under lasting av forhåndsbehandlet data: {e}")
        return None

    # --- 1. Find and standardize the date / ID column names ---
    rename_dict = {}

    date_col_found = find_col(df, column_config.get('date', ['Date', 'date']))
    if not date_col_found:
        print("FEIL: Datokolonne ikke funnet.")
        return None
    if date_col_found != 'Date':
        rename_dict[date_col_found] = 'Date'

    id_col_found = find_col(df, column_config.get('id', ['Instrument', 'instrument']))
    if not id_col_found:
        print("FEIL: Instrument ID kolonne ikke funnet.")
        return None
    if id_col_found != 'Instrument':
        rename_dict[id_col_found] = 'Instrument'

    if rename_dict:
        df = df.rename(columns=rename_dict)
        print(f"Standardiserte essensielle kolonner: {list(rename_dict.values())}")

    # --- 2. Verify essential columns exist ---
    essential_cols = ['Date', 'Instrument', target_var_name, next_ret_var_name, mkt_cap_orig_var_name]
    missing_essential = [col for col in essential_cols if col not in df.columns]
    if missing_essential:
        print(f"FEIL: Essensielle kolonner mangler i forhåndsbehandlet fil: {missing_essential}")
        print(f"Tilgjengelige kolonner: {df.columns.tolist()}")
        return None
    print("Essensielle kolonner funnet.")

    # --- 3. Ensure Date is datetime and sort ---
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        try:
            df['Date'] = pd.to_datetime(df['Date'])
        except Exception as e:
            print(f"FEIL: Kunne ikke konvertere 'Date' kolonne til datetime: {e}")
            return None
    df = df.sort_values(by=['Instrument', 'Date']).reset_index(drop=True)
    print("Data sortert etter Instrument og Dato.")

    # --- 4. Optional: create sector dummies unless Sector_* columns exist ---
    sector_col_std = find_col(df, column_config.get('EconomicSector', []))
    if sector_col_std and not any(col.startswith("Sector_") for col in df.columns):
        print(f"  INFO: Oppretter Sektor dummy-variabler fra '{sector_col_std}'...")
        df = pd.get_dummies(df, columns=[sector_col_std], prefix="Sector", dtype=int)
        print("  Sektor dummy-variabler opprettet.")

    print(f"Lasting og grunnleggende sjekk fullført. Form: {df.shape}")
    return df


# === Stage 2: Feature Definition ===
def define_features(df, ols3_feature_names, base_exclusions):
    """Select the numeric feature columns of the preprocessed data.

    A numeric column becomes a feature unless it is:
      * listed in ``base_exclusions``,
      * a standard identifier / intermediate column (Date, Instrument,
        AdjustedReturn_t, MonthlyReturn_t, ...) or a portfolio helper column,
      * the raw counterpart ``X`` of an existing numeric ``log_X`` column
        (for ``log_MarketCap`` the raw ClosePrice and CommonSharesOutstanding
        columns are excluded as well, when raw MarketCap is present),
      * NorgesBank10Y / NIBOR3M when TermSpread exists,
      * constant (fewer than two non-NaN values, a single unique value, or a
        standard deviation not above 1e-7).

    Args:
        df: Preprocessed DataFrame.
        ols3_feature_names: Feature names requested for the OLS3 model.
        base_exclusions: Column names that must never be used as features.

    Returns:
        Tuple ``(all_features, ols3_features, all_features)``: the sorted list
        of valid features, the subset of ``ols3_feature_names`` that is valid
        (in the requested order), and the full list again. All three are
        empty lists when ``df`` is None or empty.
    """
    print("\n--- 2. Definerer Features (fra forhåndsbehandlet data) ---")
    if df is None or df.empty:
        print(" FEIL: DataFrame tom.")
        return [], [], []

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    print(f"  Funnet {len(numeric_cols)} num. kolonner.")

    columns_present = set(df.columns)
    cols_to_exclude = set()

    # Columns explicitly passed for exclusion (target, next return, ...).
    cols_to_exclude.update(col for col in base_exclusions if col in columns_present)

    # Identifiers and intermediate calculation columns that may still be present.
    std_exclusions = ['Date', 'Instrument', 'level_0', 'index', 'Year', 'MonthYear',
                      'AdjustedReturn_t', 'MonthlyReturn_t', 'MonthlyRiskFreeRate_t']
    cols_to_exclude.update(col for col in std_exclusions if col in columns_present)

    # Portfolio helper columns, should any exist.
    pf_cols = ['Rank', 'DecileRank', 'Decile', 'ew_weights', 'vw_weights', 'rank']
    cols_to_exclude.update(col for col in pf_cols if col in columns_present)

    # Exclude the raw version of every logged variable. Only the leading
    # "log_" is stripped: str.replace would also remove later occurrences
    # (e.g. "log_catalog_size" -> "catasize") and could exclude an unrelated
    # column.
    log_prefix = 'log_'
    for log_col in numeric_cols:
        if not log_col.startswith(log_prefix):
            continue
        raw_name = log_col[len(log_prefix):]
        if raw_name not in columns_present:
            continue
        cols_to_exclude.add(raw_name)
        # Market cap is price x shares, so its raw components go as well.
        if raw_name in ('MarketCap', 'ClosePrice') and 'ClosePrice' in columns_present:
            cols_to_exclude.add('ClosePrice')
        if raw_name == 'MarketCap' and 'CommonSharesOutstanding' in columns_present:
            cols_to_exclude.add('CommonSharesOutstanding')

    # The two rates are dropped when their spread is available.
    if 'TermSpread' in columns_present:
        cols_to_exclude.update(
            col for col in ('NorgesBank10Y', 'NIBOR3M') if col in columns_present
        )

    # Keep only candidates that actually vary.
    final_features = []
    for col in numeric_cols:
        if col in cols_to_exclude:
            continue
        valid = df[col].dropna()
        if len(valid) > 1 and valid.nunique() > 1 and valid.std() > 1e-7:
            final_features.append(col)

    final_features = sorted(set(final_features))
    print(f"  Identifisert {len(final_features)} features totalt etter ekskludering og validitetssjekk.")
    print(f"  Features sample: {final_features[:5]}...{final_features[-5:]}")

    # Validate the configured OLS3 features against the final feature list.
    ols3_features_final = [f for f in ols3_feature_names if f in final_features]
    missing_ols3 = [f for f in ols3_feature_names if f not in ols3_features_final]

    if missing_ols3:
        print(f"  ADVARSEL: OLS3 mangler features fra config: {missing_ols3}")
    if not ols3_features_final:
        print("  ADVARSEL: Ingen av de spesifiserte OLS3 features er gyldige endelige features.")
    elif len(ols3_features_final) < len(ols3_feature_names):
        print(f"  ADVARSEL: Kunne finne deler av OLS3 features: {ols3_features_final}")
    else:
        print(f"  Valide OLS3 features funnet: {ols3_features_final}")

    # The full feature list is returned in the first and third position.
    all_needed_final = sorted(set(final_features))
    return all_needed_final, ols3_features_final, all_needed_final


# === Stage 3: Standardization ===
# Rank-standardizes the defined feature columns cross-sectionally, one date at a time.
def rank_standardize_features(df, features_to_standardize):
    """Rank-standardize feature columns within each date.

    For every date and feature, values are replaced by ``2 * pct_rank - 1``,
    where ``pct_rank`` is the percentile rank among that date's values, so
    the results lie in ``(-1, 1]``. Missing values inside a date group that
    has at least one valid value become 0; a group whose values are all
    missing stays NaN.

    Args:
        df: DataFrame with a ``'Date'`` column; modified in place.
        features_to_standardize: Column names to transform. Names that are not
            columns of ``df`` are ignored.

    Returns:
        ``df`` with the feature columns transformed. It is returned unchanged
        when ``'Date'`` is missing or none of the features exist.
    """
    print("\n--- 3. Rank Standardiserer Features ---")
    date_col = 'Date'
    if date_col not in df.columns:
        print(f"FEIL: Datokolonne ('{date_col}') mangler for standardisering.")
        return df
    features = [f for f in features_to_standardize if f in df.columns]
    if not features:
        print("  Ingen features funnet å standardisere.")
        return df
    print(f"Standardiserer {len(features)} features...")

    def rank_transform(x):
        # Transform one feature for one date.
        x_num = pd.to_numeric(x, errors='coerce')
        if x_num.isnull().all():
            return x_num  # nothing to rank; keep the NaNs
        r = x_num.rank(pct=True, na_option='keep')
        # Scale percentile ranks and set this date's missing values to 0.
        return (r * 2 - 1).fillna(0)

    try:
        df[features] = df.groupby(date_col)[features].transform(rank_transform)
    except Exception as e:
        print(f" ADVARSEL under transform (prøver apply): {e}. Standardisering kan være ufullstendig.")
        # Fallback: one column at a time. transform() keeps the original row
        # index, so results align with df. groupby(...).apply on a frame
        # indexed by (non-unique) dates cannot be assigned back reliably, and
        # set_index/reset_index would also move 'Date' to the first column.
        try:
            for col in features:
                df[col] = df.groupby(date_col)[col].transform(rank_transform)
        except Exception as e2:
            print(f" FEIL under apply: {e2}. Standardisering kan være ufullstendig.")
            return df  # may be partially processed

    print("Rank standardisering fullført.")
    return df


# === Stage 4: Data Cleaning (Post-Standardization) ===
def clean_data(df, numeric_features_to_impute, essential_cols_for_dropna, mkt_cap_orig_var):
    """Clean the data after standardization.

    In order:
      1. +/-inf in the feature columns is replaced by NaN.
      2. NaNs in feature columns are filled with that column's median over
         all rows; a column whose median is NaN is filled with 0.
      3. Rows with NaN in any present essential column are dropped.
      4. Rows whose original market cap is not strictly positive are dropped.

    Steps 1-2 write into the passed DataFrame; steps 3-4 produce the filtered
    frame that is returned.

    Returns:
        The cleaned DataFrame, or ``None`` when no rows remain.
    """
    print("\n--- 4. Renser Data (Post-Standardisering) ---")
    rows_at_entry = len(df)
    infinities = [np.inf, -np.inf]

    feature_cols = [name for name in numeric_features_to_impute if name in df.columns]
    if feature_cols:
        # Step 1: infinities -> NaN
        has_inf = df[feature_cols].isin(infinities).any(axis=0)
        if has_inf.any():
            inf_cols = has_inf[has_inf].index.tolist()
            print(f"  Erstatter +/-inf med NaN i features: {inf_cols}...")
            df[feature_cols] = df[feature_cols].replace(infinities, np.nan)

        # Step 2: median imputation, restricted to the columns that contain NaNs
        nan_flags = df[feature_cols].isnull().any()
        nan_cols = nan_flags[nan_flags].index.tolist()
        if not nan_cols:
            print("  Ingen NaNs funnet i features som trenger imputering.")
        else:
            print(f"  Imputerer NaNs i {len(nan_cols)} features med overall median...")
            medians = df[nan_cols].median(skipna=True)
            df[nan_cols] = df[nan_cols].fillna(medians)

            # A NaN median means the column had no valid value at all.
            undefined_median_cols = medians[medians.isnull()].index.tolist()
            if undefined_median_cols:
                print(f"  ADVARSEL: Median var NaN for features: {undefined_median_cols}. Fyller resterende NaNs i disse kolonnene med 0.")
                df[undefined_median_cols] = df[undefined_median_cols].fillna(0)
            print("  NaNs i features imputert.")

    # Step 3: rows lacking an essential value are dropped
    essentials_present = [name for name in essential_cols_for_dropna if name in df.columns]
    if not essentials_present:
        print(f"  ADVARSEL: Kunne ikke sjekke NaN i essensielle kolonner (mangler): {[c for c in essential_cols_for_dropna if c not in df.columns]}")
    else:
        rows_before = len(df)
        df = df.dropna(subset=essentials_present)
        n_dropped = rows_before - len(df)
        if n_dropped > 0:
            print(f"  Fjernet {n_dropped} rader pga NaN i essensielle kolonner: {essentials_present}.")
        else:
            print(f"  Ingen rader fjernet pga NaN i essensielle kolonner ({essentials_present}).")

    # Step 4: safety check on the original market cap
    if mkt_cap_orig_var not in df.columns:
        print(f"ADVARSEL: Kolonne '{mkt_cap_orig_var}' ikke funnet for sjekk av positiv verdi.")
    else:
        rows_before = len(df)
        df = df[df[mkt_cap_orig_var] > 0]
        n_dropped = rows_before - len(df)
        if n_dropped > 0:
            print(f"  Fjernet {n_dropped} rader der '{mkt_cap_orig_var}' <= 0 (sikkerhetssjekk).")

    print(f"Datarensing ferdig. Form: {df.shape}. Totalt fjernet {rows_at_entry-len(df)} rader i dette steget.")
    if df.empty:
        print("FEIL: DataFrame er tom etter rensing.")
        return None
    return df


# === Stage 5: Data Splitting ===
# Builds the yearly rolling train/validation/test windows from the 'Date' column.
def get_yearly_rolling_splits(df, initial_train_years, val_years, test_years):
    """Yield expanding-window train/validation/test splits by calendar year.

    The training window always starts at the first year in the data and grows
    by one year per window; the validation window (``val_years`` long) sits
    directly before the test window (``test_years`` long), which advances one
    calendar year per window.

    Args:
        df: DataFrame with a 'Date' column. The column is converted to
            datetime in place if needed. A temporary 'Year' column is present
            on ``df`` while the generator is active and is removed when the
            generator finishes, is closed, or raises.
        initial_train_years: Number of years in the first training window.
        val_years: Number of validation years per window.
        test_years: Number of test years per window (must be >= 1).

    Yields:
        Tuples ``(train_idx, val_idx, test_idx, train_dates, val_dates,
        test_dates)``. The ``*_idx`` entries are index labels of ``df``; the
        ``*_dates`` entries are Series with 'min' and 'max' dates, or None when
        the corresponding subset is empty.

    Raises:
        ValueError: If 'Date' is missing or cannot be parsed, if
            ``test_years`` is below 1, or if the data does not span enough
            years. As this is a generator, errors surface on first iteration.
    """
    print("\n--- 5. Setter opp Årlige Rullerende Vinduer ---")
    date_col = 'Date'  # Assumes standard name
    if date_col not in df.columns:
        raise ValueError(f"'{date_col}' kolonnen mangler for splitting.")
    if test_years < 1:
        # Previously surfaced as an IndexError when looking up the last test start year.
        raise ValueError(f"test_years må være minst 1 (fikk {test_years}).")

    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        try:
            df[date_col] = pd.to_datetime(df[date_col])
        except Exception as e:
            raise ValueError(f"Kunne ikke konvertere '{date_col}' til datetime: {e}") from e

    df['Year'] = df[date_col].dt.year
    try:
        unique_years = sorted(df["Year"].unique())
        n_years = len(unique_years)
        if n_years:
            # Guarded so an empty frame reaches the ValueError below instead of an IndexError here.
            print(f"Funnet {n_years} unike år i data ({unique_years[0]}-{unique_years[-1]})")

        min_years_needed = initial_train_years + val_years + test_years
        if n_years < min_years_needed:
            raise ValueError(f"Ikke nok år ({n_years}) for den spesifiserte splitten (trenger minst {min_years_needed}).")

        first_test_year_idx = initial_train_years + val_years
        if first_test_year_idx >= n_years:
            raise ValueError(f"Kombinasjonen av initial_train ({initial_train_years}) og validation ({val_years}) dekker alle ({n_years}) eller flere år. Ingen testår igjen.")

        first_test_year = unique_years[first_test_year_idx]
        last_test_start_year = unique_years[n_years - test_years]
        # Windows advance by calendar year, so the count is a year difference.
        num_windows = last_test_start_year - first_test_year + 1

        if num_windows <= 0:
            raise ValueError(f"Negativt eller null antall vinduer beregnet ({num_windows}). Sjekk årskonfigurasjon. First test year: {first_test_year}, Last possible test start year: {last_test_start_year}")

        print(f"Genererer {num_windows} rullerende vinduer.")
        print(f"  Første vindu testår: {first_test_year} (slutter {first_test_year+test_years-1})")
        print(f"  Siste vindu testår: {last_test_start_year} (slutter {last_test_start_year+test_years-1})")

        # All windows are materialised before the first yield, so the splits
        # reflect the frame as it was when iteration started.
        splits_info = []
        for i in range(num_windows):
            test_start_year = first_test_year + i
            test_end_year = test_start_year + test_years - 1
            val_end_year = test_start_year - 1
            val_start_year = val_end_year - val_years + 1
            train_end_year = val_start_year - 1
            train_start_year = unique_years[0]  # Train from the beginning

            train_indices = df[(df['Year'] >= train_start_year) & (df['Year'] <= train_end_year)].index
            val_indices = df[(df['Year'] >= val_start_year) & (df['Year'] <= val_end_year)].index
            test_indices = df[(df['Year'] >= test_start_year) & (df['Year'] <= test_end_year)].index

            train_dates = df.loc[train_indices, date_col].agg(['min', 'max']) if not train_indices.empty else None
            val_dates = df.loc[val_indices, date_col].agg(['min', 'max']) if not val_indices.empty else None
            test_dates = df.loc[test_indices, date_col].agg(['min', 'max']) if not test_indices.empty else None

            splits_info.append((
                train_indices, val_indices, test_indices,
                train_dates, val_dates, test_dates,
                train_start_year, train_end_year, val_start_year, val_end_year, test_start_year, test_end_year
            ))

        print("\n--- Split Detaljer per Vindu ---")
        for i, split_data in enumerate(splits_info):
            tr_idx, v_idx, te_idx, tr_d, v_d, t_d, tr_s, tr_e, v_s, v_e, t_s, t_e = split_data
            print(f"  Vindu {i+1}/{num_windows}:")
            print(f"    Train: {tr_s}-{tr_e} ({len(tr_idx)} obs) [{tr_d['min'].date() if tr_d is not None else 'N/A'} -> {tr_d['max'].date() if tr_d is not None else 'N/A'}]")
            print(f"    Val:   {v_s}-{v_e} ({len(v_idx)} obs) [{v_d['min'].date() if v_d is not None else 'N/A'} -> {v_d['max'].date() if v_d is not None else 'N/A'}]")
            print(f"    Test:  {t_s}-{t_e} ({len(te_idx)} obs) [{t_d['min'].date() if t_d is not None else 'N/A'} -> {t_d['max'].date() if t_d is not None else 'N/A'}]")
            yield tr_idx, v_idx, te_idx, tr_d, v_d, t_d  # Yield indices and date ranges
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # early (generator close), so the helper column never leaks to the caller.
        df.drop(columns=['Year'], inplace=True, errors='ignore')


# === Stage 6: Model Evaluation Metrics ===
# (Keep calculate_oos_r2 and calculate_sharpe_of_predictions as is)
def calculate_oos_r2(y_true, y_pred):
    """Calculate out-of-sample R2 following Gu, Kelly, Xiu (2020): 1 - SSR/SST0.

    The denominator is the sum of squared realised values without demeaning
    (SST0), i.e. the benchmark forecast is zero.

    Args:
        y_true: Realised values (array-like). May be None.
        y_pred: Predicted values (array-like) with the same number of
            elements as ``y_true``. May be None.

    Returns:
        The R2 as a float, or NaN when an input is None, the element counts
        differ, or fewer than two pairs are finite. When SST0 is numerically
        zero the result is 1.0 if SSR is also numerically zero, else NaN.
    """
    if y_true is None or y_pred is None:
        return np.nan
    # Flatten both inputs: a column-vector prediction of shape (n, 1) against a
    # 1-D target of shape (n,) would otherwise broadcast the finite-mask to
    # (n, n) and fail when indexing.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size < 2 or y_true.size != y_pred.size:
        return np.nan

    # Keep only pairs where both the realisation and the prediction are finite.
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    y_t = y_true[mask]
    y_p = y_pred[mask]

    if y_t.size < 2:
        return np.nan
    ss_res = np.sum((y_t - y_p) ** 2)
    ss_tot = np.sum(y_t ** 2)  # SST0

    if ss_tot < 1e-15:
        return 1.0 if ss_res < 1e-15 else np.nan
    return 1.0 - (ss_res / ss_tot)

def calculate_sharpe_of_predictions(y_pred, annualization_factor=12):
    """Return the annualized Sharpe ratio of the predictions themselves.

    Computed as mean / population standard deviation of the finite
    predictions, scaled by sqrt(annualization_factor). NaN is returned when
    the input is None, has fewer than two (finite) values, or is numerically
    constant.
    """
    if y_pred is None:
        return np.nan

    predictions = np.asarray(y_pred)
    if len(predictions) < 2:
        return np.nan

    # Discard NaN / +-inf entries before computing moments.
    finite_predictions = predictions[np.isfinite(predictions)]
    if len(finite_predictions) < 2:
        return np.nan

    average = np.mean(finite_predictions)
    dispersion = np.std(finite_predictions)
    if dispersion < 1e-9:
        # A (near-)constant series has no meaningful ratio.
        return np.nan

    scale = np.sqrt(annualization_factor)
    return (average / dispersion) * scale


# === Stage 7: Portfolio Analysis ===
# (Keep perform_detailed_portfolio_analysis mostly as is, but ensure column names passed are correct)
# --> Key change: The `original_df_subset` argument will now be the main `df_clean` DataFrame
#     loaded by the pipeline, which contains the preprocessed data including
#     NEXT_RETURN_VARIABLE and MARKET_CAP_ORIG_VARIABLE created by `preprocess_data.py`.
def MDD(returns):
    """Calculate the maximum drawdown, in percent, from a series of returns.

    Args:
        returns: Periodic simple returns (array-like or Series). Missing
            values are treated as a 0% return.

    Returns:
        The largest peak-to-trough decline of the cumulative wealth path as a
        non-positive percentage (e.g. -50.0 for a 50% drawdown), or NaN when
        fewer than two observations are supplied.
    """
    returns = pd.Series(returns).fillna(0)  # Ensure it's a Series and fill NaNs with 0
    if returns.empty or len(returns) < 2:
        return np.nan

    cumulative_returns = (1 + returns).cumprod()
    # The wealth path starts at 1 before the first return is realised, so the
    # running peak can never be below 1. Without this floor a loss in the very
    # first period(s) would not register as a drawdown.
    peak = cumulative_returns.cummax().clip(lower=1.0)
    drawdown = (cumulative_returns / peak) - 1
    max_drawdown = drawdown.min()  # MDD is the minimum value in the drawdown series

    return max_drawdown * 100 if pd.notna(max_drawdown) else np.nan

# Row labels shared by the H-L and long-only risk/performance tables.
# Labels containing '%' are rendered as percentages by _format_risk_table.
_RISK_TABLE_ROWS = [
    "Mean Excess Return [%]", 'Std Dev (Raw) [%]', "Ann. Sharpe Ratio",
    "Max Drawdown (Excess) [%]", "Max 1M Loss (Excess) [%]", "Avg Monthly Turnover [%]",
    "Factor Model Alpha [%]", "t(Alpha)", "Factor Model Adj R2", "Info Ratio",
]


def _format_decile_table(summary_df, weight_scheme):
    """Build the printable per-decile table for one weighting scheme ('EW', otherwise VW)."""
    prefix = 'ew_' if weight_scheme == 'EW' else 'vw_'
    cols_map = {
        f'{prefix}pred_mean': 'Pred',
        f'{prefix}excess_mean': 'Avg Ex Ret',
        f'{prefix}raw_std': 'SD (Raw Ret)',
        f'{prefix}sharpe': 'Ann SR',
        'avg_stocks': 'Avg N',
    }
    relevant_cols = [c for c in cols_map if c in summary_df.columns]
    if not relevant_cols or 'Decile' not in summary_df.columns:
        return pd.DataFrame()

    sub_df = summary_df[['Decile'] + relevant_cols].rename(columns=cols_map).copy().set_index('Decile')
    percent_cols = [c for c in ('Pred', 'Avg Ex Ret', 'SD (Raw Ret)') if c in sub_df.columns]
    for col in percent_cols:
        sub_df[col] = pd.to_numeric(sub_df[col], errors='coerce') * 100
    for col in ('Ann SR', 'Avg N'):
        if col in sub_df.columns:
            sub_df[col] = pd.to_numeric(sub_df[col], errors='coerce')

    def map_idx(x):
        return 'Low (L)' if str(x) == '1' else ('High (H)' if str(x) == '10' else str(x))

    sub_df.index = sub_df.index.map(map_idx)
    desired_order = ['Low (L)', '2', '3', '4', '5', '6', '7', '8', '9', 'High (H)', 'H-L']
    sub_df = sub_df.reindex([i for i in desired_order if i in sub_df.index])

    final_cols = [c for c in ['Pred', 'Avg Ex Ret', 'SD (Raw Ret)', 'Ann SR', 'Avg N'] if c in sub_df.columns]
    formatted = sub_df[final_cols].copy()
    for col in percent_cols:
        formatted[col] = formatted[col].map('{:.2f}%'.format).replace('nan%', 'N/A')
    if 'Ann SR' in formatted.columns:
        formatted['Ann SR'] = formatted['Ann SR'].map('{:.2f}'.format).replace('nan', 'N/A')
    if 'Avg N' in formatted.columns:
        formatted['Avg N'] = formatted['Avg N'].map('{:.0f}'.format).replace('nan', 'N/A')
    return formatted


def _format_risk_table(data_dict, table_index):
    """Render risk-table columns as strings ('N/A' for missing values).

    Rows whose label contains '%' get two decimals and a '%' suffix; all other
    rows get three decimals. Values are formatted before the frame is built so
    no strings are written into a float column.
    """
    def _fmt(value, label):
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 'N/A'
        if np.isnan(number):
            return 'N/A'
        return f'{number:.2f}%' if '%' in label else f'{number:.3f}'

    formatted = {
        name: [_fmt(value, label) for value, label in zip(values, table_index)]
        for name, values in data_dict.items()
    }
    return pd.DataFrame(formatted, index=table_index)


def _risk_column(stats, prefix, mdd, max_loss_1m, turnover):
    """Assemble the values of one risk-table column, ordered like _RISK_TABLE_ROWS."""
    return [
        stats.get(f'{prefix}_excess_mean', np.nan) * 100,
        stats.get(f'{prefix}_raw_std', np.nan) * 100,
        stats.get(f'{prefix}_sharpe', np.nan),
        abs(mdd),
        max_loss_1m,
        turnover * 100,
        np.nan, np.nan, np.nan,  # Factor-model alpha, t(alpha), adj. R2: placeholders
        np.nan,                  # Information ratio: placeholder
    ]


def _mean_one_way_turnover(weights, id_col, weight_col):
    """Average over rebalancing dates of 0.5 * sum_i |w_{i,t+1} - w_{i,t}|.

    Weights are laid out as a month x instrument matrix with 0 for names not
    held, so entering a position, leaving one, and re-entering after a gap are
    all counted as trades between consecutive rebalancing months. Returns NaN
    when fewer than two months are available.
    """
    if weights.empty:
        return np.nan
    weight_matrix = weights.pivot_table(
        index='MonthYear', columns=id_col, values=weight_col, aggfunc='sum', fill_value=0.0
    ).sort_index()
    if len(weight_matrix) < 2:
        return np.nan
    # Row t of the diff is the trade needed to move from month t-1 to month t;
    # the first row has no predecessor and is dropped.
    monthly_trade = weight_matrix.diff().iloc[1:].abs().sum(axis=1)
    return monthly_trade.mean() / 2


def _plot_cumulative_excess_returns(monthly_frames, ew_col, vw_col, label_tag, title):
    """Plot cumulative EW (solid) and VW (dashed) excess returns per model on a log scale."""
    fig, ax = plt.subplots(figsize=(14, 7))
    n_plotted = 0
    for model_name in sorted(monthly_frames):
        frame = monthly_frames[model_name]
        if 'MonthYear' not in frame.columns or frame.empty:
            continue
        months = frame['MonthYear']
        # isinstance on the dtype replaces the deprecated pd.api.types.is_period_dtype.
        if isinstance(months.dtype, pd.PeriodDtype):
            plot_dates = months.dt.to_timestamp()
        else:
            plot_dates = pd.to_datetime(months)
        frame = frame.assign(PlotDate=plot_dates).set_index('PlotDate').sort_index()
        for ret_col, scheme, style in ((ew_col, 'EW', {}), (vw_col, 'VW', {'linestyle': '--'})):
            if ret_col not in frame.columns:
                continue
            rets = frame[ret_col].dropna()
            if rets.empty:
                continue
            (1 + rets).cumprod().plot(ax=ax, label=f'{model_name} {label_tag} {scheme}', **style)
            n_plotted += 1
    if n_plotted > 0:
        ax.set_title(title)
        ax.set_ylabel('Kumulativ Verdi (Log Skala)')
        ax.set_xlabel('Dato')
        ax.set_yscale('log')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)
        fig.tight_layout(rect=[0, 0, 0.85, 1])
        plt.show()
    else:
        plt.close(fig)


def perform_detailed_portfolio_analysis(results_df, # Contains Date, Instrument, yhat_*
                                        full_preprocessed_df, # Contains Date, Instrument, Target, NextRawReturn, MktCapOrig etc.
                                        prediction_cols,
                                        mkt_cap_orig_var, # Name of the original market cap column
                                        next_ret_var,     # Name of the NEXT month's RAW return column
                                        # monthly_rf_var, # Not strictly needed if using excess returns from results_df
                                        filter_small_caps=False, annualization_factor=12,
                                        benchmark_file=None, ff_factor_file=None):
    """Run the decile-sort portfolio analysis on model predictions.

    Each month, stocks are ranked on every prediction column and split into
    ten deciles. Equal- and value-weighted decile returns, a high-minus-low
    (decile 10 minus decile 1) portfolio and a long-only (decile 10) portfolio
    are evaluated; tables are printed and cumulative returns are plotted.

    Args:
        results_df: Predictions with 'Date', 'Instrument', the column named by
            ``config.TARGET_VARIABLE`` and every column in ``prediction_cols``.
            Not modified.
        full_preprocessed_df: Data with 'Date', 'Instrument',
            ``mkt_cap_orig_var`` and ``next_ret_var``. Not modified.
        prediction_cols: Names of the prediction columns to evaluate.
        mkt_cap_orig_var: Column holding the market cap used for value weights
            and the small-cap filter.
        next_ret_var: Column holding the next month's raw return.
        filter_small_caps: If True, each month drops stocks at or below the
            ``config.SMALL_FIRM_BOTTOM_PERCENT`` market-cap percentile before
            forming portfolios.
        annualization_factor: Periods per year used to annualize Sharpe ratios.
        benchmark_file: Currently unused.
        ff_factor_file: Currently unused (factor-model rows are placeholders).

    Returns:
        ``(decile_tables, hl_risk_tables, long_risk_tables)``: dicts keyed by
        ``'<MODEL>_EW'`` / ``'<MODEL>_VW'`` holding formatted DataFrames. Three
        empty dicts are returned when validation fails or no data remains.
    """
    print("\n--- 7. Detaljert Porteføljeanalyse (Desiler) ---")
    if filter_small_caps: print(">>> Filtrering av små selskaper (basert på market cap ved t) er AKTIVERT for porteføljedanning <<<")

    # --- Input Data Validation ---
    date_col = 'Date'  # Standard name
    id_col = 'Instrument'  # Standard name
    target_var = config.TARGET_VARIABLE  # From config, should match results_df
    prediction_cols = list(prediction_cols)  # Accept any iterable of column names

    required_orig = [date_col, id_col, mkt_cap_orig_var, next_ret_var]  # Core needs from preprocessed data
    missing_orig = [c for c in required_orig if c not in full_preprocessed_df.columns]
    if missing_orig: print(f"FEIL: Mangler påkrevde kolonner i full_preprocessed_df: {missing_orig}."); return {},{},{}

    required_res = [date_col, id_col, target_var] + prediction_cols
    missing_res = [c for c in required_res if c not in results_df.columns]
    if missing_res: print(f"FEIL: Mangler påkrevde kolonner i results_df: {missing_res}."); return {},{},{}

    # --- Data Preparation & Merging ---
    print("Forbereder data for porteføljeanalyse...")
    # Normalise the merge keys on local copies so the caller's frames are left untouched.
    df_orig_sub = full_preprocessed_df[required_orig].copy()
    df_orig_sub[date_col] = pd.to_datetime(df_orig_sub[date_col])
    df_orig_sub[id_col] = df_orig_sub[id_col].astype(str)
    df_orig_sub = df_orig_sub.drop_duplicates(subset=[date_col, id_col], keep='first')

    results_sub = results_df[[date_col, id_col] + prediction_cols + [target_var]].copy()
    results_sub[date_col] = pd.to_datetime(results_sub[date_col])
    results_sub[id_col] = results_sub[id_col].astype(str)
    results_sub = results_sub.drop_duplicates(subset=[date_col, id_col], keep='first')

    # Merge predictions (at time t) with market cap (at t) and NEXT month's return (t+1)
    portfolio_data = pd.merge(results_sub, df_orig_sub, on=[date_col, id_col], how='inner')
    print(f"Data for analyse etter merge: {portfolio_data.shape}")

    # me = market equity (market cap) at time t
    # ret_t+1 = raw return realized in month t+1
    # target_ret_t+1 = target variable (excess return t+1)
    portfolio_data = portfolio_data.rename(columns={
        mkt_cap_orig_var: 'me',
        next_ret_var: 'ret_t+1',
        target_var: 'target_ret_t+1'
    })

    # --- Calculate Excess Returns (t+1) for analysis ---
    # The risk-free rate is backed out as rf = ret - target, so the resulting
    # excess return equals the target up to floating-point rounding.
    if 'ret_t+1' in portfolio_data.columns and 'target_ret_t+1' in portfolio_data.columns:
        portfolio_data['rf_t+1'] = portfolio_data['ret_t+1'] - portfolio_data['target_ret_t+1']
        portfolio_data['excess_ret_t+1'] = portfolio_data['ret_t+1'] - portfolio_data['rf_t+1']
        print("  Beregnet excess_ret_t+1 for porteføljeanalyse.")
    else:
        # Reachable when the rename above did not apply, e.g. on a column-name
        # clash between the two inputs that made the merge add suffixes.
        print("FEIL: Kan ikke beregne excess_ret_t+1 - mangler ret_t+1 eller target_ret_t+1.")
        return {}, {}, {}

    # --- Data Cleaning Post-Merge ---
    # Critical columns needed for decile sorts and return calculations
    crit_cols = prediction_cols + ['excess_ret_t+1', 'ret_t+1', 'me']
    initial_rows = len(portfolio_data)
    portfolio_data = portfolio_data.dropna(subset=crit_cols)
    # Ensure valid market cap for weighting and potential filtering
    portfolio_data = portfolio_data[portfolio_data['me'] > 0].copy()
    rows_removed = initial_rows - len(portfolio_data)
    if rows_removed > 0:
        print(f"  Fjernet {rows_removed} rader pga NaNs i kritiske kolonner ({crit_cols}) eller me <= 0.")
    if portfolio_data.empty: print("FEIL: Ingen gyldige data igjen etter sammenslåing og rensing."); return {},{},{}

    # Use Month-Year Period for grouping
    portfolio_data['MonthYear'] = portfolio_data[date_col].dt.to_period('M')

    # --- Decile Sorting and Monthly Returns Calculation ---
    all_monthly_results = []
    monthly_weights_all = []  # Weights per stock/month/model for the turnover calculation
    model_names_processed = []  # Models seen in at least one month with enough stocks
    hl_monthly_dfs_plotting = {}  # H-L returns for plotting
    long_monthly_dfs_plotting = {}  # Long-only (D10) returns for plotting

    unique_months = sorted(portfolio_data['MonthYear'].unique())
    print(f"Itererer gjennom {len(unique_months)} måneder for desil-sortering...")

    # One pass over the month groups instead of re-filtering the full frame per month.
    for month, monthly_data in portfolio_data.groupby('MonthYear', sort=True):
        if filter_small_caps:
            mc_cutoff = monthly_data['me'].quantile(config.SMALL_FIRM_BOTTOM_PERCENT / 100.0)
            monthly_data = monthly_data[monthly_data['me'] > mc_cutoff]

        # Need at least one stock per decile.
        if len(monthly_data) < 10: continue

        for model_pred_col in prediction_cols:
            model_name = model_pred_col.replace('yhat_', '').upper().replace('_', '-')
            if model_name not in model_names_processed: model_names_processed.append(model_name)

            monthly_data_model = monthly_data.dropna(subset=[model_pred_col]).copy()
            if len(monthly_data_model) < 10: continue

            # Ranking with method='first' breaks ties by row order so qcut always gets unique edges.
            monthly_data_model['Rank'] = monthly_data_model[model_pred_col].rank(method='first')
            try:
                monthly_data_model['Decile'] = pd.qcut(monthly_data_model['Rank'], 10, labels=False, duplicates='drop') + 1
            except ValueError: continue

            if monthly_data_model['Decile'].nunique() < 2: continue

            monthly_data_model['ew_weights'] = 1 / monthly_data_model.groupby('Decile')[id_col].transform('size')
            mc_sum_decile = monthly_data_model.groupby('Decile')['me'].transform('sum')
            monthly_data_model['vw_weights'] = np.where(mc_sum_decile > 1e-9, monthly_data_model['me'] / mc_sum_decile, 0)

            # Per-stock contributions; summing within a decile gives the portfolio value.
            monthly_data_model['ew_excess_ret'] = monthly_data_model['excess_ret_t+1'] * monthly_data_model['ew_weights']
            monthly_data_model['vw_excess_ret'] = monthly_data_model['excess_ret_t+1'] * monthly_data_model['vw_weights']
            monthly_data_model['ew_raw_ret'] = monthly_data_model['ret_t+1'] * monthly_data_model['ew_weights']
            monthly_data_model['vw_raw_ret'] = monthly_data_model['ret_t+1'] * monthly_data_model['vw_weights']
            monthly_data_model['ew_pred_ret'] = monthly_data_model[model_pred_col] * monthly_data_model['ew_weights']
            monthly_data_model['vw_pred_ret'] = monthly_data_model[model_pred_col] * monthly_data_model['vw_weights']

            weights_m = monthly_data_model[[id_col, 'Decile', 'ew_weights', 'vw_weights']].copy()
            weights_m['Model'] = model_name
            weights_m['MonthYear'] = month
            monthly_weights_all.append(weights_m)

            agg_results = monthly_data_model.groupby('Decile').agg(
                ew_excess_ret = ('ew_excess_ret', 'sum'),
                vw_excess_ret = ('vw_excess_ret', 'sum'),
                ew_raw_ret = ('ew_raw_ret', 'sum'),
                vw_raw_ret = ('vw_raw_ret', 'sum'),
                ew_pred_ret = ('ew_pred_ret', 'sum'),
                vw_pred_ret = ('vw_pred_ret', 'sum'),
                n_stocks = (id_col, 'size')
            ).reset_index()

            agg_results['MonthYear'] = month
            agg_results['Model'] = model_name
            all_monthly_results.append(agg_results)

    if not all_monthly_results: print("FEIL: Ingen månedsresultater ble generert."); return {},{},{}

    # --- Combine Monthly Results and Calculate Turnover ---
    combined_results_df = pd.concat(all_monthly_results).reset_index(drop=True)
    turnover_results = {
        mn: {'ew': np.nan, 'vw': np.nan, 'long_ew': np.nan, 'long_vw': np.nan}
        for mn in model_names_processed
    }

    if monthly_weights_all:
        all_weights_df = pd.concat(monthly_weights_all)
        print(f"\nBeregner porteføljeomsetning (turnover) for {len(model_names_processed)} modeller...")
        for mn in model_names_processed:
            model_weights = all_weights_df[all_weights_df['Model'] == mn]
            if model_weights.empty: continue

            long_weights = model_weights[model_weights['Decile'] == 10]
            # Short leg enters the H-L portfolio with negative weights.
            short_weights = model_weights[model_weights['Decile'] == 1].assign(
                ew_weights=lambda x: -x.ew_weights, vw_weights=lambda x: -x.vw_weights)
            hl_weights = pd.concat([long_weights, short_weights])

            # H-L Turnover
            turnover_results[mn]['ew'] = _mean_one_way_turnover(hl_weights, id_col, 'ew_weights')
            turnover_results[mn]['vw'] = _mean_one_way_turnover(hl_weights, id_col, 'vw_weights')
            # Long-Only Turnover
            turnover_results[mn]['long_ew'] = _mean_one_way_turnover(long_weights, id_col, 'ew_weights')
            turnover_results[mn]['long_vw'] = _mean_one_way_turnover(long_weights, id_col, 'vw_weights')
        print("Omsetningsberegning fullført.")
    else: print("Advarsel: Ingen vektdata funnet, kan ikke beregne omsetning.")

    # --- Aggregate Performance and Generate Tables/Plots ---
    decile_tables = {}
    hl_risk_tables = {}
    long_risk_tables = {}
    performance_summary_list = []
    hl_return_cols = ('ew_excess_ret', 'vw_excess_ret', 'ew_raw_ret', 'vw_raw_ret', 'ew_pred_ret', 'vw_pred_ret')

    print(f"\nGenererer ytelsestabeller for {len(model_names_processed)} modeller...")
    for model_name in model_names_processed:
        model_results = combined_results_df[combined_results_df['Model'] == model_name].copy()
        if model_results.empty: continue

        # Decile Performance
        decile_perf = model_results.groupby('Decile').agg(
            ew_pred_mean=('ew_pred_ret', 'mean'), vw_pred_mean=('vw_pred_ret', 'mean'),
            ew_excess_mean=('ew_excess_ret', 'mean'), vw_excess_mean=('vw_excess_ret', 'mean'),
            ew_raw_std=('ew_raw_ret', 'std'), vw_raw_std=('vw_raw_ret', 'std'),  # Use raw std for SR
            n_months=('MonthYear', 'nunique'), avg_stocks=('n_stocks', 'mean')
        ).reset_index()
        decile_perf['ew_sharpe'] = (decile_perf['ew_excess_mean'] / decile_perf['ew_raw_std']) * np.sqrt(annualization_factor)
        decile_perf['vw_sharpe'] = (decile_perf['vw_excess_mean'] / decile_perf['vw_raw_std']) * np.sqrt(annualization_factor)

        # H-L Performance
        hl_stats_df = pd.DataFrame(); hl_monthly = pd.DataFrame()
        mdd_ew_hl = mdd_vw_hl = np.nan
        if 1 in model_results['Decile'].values and 10 in model_results['Decile'].values:
            long_monthly = model_results[model_results['Decile'] == 10].set_index('MonthYear')
            short_monthly = model_results[model_results['Decile'] == 1].set_index('MonthYear')
            # Only months where both legs exist contribute to the spread.
            common_index = long_monthly.index.intersection(short_monthly.index)
            if not common_index.empty:
                hl_monthly = pd.DataFrame({
                    f'{col}_HL': long_monthly.loc[common_index, col].sub(short_monthly.loc[common_index, col], fill_value=0)
                    for col in hl_return_cols
                }).reset_index()
                hl_monthly_dfs_plotting[model_name] = hl_monthly.copy()

                ew_excess_mean_hl = hl_monthly['ew_excess_ret_HL'].mean(); vw_excess_mean_hl = hl_monthly['vw_excess_ret_HL'].mean()
                ew_raw_std_hl = hl_monthly['ew_raw_ret_HL'].std(); vw_raw_std_hl = hl_monthly['vw_raw_ret_HL'].std()
                ew_sharpe_hl = (ew_excess_mean_hl / ew_raw_std_hl) * np.sqrt(annualization_factor) if ew_raw_std_hl > 1e-9 else np.nan
                vw_sharpe_hl = (vw_excess_mean_hl / vw_raw_std_hl) * np.sqrt(annualization_factor) if vw_raw_std_hl > 1e-9 else np.nan
                mdd_ew_hl = MDD(hl_monthly['ew_excess_ret_HL']); mdd_vw_hl = MDD(hl_monthly['vw_excess_ret_HL'])

                hl_stats_df = pd.DataFrame({
                    'ew_pred_mean': [hl_monthly['ew_pred_ret_HL'].mean()], 'vw_pred_mean': [hl_monthly['vw_pred_ret_HL'].mean()],
                    'ew_excess_mean': [ew_excess_mean_hl], 'vw_excess_mean': [vw_excess_mean_hl],
                    'ew_raw_std': [ew_raw_std_hl], 'vw_raw_std': [vw_raw_std_hl],
                    'n_months': [len(hl_monthly)], 'ew_sharpe': [ew_sharpe_hl], 'vw_sharpe': [vw_sharpe_hl],
                    'avg_stocks': [np.nan], 'Decile': ['H-L']
                })

        # Skip the empty placeholder so concat never sees an all-empty frame.
        summary_parts = [decile_perf] if hl_stats_df.empty else [decile_perf, hl_stats_df]
        model_summary = pd.concat(summary_parts, ignore_index=True)
        performance_summary_list.append(model_summary)

        # Generate and Print Decile Tables
        ew_table = _format_decile_table(model_summary, 'EW'); decile_tables[f'{model_name}_EW'] = ew_table
        vw_table = _format_decile_table(model_summary, 'VW'); decile_tables[f'{model_name}_VW'] = vw_table
        print(f"\n--- Ytelsestabell (Desiler): {model_name} - EW ---"); print(ew_table)
        print(f"\n--- Ytelsestabell (Desiler): {model_name} - VW ---"); print(vw_table)

        model_turnover = turnover_results.get(model_name, {})

        # Generate and Print H-L Risk/Performance Table
        if not hl_stats_df.empty and not hl_monthly.empty:
            hl_res = hl_stats_df.iloc[0]
            max_loss_1m_ew_hl = hl_monthly['ew_excess_ret_HL'].min() * 100
            max_loss_1m_vw_hl = hl_monthly['vw_excess_ret_HL'].min() * 100
            ew_data_hl = {f'{model_name} H-L EW': _risk_column(hl_res, 'ew', mdd_ew_hl, max_loss_1m_ew_hl, model_turnover.get('ew', np.nan))}
            vw_data_hl = {f'{model_name} H-L VW': _risk_column(hl_res, 'vw', mdd_vw_hl, max_loss_1m_vw_hl, model_turnover.get('vw', np.nan))}
            ew_chart_hl = _format_risk_table(ew_data_hl, _RISK_TABLE_ROWS); hl_risk_tables[f'{model_name}_EW'] = ew_chart_hl
            vw_chart_hl = _format_risk_table(vw_data_hl, _RISK_TABLE_ROWS); hl_risk_tables[f'{model_name}_VW'] = vw_chart_hl
            print(f"\n--- H-L Portefølje Risk/Performance ({model_name} EW) ---"); print(ew_chart_hl)
            print(f"\n--- H-L Portefølje Risk/Performance ({model_name} VW) ---"); print(vw_chart_hl)

        # Generate and Print Long-Only (Decile 10) Risk/Performance Table
        long_res_row = decile_perf[decile_perf['Decile'] == 10]
        if not long_res_row.empty:
            long_res = long_res_row.iloc[0]
            long_monthly = model_results[model_results['Decile'] == 10].set_index('MonthYear')
            if not long_monthly.empty: long_monthly_dfs_plotting[model_name] = long_monthly.reset_index()
            mdd_ew_long = MDD(long_monthly['ew_excess_ret']) if not long_monthly.empty else np.nan
            mdd_vw_long = MDD(long_monthly['vw_excess_ret']) if not long_monthly.empty else np.nan
            max_loss_1m_ew_long = long_monthly['ew_excess_ret'].min() * 100 if not long_monthly.empty else np.nan
            max_loss_1m_vw_long = long_monthly['vw_excess_ret'].min() * 100 if not long_monthly.empty else np.nan
            ew_data_long = {f'{model_name} Long EW': _risk_column(long_res, 'ew', mdd_ew_long, max_loss_1m_ew_long, model_turnover.get('long_ew', np.nan))}
            vw_data_long = {f'{model_name} Long VW': _risk_column(long_res, 'vw', mdd_vw_long, max_loss_1m_vw_long, model_turnover.get('long_vw', np.nan))}
            ew_chart_long = _format_risk_table(ew_data_long, _RISK_TABLE_ROWS); long_risk_tables[f'{model_name}_EW'] = ew_chart_long
            vw_chart_long = _format_risk_table(vw_data_long, _RISK_TABLE_ROWS); long_risk_tables[f'{model_name}_VW'] = vw_chart_long
            print(f"\n--- Long-Only (D10) Risk/Performance ({model_name} EW) ---"); print(ew_chart_long)
            print(f"\n--- Long-Only (D10) Risk/Performance ({model_name} VW) ---"); print(vw_chart_long)
        else: print(f"  Advarsel: Ingen data for Desil 10 funnet for modell {model_name}.")

    # Plotting Cumulative Returns
    _plot_cumulative_excess_returns(
        hl_monthly_dfs_plotting, 'ew_excess_ret_HL', 'vw_excess_ret_HL', 'H-L',
        'Kumulativ Excess Avkastning (H-L Portefølje, t+1)')
    _plot_cumulative_excess_returns(
        long_monthly_dfs_plotting, 'ew_excess_ret', 'vw_excess_ret', 'Long',
        'Kumulativ Excess Avkastning (Long-Only Portefølje [D10], t+1)')

    print("--- Detaljert Porteføljeanalyse Fullført ---")
    return decile_tables, hl_risk_tables, long_risk_tables


# === Stage 8: Variable Importance ===
# (Keep calculate_variable_importance as is)
def calculate_variable_importance(model_name, fitted_model, X_eval, y_eval, features, base_r2_is, vi_method='permutation_zero', model_params=None):
    """Estimate feature importance by zeroing one feature at a time and refitting.

    For every feature, the matching column of ``X_eval`` is set to zero, a fresh
    model of type ``model_name`` is fitted on the modified data, and its R²
    (``1 - SS_res / sum(y**2)`` over the finite targets) is compared with
    ``base_r2_is``. Negative reductions are clipped to zero and the remaining
    reductions are normalised so they sum to one.

    Args:
        model_name: Model key ('OLS', 'OLS3H', 'PLS', 'PCR', 'ENET', 'GLM_H',
            'RF', 'GBRT_H'). Any other key yields zero importance everywhere.
        fitted_model: The model fitted on the unmodified data; only used to
            recover the hyperparameters for the refits.
        X_eval: 2-D array with one column per entry in ``features``.
        y_eval: 1-D target array aligned with the rows of ``X_eval``.
        features: Feature names, in column order.
        base_r2_is: R² of the unmodified model on the same sample.
        vi_method: Only 'permutation_zero' is supported.
        model_params: Optional hyperparameters used for the refits.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``. An empty
        DataFrame is returned for an unsupported method or a missing model;
        all-zero importances are returned when the inputs are unusable.
    """
    if vi_method != 'permutation_zero':
        print(f"    FEIL: VI metode '{vi_method}' støttes ikke.")
        return pd.DataFrame()
    if fitted_model is None:
        print(f"    FEIL: Ingen modell for VI.")
        return pd.DataFrame()
    if pd.isna(base_r2_is):
        print(f"    ADVARSEL: Basis IS R2 NaN.")
        return pd.DataFrame({'Feature': features, 'Importance': 0.0})
    if len(features) == 0 or X_eval.shape[0] == 0 or y_eval.shape[0] == 0 or X_eval.shape[1] != len(features):
        print(f"    FEIL: Ugyldige VI data dim.")
        return pd.DataFrame({'Feature': features, 'Importance': 0.0})

    finite_target = np.isfinite(y_eval)
    y_eval_finite = y_eval[finite_target]
    ss_tot_zero = np.sum(y_eval_finite**2)
    if ss_tot_zero < 1e-15:
        return pd.DataFrame({'Feature': features, 'Importance': 0.0})

    # Recover the hyperparameters to use for every refit.
    params_retrain = model_params if model_params else {}
    if model_name == 'ENET' and hasattr(fitted_model, 'alpha_'):
        # The ENET config holds cross-validation settings (alpha grid, folds, ...)
        # that ElasticNet itself rejects; keep only keys the estimator accepts
        # and let the fitted alpha / l1_ratio take precedence over the config.
        enet_allowed = set(ElasticNet().get_params()) - {'fit_intercept'}
        enet_config = {k: v for k, v in config.MODEL_PARAMS.get('ENET', {}).items() if k in enet_allowed}
        params_retrain = {**enet_config, 'alpha': fitted_model.alpha_, 'l1_ratio': fitted_model.l1_ratio_}
    elif model_name == 'PLS' and hasattr(fitted_model, 'n_components'):
        params_retrain = {'n_components': fitted_model.n_components, 'scale': False}
    elif model_name == 'PCR' and hasattr(fitted_model, 'named_steps'):
        try:
            params_retrain = {'n_components': fitted_model.named_steps['pca'].n_components_}
        except KeyError:
            params_retrain = {'n_components': 1}
    elif model_name in ['GLM_H', 'RF', 'GBRT_H'] and hasattr(fitted_model, 'get_params'):
        params_retrain = fitted_model.get_params()
        if model_params:
            params_retrain.update(model_params)  # Use optimal params if provided
    elif model_name == 'OLS3H':
        params_retrain = {k: v for k, v in config.MODEL_PARAMS.get('OLS3H', {}).items() if k != 'M'}

    importance_results = {}
    for idx, feat_name in enumerate(features):
        X_permuted = X_eval.copy()
        X_permuted[:, idx] = 0
        permuted_model = None
        permuted_preds = None
        # NaN marks "refit failed"; it maps to zero importance below instead of
        # producing an infinite reduction that would poison the normalisation.
        permuted_r2 = np.nan
        try:
            if model_name == 'OLS':
                permuted_model = LinearRegression(fit_intercept=True).fit(X_permuted, y_eval)
            elif model_name == 'OLS3H' and sm:
                X_perm_c = sm.add_constant(X_permuted)
                permuted_model_rlm = sm.RLM(y_eval, X_perm_c, M=sm.robust.norms.HuberT())
                permuted_model = permuted_model_rlm.fit(**params_retrain)
                permuted_preds = permuted_model.predict(X_perm_c)
            elif model_name == 'PLS':
                permuted_model = PLSRegression(**params_retrain).fit(X_permuted, y_eval)
            elif model_name == 'PCR':
                permuted_model = Pipeline([('pca', PCA(n_components=params_retrain.get('n_components', 1))), ('lr', LinearRegression())]).fit(X_permuted, y_eval)
            elif model_name == 'ENET':
                # Merge through a dict so a 'fit_intercept' key in the params
                # cannot collide with the explicit setting.
                permuted_model = ElasticNet(**{**params_retrain, 'fit_intercept': True}).fit(X_permuted, y_eval)
            elif model_name == 'GLM_H':
                # get_params() already contains 'fit_intercept'; passing it again
                # as a keyword would raise TypeError on every refit.
                permuted_model = HuberRegressor(**{**params_retrain, 'fit_intercept': True}).fit(X_permuted, y_eval)
            elif model_name == 'RF':
                permuted_model = RandomForestRegressor(**params_retrain).fit(X_permuted, y_eval)
            elif model_name == 'GBRT_H':
                permuted_model = GradientBoostingRegressor(**params_retrain).fit(X_permuted, y_eval)

            if permuted_model is not None and model_name != 'OLS3H':
                permuted_preds = permuted_model.predict(X_permuted).flatten()
            if permuted_preds is not None:
                preds_finite = permuted_preds[finite_target]
                if len(preds_finite) == len(y_eval_finite) and np.all(np.isfinite(preds_finite)):
                    ss_res_permuted = np.sum((y_eval_finite - preds_finite)**2)
                    permuted_r2 = 1.0 - (ss_res_permuted / ss_tot_zero)
        except Exception as e:
            print(f"    ADVARSEL: Unntak VI '{feat_name}' i {model_name}: {e}")
        reduction = base_r2_is - permuted_r2
        importance_results[feat_name] = max(0, reduction) if np.isfinite(reduction) else 0.0

    if not importance_results:
        return pd.DataFrame({'Feature': features, 'Importance': 0.0})
    importance_df = pd.DataFrame(list(importance_results.items()), columns=['Feature', 'R2_Reduction'])
    total_reduction = importance_df['R2_Reduction'].sum()
    importance_df['Importance'] = importance_df['R2_Reduction'] / total_reduction if total_reduction > 1e-9 else 0.0
    return importance_df[['Feature', 'Importance']]


# === Stage 9: Complexity Plotting ===
# Prints and plots the per-window tuned hyperparameters for each model.
def plot_time_varying_complexity(model_metrics, complexity_params_to_plot):
    """Print and plot how each tuned hyperparameter evolves across windows.

    Args:
        model_metrics: Mapping ``model name -> {metric key -> list of values}``
            with one value per window (``None``/NaN entries are skipped).
        complexity_params_to_plot: Mapping ``model name -> list of metric keys``
            to visualise. Models absent from ``model_metrics`` are ignored.

    Returns:
        None. A table is printed and a matplotlib figure is shown for every
        metric that has at least one valid value.
    """
    print("\n--- 9. Plotter Tidsvarierende Modellkompleksitet ---")
    plotted_any = False
    for model_name, param_keys in complexity_params_to_plot.items():
        if model_name not in model_metrics:
            continue
        print(f"\n  --- Modell: {model_name} ---")
        model_data = model_metrics[model_name]
        for param_key in param_keys:
            if param_key not in model_data:
                print(f"    Metrikk '{param_key}' ikke funnet for {model_name}.")
                continue

            # Build the label before inspecting the values: the "no valid
            # values" message below needs it as well as the plotting path.
            param_label = param_key.replace('optim_', '').replace('_', ' ').title()
            values = model_data[param_key]
            # Window numbers are 1-based in the printed table and on the x-axis.
            valid_data = [(i + 1, v) for i, v in enumerate(values) if v is not None and pd.notna(v)]
            if not valid_data:
                print(f"    Ingen gyldige verdier funnet for '{param_label}' for {model_name}.")
                continue

            plotted_any = True
            windows, param_values = zip(*valid_data)
            data_table = pd.DataFrame({param_label: param_values}, index=pd.Index(windows, name='Vindu Nr.'))
            print(f"    Optimal {param_label} per Vindu:")
            print(data_table.round(4))

            plt.figure(figsize=(10, 5))
            plt.plot(windows, param_values, marker='o', linestyle='-')
            plot_title = f"Tidsvariasjon i Optimal {param_label} for {model_name}"
            y_axis_label = f"Optimal {param_label}"
            if 'alpha' in param_key.lower() or 'lambda' in param_key.lower():
                # Penalty strengths are shown on a log axis when all values are strictly positive.
                if all(v > 1e-9 for v in param_values):
                    plt.yscale('log')
                    y_axis_label += " (Log Skala)"
                else:
                    print(f"    Advarsel: Kan ikke bruke log-skala for {param_label}.")
            plt.xlabel("Vindu Nr.")
            plt.ylabel(y_axis_label)
            plt.title(plot_title)
            plt.grid(True, which='both', linestyle='--', linewidth=0.5)
            plt.tight_layout()
            plt.show()
    if not plotted_any:
        print("  Ingen kompleksitetsparametere å plotte.")


# === Stage 10: Reporting & Saving ===
# Builds the per-model summary table and writes result DataFrames to CSV files.
def create_summary_table(model_metrics, annualization_factor=12):
    """Collect the headline metrics of every model into one printed table.

    Args:
        model_metrics: Mapping ``model name -> metrics dict``. The keys read
            are 'is_r2_train_val', 'oos_r2', 'oos_sharpe' (lists averaged with
            NaNs ignored), 'oos_r2_overall_gu' (scalar) and every key starting
            with 'optim_' (numeric entries are averaged).
        annualization_factor: Accepted for interface compatibility; not used
            by this function.

    Returns:
        DataFrame indexed by 'Modell', or an empty DataFrame when there is
        nothing to report. The table is also printed.
    """
    print("\n--- 10a. Lager Oppsummerende Resultattabell ---")
    preferred_order = ['OLS','OLS3H','PLS','PCR','ENET','GLM_H','RF','GBRT_H','NN1','NN2','NN3','NN4','NN5']
    available = list(model_metrics.keys())
    # Known models first in canonical order, everything else alphabetically.
    ordered_models = [m for m in preferred_order if m in available]
    ordered_models += [m for m in sorted(available) if m not in preferred_order]
    if not ordered_models:
        print("Ingen modelldata funnet.")
        return pd.DataFrame()

    def _window_mean(metric_dict, key):
        """NaN-ignoring mean of a per-window list; NaN when missing or empty."""
        series = metric_dict.get(key)
        if series:
            return np.nanmean(series)
        return np.nan

    def _as_percent(value):
        """Scale a ratio to percent, keeping missing values as NaN."""
        if pd.notna(value):
            return value * 100
        return np.nan

    rows = []
    for model_name in ordered_models:
        metrics = model_metrics[model_name]
        mean_is_r2 = _window_mean(metrics, 'is_r2_train_val')
        mean_oos_r2 = _window_mean(metrics, 'oos_r2')
        mean_sharpe = _window_mean(metrics, 'oos_sharpe')
        pooled_oos_r2 = metrics.get('oos_r2_overall_gu', np.nan)

        # Average each tuned hyperparameter over the windows where it is numeric.
        tuned_fragments = []
        for key, window_values in metrics.items():
            if not (key.startswith('optim_') and window_values):
                continue
            numeric_values = [
                item for item in window_values
                if isinstance(item, (int, float, np.number)) and pd.notna(item)
            ]
            if not numeric_values:
                continue
            try:
                mean_val = np.nanmean(numeric_values)
                if not np.isnan(mean_val):
                    tuned_fragments.append(f"{key.replace('optim_', '')}={mean_val:.2g}")
            except Exception as e_mean:
                print(f"  Advarsel: mean calc error for {key} {model_name}: {e_mean}.")

        rows.append({
            'Modell': model_name,
            'Avg IS R² (%)': _as_percent(mean_is_r2),
            'Avg Window OOS R² (%)': _as_percent(mean_oos_r2),
            'Overall OOS R² (%)': _as_percent(pooled_oos_r2),
            'Avg Pred Sharpe (OOS)': mean_sharpe if pd.notna(mean_sharpe) else np.nan,
            'Avg Optim Params': ", ".join(tuned_fragments),
        })

    if not rows:
        print("Ingen data å inkludere.")
        return pd.DataFrame()
    summary_df = pd.DataFrame(rows).set_index('Modell')
    print("\n--- Oppsummeringstabell ---")
    print(summary_df.to_string(float_format=lambda x: f"{x:.3f}" if pd.notna(x) else "N/A", na_rep="N/A"))
    return summary_df

def save_results(output_dir, subset_label, results_dict):
    """Write every DataFrame found in ``results_dict`` to CSV under a subset folder.

    Args:
        output_dir: Root output directory.
        subset_label: Sub-folder name (created when missing).
        results_dict: Mapping ``name -> DataFrame`` or ``name -> {sub_name ->
            DataFrame}``. Top-level frames are saved as ``<name>.csv``, nested
            ones as ``<name>_<sub_name>.csv``. Empty frames, deeper nesting and
            other types are skipped with a message.

    Returns:
        None. Failures for a single entry are reported and do not stop the
        remaining entries from being saved.
    """
    print(f"\n--- 10b. Lagrer Resultater for Subset: {subset_label} ---")
    subset_dir = os.path.join(output_dir, subset_label)
    try:
        if not os.path.exists(subset_dir):
            os.makedirs(subset_dir)
            print(f"  Opprettet mappe: {subset_dir}")
    except OSError as e:
        print(f"  FEIL: Kunne ikke opprette mappe {subset_dir}: {e}")
        return

    def _store_entry(name, data):
        """Persist one top-level entry; raises whatever ``to_csv`` raises."""
        base_filename = os.path.join(subset_dir, f"{name}")
        if isinstance(data, pd.DataFrame):
            if data.empty:
                print(f"  -> Hoppet over tom DataFrame: {name}")
                return
            filename = f"{base_filename}.csv"
            data.to_csv(filename)
            print(f"  -> Lagret DataFrame: {filename}")
            return
        if not isinstance(data, dict):
            print(f"  -> Hoppet over ukjent datatype: {name} (Type: {type(data)})")
            return
        for sub_name, sub_data in data.items():
            if isinstance(sub_data, pd.DataFrame):
                # Empty nested frames are skipped silently.
                if not sub_data.empty:
                    sub_filename = f"{base_filename}_{sub_name}.csv"
                    sub_data.to_csv(sub_filename)
                    print(f"  -> Lagret Dict->DataFrame: {sub_filename}")
            elif isinstance(sub_data, dict):
                print(f"  -> Hoppet over Dict->Dict: {name}_{sub_name}")
            else:
                print(f"  -> Hoppet over ukjent datatype i dict: {name}_{sub_name} (Type: {type(sub_data)})")

    for entry_name, entry_data in results_dict.items():
        try:
            _store_entry(entry_name, entry_data)
        except Exception as e:
            print(f"  FEIL under lagring av '{entry_name}' til '{subset_dir}': {e}")
            traceback.print_exc(limit=1)
    print(f"--- Lagring for {subset_label} fullført (eller forsøkt) ---")

# File 4: main_runner.py

In [None]:
# --- main_runner.py ---
# Main orchestration script for running the ML asset pricing pipeline.
# Imports config and utils, defines model training logic, runs the pipeline loops.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
import time
import traceback
from collections import defaultdict
import random
import re

# --- Import Configuration & Utilities ---
import config
import pipeline_utils as utils

# --- Import Model Specific Libraries ---
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, KFold
from sklearn.metrics import mean_squared_error

# Optional dependency: statsmodels. STATSMODELS_AVAILABLE records whether the
# import succeeded so callers can check it before touching `sm`.
try:
    import statsmodels.api as sm
    STATSMODELS_AVAILABLE = True
except ImportError: STATSMODELS_AVAILABLE = False; print("ADVARSEL: Statsmodels ikke funnet.")
# Optional dependency: TensorFlow/Keras. When the import succeeds, the Python,
# NumPy and TensorFlow random generators are all seeded from config.TF_SEED and
# memory growth is requested for every visible GPU.
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, regularizers, callbacks, backend as K
    from tensorflow.keras.optimizers import Adam # Correct specific import if needed
    TENSORFLOW_AVAILABLE = True
    # Seed every random source and request deterministic behaviour.
    os.environ['PYTHONHASHSEED']=str(config.TF_SEED)
    os.environ['TF_CUDNN_DETERMINISTIC']='1' # Note: May impact performance
    random.seed(config.TF_SEED)
    np.random.seed(config.TF_SEED)
    tf.random.set_seed(config.TF_SEED)
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True)
            print(f"GPUs funnet ({len(gpus)}), minnevekst aktivert.")
        # A RuntimeError here is only reported; execution continues.
        except RuntimeError as e: print(f"Kunne ikke sette minnevekst for GPU: {e}")
except ImportError: TENSORFLOW_AVAILABLE = False; print("ADVARSEL: TensorFlow/Keras ikke funnet.")

# ==========================================================================
# --- MODEL TRAINING/EVALUATION FUNCTIONS (Per Window) ---
# Each train_evaluate_* function returns (fitted_model, preds_oos, preds_is, optimal_params);
# on failure it returns (None, empty array, empty array, {}).
# Define OLS
def train_evaluate_ols(X_train_val, y_train_val, X_test, model_params):
    """Fit ordinary least squares on the train+validation sample and predict.

    Args:
        X_train_val, y_train_val: Estimation sample.
        X_test: Out-of-sample features; may contain zero rows.
        model_params: Accepted for a uniform interface; not used here.

    Returns:
        ``(model, preds_oos, preds_is, {})``. On any failure the error is
        printed and ``(None, empty array, empty array, {})`` is returned.
    """
    try:
        ols = LinearRegression(fit_intercept=True)
        ols.fit(X_train_val, y_train_val)
        if X_test.shape[0] > 0:
            oos_forecast = ols.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = ols.predict(X_train_val)
        return ols, oos_forecast, in_sample_fit, {}
    except Exception as e:
        print(f"    FEIL OLS: {e}")
        return None, np.array([]), np.array([]), {}

# Define OLS3H
def train_evaluate_ols3h(X_train_val, y_train_val, X_test, model_params):
    """Fit a Huber-robust linear model (statsmodels RLM) and predict.

    Args:
        X_train_val, y_train_val: Estimation sample.
        X_test: Out-of-sample features; may contain zero rows.
        model_params: Only the keys 'maxiter' and 'tol' are forwarded to
            ``RLM.fit``; everything else is ignored.

    Returns:
        ``(fitted_model, preds_oos, preds_is, optim_params)`` where
        ``optim_params`` records the norm name and the fit options used.
        Returns ``(None, empty array, empty array, {})`` when statsmodels is
        unavailable or fitting fails (the error is printed).
    """
    if not STATSMODELS_AVAILABLE:
        return None, np.array([]), np.array([]), {}
    try:
        # has_constant='add' forces an intercept column in BOTH matrices. With
        # the default ('skip'), add_constant silently adds nothing when the
        # data already holds a constant column, which is always the case for a
        # single-row test set, so the test matrix would end up one column
        # narrower than the training matrix and predict() would fail.
        X_tv_const = sm.add_constant(X_train_val, prepend=True, has_constant='add')
        X_test_const = sm.add_constant(X_test, prepend=True, has_constant='add') if X_test.shape[0] > 0 else None
        rlm_model = sm.RLM(y_train_val, X_tv_const, M=sm.robust.norms.HuberT())
        valid_fit_params = {k: v for k, v in model_params.items() if k in ['maxiter', 'tol']}
        fitted_model = rlm_model.fit(**valid_fit_params)
        preds_oos = fitted_model.predict(X_test_const) if X_test_const is not None else np.array([])
        preds_is = fitted_model.predict(X_tv_const)
        optim_params = {'M': 'HuberT', **valid_fit_params}
        return fitted_model, preds_oos, preds_is, optim_params
    except Exception as e:
        print(f"    FEIL OLS3H: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define _tune_simple_model helper
def _tune_simple_model(ModelClass, X_train, y_train, X_val, y_val, param_grid_dict):
    """Pick the grid value with the lowest validation MSE for a one-parameter model.

    Args:
        ModelClass: ``Pipeline`` (interpreted as PCA followed by linear
            regression) or a class constructed with the tuned parameter plus
            ``scale=False``.
        X_train, y_train: Sample used to fit each candidate.
        X_val, y_val: Sample used to score each candidate.
        param_grid_dict: Dict whose first key names the tuned parameter and
            maps to the candidate values.

    Returns:
        The best candidate value, or ``None`` when every candidate failed
        (a message is printed in that case).
    """
    param_name = list(param_grid_dict.keys())[0]
    feature_count = X_train.shape[1]  # Simplified constraint for example
    # Keep values in (0, n_features]; fall back to a single component if none qualify.
    candidates = [p for p in param_grid_dict[param_name] if 0 < p <= feature_count] or [1]

    lowest_mse = np.inf
    winner = None
    for candidate_value in candidates:
        try:
            if ModelClass == Pipeline:
                candidate = Pipeline([('pca', PCA(n_components=candidate_value)), ('lr', LinearRegression())])
            else:
                candidate = ModelClass(**{param_name: candidate_value, 'scale': False})
            candidate.fit(X_train, y_train)
            val_forecast = candidate.predict(X_val).flatten()
            if not np.all(np.isfinite(val_forecast)):
                continue
            mse = mean_squared_error(y_val, val_forecast)
            if not np.isnan(mse) and mse < lowest_mse:
                lowest_mse = mse
                winner = candidate_value
        except Exception:
            # A failing candidate is simply skipped.
            continue
    if winner is None:
        print(f"    FEIL: Tuning mislyktes for {ModelClass.__name__}.")
    return winner

# Define PLS
def train_evaluate_pls(X_train, y_train, X_val, y_val, X_test, model_params):
    """Tune the number of PLS components on the validation set, then refit.

    The component count is chosen from ``model_params['n_components_grid']``
    by validation MSE; the final model is fitted on train+validation.

    Returns:
        ``(model, preds_oos, preds_is, {'n_components': best})``, or
        ``(None, empty array, empty array, {})`` after printing the error.
    """
    try:
        grid = {'n_components': model_params['n_components_grid']}
        chosen_n = _tune_simple_model(PLSRegression, X_train, y_train, X_val, y_val, grid)
        if chosen_n is None:
            raise ValueError("PLS tuning failed.")
        tuned = {'n_components': chosen_n}

        X_train_val = np.vstack((X_train, X_val))
        y_train_val = np.concatenate((y_train, y_val))
        pls = PLSRegression(n_components=chosen_n, scale=False)
        pls.fit(X_train_val, y_train_val)

        if X_test.shape[0] > 0:
            oos_forecast = pls.predict(X_test).flatten()
        else:
            oos_forecast = np.array([])
        in_sample_fit = pls.predict(X_train_val).flatten()
        return pls, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL PLS: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define PCR
def train_evaluate_pcr(X_train, y_train, X_val, y_val, X_test, model_params):
    """Tune the number of principal components for PCR, then refit.

    The component count is chosen from ``model_params['n_components_grid']``
    by validation MSE; the final PCA + linear-regression pipeline is fitted on
    train+validation.

    Returns:
        ``(pipeline, preds_oos, preds_is, {'n_components': best})``, or
        ``(None, empty array, empty array, {})`` after printing the error.
    """
    try:
        grid = {'n_components': model_params['n_components_grid']}
        chosen_n = _tune_simple_model(Pipeline, X_train, y_train, X_val, y_val, grid)
        if chosen_n is None:
            raise ValueError("PCR tuning failed.")
        tuned = {'n_components': chosen_n}

        pcr = Pipeline([('pca', PCA(n_components=chosen_n)), ('lr', LinearRegression())])
        X_train_val = np.vstack((X_train, X_val))
        y_train_val = np.concatenate((y_train, y_val))
        pcr.fit(X_train_val, y_train_val)

        if X_test.shape[0] > 0:
            oos_forecast = pcr.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = pcr.predict(X_train_val)
        return pcr, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL PCR: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define ENET
def train_evaluate_enet(X_train, y_train, X_test, model_params):
    """Fit an elastic net with cross-validated alpha and l1_ratio.

    Args:
        X_train, y_train: Sample used for both cross-validation and the fit.
        X_test: Out-of-sample features; may contain zero rows.
        model_params: Requires 'cv_folds', 'alphas' and 'l1_ratio'; 'n_jobs',
            'max_iter' and 'tol' are optional (defaults -1, 1000, 0.001).

    Returns:
        ``(fitted ElasticNetCV, preds_oos, preds_is, {'alpha', 'l1_ratio'})``,
        or ``(None, empty array, empty array, {})`` after printing the error.
    """
    try:
        folds = KFold(n_splits=model_params['cv_folds'], shuffle=True, random_state=config.GENERAL_SEED)
        searcher = ElasticNetCV(
            alphas=model_params['alphas'],
            l1_ratio=model_params['l1_ratio'],
            fit_intercept=True,
            cv=folds,
            n_jobs=model_params.get('n_jobs', -1),
            max_iter=model_params.get('max_iter', 1000),
            tol=model_params.get('tol', 0.001),
            random_state=config.GENERAL_SEED,
        )
        searcher.fit(X_train, y_train)
        tuned = {'alpha': searcher.alpha_, 'l1_ratio': searcher.l1_ratio_}

        # The fitted CV object is itself the final model.
        if X_test.shape[0] > 0:
            oos_forecast = searcher.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = searcher.predict(X_train)
        return searcher, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL ENET: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define GLM_H
def train_evaluate_glm_h(X_train, y_train, X_val, y_val, X_test, model_params):
    """Tune a Huber regressor on the validation set, then refit on train+validation.

    Every combination in ``model_params['param_grid']`` is fitted on the
    training sample and scored by validation MSE; combinations that raise or
    give non-finite predictions are skipped. ``model_params['max_iter']``
    defaults to 300.

    Returns:
        ``(model, preds_oos, preds_is, best_params)``, or
        ``(None, empty array, empty array, {})`` when tuning or the final fit fails.
    """
    candidates = list(ParameterGrid(model_params['param_grid']))
    max_iter = model_params.get('max_iter', 300)

    lowest_mse = np.inf
    winning_params = None
    for params in candidates:
        try:
            trial = HuberRegressor(fit_intercept=True, **params, max_iter=max_iter)
            trial.fit(X_train, y_train)
            val_forecast = trial.predict(X_val)
            if not np.all(np.isfinite(val_forecast)):
                continue
            mse = mean_squared_error(y_val, val_forecast)
            if not np.isnan(mse) and mse < lowest_mse:
                lowest_mse = mse
                winning_params = params
        except Exception:
            # A failing grid point is simply skipped.
            continue

    if winning_params is None:
        print("    FEIL: GLM_H tuning mislyktes.")
        return None, np.array([]), np.array([]), {}

    tuned = winning_params.copy()
    try:
        X_train_val = np.vstack((X_train, X_val))
        y_train_val = np.concatenate((y_train, y_val))
        huber = HuberRegressor(fit_intercept=True, **tuned, max_iter=max_iter)
        huber.fit(X_train_val, y_train_val)
        if X_test.shape[0] > 0:
            oos_forecast = huber.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = huber.predict(X_train_val)
        return huber, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL GLM_H final: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define _tune_tree_model helper
def _tune_tree_model(ModelClass, X_train, y_train, X_val, y_val, model_params):
    """Grid-search a tree-based model by validation MSE.

    Args:
        ModelClass: Estimator class to instantiate for each grid point.
        X_train, y_train: Sample used to fit each candidate.
        X_val, y_val: Sample used to score each candidate.
        model_params: Dict with a 'param_grid' entry; all remaining entries
            are passed to every candidate as fixed constructor arguments.

    Returns:
        The winning grid point (only the tuned keys), or ``None`` when every
        candidate failed (a message is printed in that case).
    """
    grid_points = list(ParameterGrid(model_params['param_grid']))
    fixed_args = {key: value for key, value in model_params.items() if key != 'param_grid'}

    lowest_mse = np.inf
    winner = None
    for grid_point in grid_points:
        try:
            # Grid values override fixed arguments that share a name.
            constructor_args = {**fixed_args, **grid_point}
            trial = ModelClass(**constructor_args)
            trial.fit(X_train, y_train)
            val_forecast = trial.predict(X_val)
            if not np.all(np.isfinite(val_forecast)):
                continue
            mse = mean_squared_error(y_val, val_forecast)
            if not np.isnan(mse) and mse < lowest_mse:
                lowest_mse = mse
                winner = grid_point
        except Exception:
            # A failing grid point is simply skipped.
            continue
    if winner is None:
        print(f"    FEIL: Tuning mislyktes for {ModelClass.__name__}.")
    return winner

# Define RF
def train_evaluate_rf(X_train, y_train, X_val, y_val, X_test, model_params):
    """Tune a random forest on the validation set, then refit on train+validation.

    ``model_params['param_grid']`` is searched by validation MSE; all other
    entries of ``model_params`` are fixed constructor arguments.

    Returns:
        ``(model, preds_oos, preds_is, best_grid_params)``, or
        ``(None, empty array, empty array, {})`` after printing the error.
    """
    try:
        winning_grid = _tune_tree_model(RandomForestRegressor, X_train, y_train, X_val, y_val, model_params)
        if winning_grid is None:
            raise ValueError("RF tuning failed.")
        tuned = winning_grid.copy()

        fixed_args = {k: v for k, v in model_params.items() if k != 'param_grid'}
        constructor_args = {**fixed_args, **tuned}
        X_train_val = np.vstack((X_train, X_val))
        y_train_val = np.concatenate((y_train, y_val))
        forest = RandomForestRegressor(**constructor_args)
        forest.fit(X_train_val, y_train_val)

        if X_test.shape[0] > 0:
            oos_forecast = forest.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = forest.predict(X_train_val)
        return forest, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL RF: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define GBRT_H
def train_evaluate_gbrt_h(X_train, y_train, X_val, y_val, X_test, model_params):
    """Tune gradient-boosted trees with Huber loss, then refit on train+validation.

    A copy of ``model_params`` with ``loss='huber'`` is used throughout;
    its 'param_grid' is searched by validation MSE and the remaining entries
    are fixed constructor arguments.

    Returns:
        ``(model, preds_oos, preds_is, best_grid_params)``, or
        ``(None, empty array, empty array, {})`` after printing the error.
    """
    try:
        huber_params = model_params.copy()
        huber_params['loss'] = 'huber'
        winning_grid = _tune_tree_model(GradientBoostingRegressor, X_train, y_train, X_val, y_val, huber_params)
        if winning_grid is None:
            raise ValueError("GBRT tuning failed.")
        tuned = winning_grid.copy()

        fixed_args = {k: v for k, v in huber_params.items() if k != 'param_grid'}
        constructor_args = {**fixed_args, **tuned}
        X_train_val = np.vstack((X_train, X_val))
        y_train_val = np.concatenate((y_train, y_val))
        booster = GradientBoostingRegressor(**constructor_args)
        booster.fit(X_train_val, y_train_val)

        if X_test.shape[0] > 0:
            oos_forecast = booster.predict(X_test)
        else:
            oos_forecast = np.array([])
        in_sample_fit = booster.predict(X_train_val)
        return booster, oos_forecast, in_sample_fit, tuned
    except Exception as e:
        print(f"    FEIL GBRT: {e}")
        traceback.print_exc(limit=1)
        return None, np.array([]), np.array([]), {}

# Define NN functions (if TF available)
if TENSORFLOW_AVAILABLE:
    def build_nn_model(input_shape, nn_config, lambda1):
        """Assemble an (uncompiled) feed-forward Keras network.

        Args:
            input_shape: Number of input features.
            nn_config: Dict with 'name' (model name) and 'hidden_units'
                (iterable of layer widths, one ReLU Dense layer each).
            lambda1: L1 penalty applied to the kernel of every hidden layer.

        Returns:
            A ``keras.Sequential`` model ending in a single linear output unit.
        """
        network = keras.Sequential(name=nn_config['name'])
        network.add(layers.Input(shape=(input_shape,)))
        for width in nn_config['hidden_units']:
            hidden_layer = layers.Dense(
                width,
                activation='relu',
                kernel_regularizer=regularizers.l1(lambda1),
            )
            network.add(hidden_layer)
        output_layer = layers.Dense(1, activation='linear')
        network.add(output_layer)
        return network

    def train_evaluate_nn(X_train, y_train, X_val, y_val, X_test, model_params, nn_specific_config):
        """Tune and train an ensemble of feed-forward networks.

        For every grid point in ``model_params['NN_SHARED']['param_grid']``
        (keys 'lambda1' and 'learning_rate') an ensemble of ``ensemble_size``
        networks is trained with early stopping on the validation set; the
        grid point whose averaged validation forecast has the lowest MSE wins.
        A final ensemble is then trained on train+validation for the full
        number of epochs (no early stopping) and its forecasts are averaged.

        Args:
            X_train, y_train: Training sample.
            X_val, y_val: Validation sample (NumPy arrays; ``y_val`` is
                indexed with a boolean mask).
            X_test: Out-of-sample features; may contain zero rows.
            model_params: Dict holding the 'NN_SHARED' settings ('param_grid',
                'epochs', 'batch_size', 'patience', 'ensemble_size',
                'random_seed_base').
            nn_specific_config: Architecture dict passed to ``build_nn_model``
                (needs 'name' and 'hidden_units').

        Returns:
            ``(first ensemble member, preds_oos, preds_is, best_params)``, or
            ``(None, empty array, empty array, {})`` when tuning or the final
            training fails (a message is printed).
        """
        model_name = nn_specific_config['name']
        input_shape = X_train.shape[1]
        shared_params = model_params['NN_SHARED']
        param_grid = list(ParameterGrid(shared_params['param_grid']))
        epochs = shared_params['epochs']
        batch_size = shared_params['batch_size']
        patience = shared_params['patience']
        ensemble_size = shared_params['ensemble_size']
        base_seed = shared_params['random_seed_base']

        # --- Tuning loop ---
        best_val_mse = np.inf
        optim_found_params = None
        for params in param_grid:
            lambda1 = params['lambda1']
            learning_rate = params['learning_rate']
            val_preds_ensemble = []
            try:
                for i in range(ensemble_size):
                    # Fresh graph and identical seeds per member index so grid points are comparable.
                    K.clear_session()
                    tf.random.set_seed(base_seed + i)
                    np.random.seed(base_seed + i)
                    random.seed(base_seed + i)
                    nn_model = build_nn_model(input_shape, nn_specific_config, lambda1)
                    nn_model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
                    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True, verbose=0)
                    history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=[early_stopping, callbacks.TerminateOnNaN()], verbose=0)

                    # Check presence and non-emptiness BEFORE inspecting the
                    # values; indexing first would raise KeyError and make the
                    # membership check unreachable.
                    val_losses = history.history.get('val_loss')
                    if not val_losses or np.isnan(val_losses).any():
                        val_preds_ensemble = []
                        break
                    val_preds_member = nn_model.predict(X_val, batch_size=batch_size).flatten()
                    if not np.all(np.isfinite(val_preds_member)):
                        # One bad member invalidates the whole grid point.
                        val_preds_ensemble = []
                        break
                    val_preds_ensemble.append(val_preds_member)
                if not val_preds_ensemble:
                    continue
                avg_val_preds = np.mean(np.array(val_preds_ensemble), axis=0)
                finite_mask = np.isfinite(avg_val_preds) & np.isfinite(y_val)
                if np.sum(finite_mask) == 0:
                    continue
                val_mse = mean_squared_error(y_val[finite_mask], avg_val_preds[finite_mask])
                if not np.isnan(val_mse) and val_mse < best_val_mse:
                    best_val_mse = val_mse
                    optim_found_params = params
            except Exception as e:
                print(f"    FEIL NN tuning {params}: {e}")
                continue

        # --- Check tuning success ---
        if optim_found_params is None:
            print(f"    FEIL: NN tuning mislyktes for {model_name}.")
            return None, np.array([]), np.array([]), {}
        optimal_params = optim_found_params.copy()
        opt_lambda1 = optimal_params['lambda1']
        opt_lr = optimal_params['learning_rate']

        # --- Final ensemble training ---
        final_model = None
        test_preds_ensemble = []
        is_preds_ensemble = []
        try:
            X_train_val = np.vstack((X_train, X_val))
            y_train_val = np.concatenate((y_train, y_val))
            for i in range(ensemble_size):
                K.clear_session()
                # Offset by ensemble_size so final seeds differ from the tuning seeds.
                final_seed = base_seed + i + ensemble_size
                tf.random.set_seed(final_seed)
                np.random.seed(final_seed)
                random.seed(final_seed)
                nn_model_final = build_nn_model(input_shape, nn_specific_config, opt_lambda1)
                nn_model_final.compile(optimizer=Adam(learning_rate=opt_lr), loss='mse')
                history_final = nn_model_final.fit(X_train_val, y_train_val, epochs=epochs, batch_size=batch_size, callbacks=[callbacks.TerminateOnNaN()], verbose=0)
                if np.isnan(history_final.history['loss']).any():
                    test_preds_ensemble = []
                    is_preds_ensemble = []
                    break
                if X_test.shape[0] > 0:
                    preds_t = nn_model_final.predict(X_test, batch_size=batch_size).flatten()
                    if np.all(np.isfinite(preds_t)):
                        test_preds_ensemble.append(preds_t)
                    else:
                        print(f"   Advarsel: Ikke-finite OOS pred NN")
                        test_preds_ensemble = []
                        break
                preds_i = nn_model_final.predict(X_train_val, batch_size=batch_size).flatten()
                if np.all(np.isfinite(preds_i)):
                    is_preds_ensemble.append(preds_i)
                else:
                    print(f"   Advarsel: Ikke-finite IS pred NN")
                    is_preds_ensemble = []
                    break
                # Only the first member is returned as the representative model.
                if i == 0:
                    final_model = nn_model_final

            # --- Aggregate predictions ---
            if (X_test.shape[0] > 0 and not test_preds_ensemble) or not is_preds_ensemble:
                raise ValueError(f"{model_name} final ensemble failed.")
            preds_oos_final = np.mean(np.array(test_preds_ensemble), axis=0) if X_test.shape[0] > 0 else np.array([])
            preds_is_final = np.mean(np.array(is_preds_ensemble), axis=0)
            return final_model, preds_oos_final, preds_is_final, optimal_params
        except Exception as e:
            print(f"    FEIL NN final: {e}")
            traceback.print_exc(limit=1)
            return None, np.array([]), np.array([]), {}
# ==========================================================================


# ==========================================================================
# --- MAIN EXECUTION SCRIPT ---
# ==========================================================================
if __name__ == "__main__":
    overall_start_time = datetime.datetime.now()
    print(f"--- Starter ML Asset Pricing Pipeline ---")
    print(f"--- Starttidspunkt: {overall_start_time:%Y-%m-%d %H:%M:%S} ---")
    print(f"--- Output lagres i: {config.OUTPUT_DIR} ---")

    # === 1: Load Preprocessed Data ===
    df_loaded = utils.load_prepare_data(
        config.DATA_FILE, config.COLUMN_CONFIG,
        config.TARGET_VARIABLE, config.NEXT_RETURN_VARIABLE, config.MARKET_CAP_ORIG_VARIABLE
    )
    if df_loaded is None: exit("Avslutter: Lasting av forhåndsbehandlet data feilet.")

    # === 2: Define Features (from preprocessed data) ===
    base_exclude_list = [
        config.TARGET_VARIABLE, config.NEXT_RETURN_VARIABLE, config.MARKET_CAP_ORIG_VARIABLE,
        'Instrument', 'Date', 'id', 'date', # Include potential original names
        'MonthlyReturn_t', 'AdjustedReturn_t', 'MonthlyRiskFreeRate_t', # Exclude intermediate calcs if present
        # Exclude original cols IF their logged versions were successfully created and are found
        # This logic is now handled more robustly inside define_features
    ]
    # Add specific raw names to exclude if their log versions are intended features
    # Check preprocess_data.py output for final log names (e.g., log_marketcap vs log_MarketCap)
    if utils.find_col(df_loaded, ['log_MarketCap', 'log_marketcap']): base_exclude_list.extend(utils.find_col(df_loaded, [n]) for n in ['MarketCap', 'CommonSharesOutstanding', 'ClosePrice'] if utils.find_col(df_loaded, [n]))
    if utils.find_col(df_loaded, ['log_BM', 'log_bm']): base_exclude_list.append(utils.find_col(df_loaded, ['BM', 'bm']))
    if utils.find_col(df_loaded, ['log_ClosePrice', 'log_closeprice']): base_exclude_list.append(utils.find_col(df_loaded, ['ClosePrice', 'closeprice']))
    if utils.find_col(df_loaded, ['log_Volume', 'log_volume']): base_exclude_list.append(utils.find_col(df_loaded, ['Volume', 'volume']))
    if utils.find_col(df_loaded, ['log_CommonSharesOutstanding', 'log_commonsharesoutstanding']): base_exclude_list.append(utils.find_col(df_loaded, ['CommonSharesOutstanding', 'commonsharesoutstanding']))
    if utils.find_col(df_loaded, ['TermSpread', 'termspread']): base_exclude_list.extend(utils.find_col(df_loaded, [n]) for n in ['NorgesBank10Y','NIBOR3M'] if utils.find_col(df_loaded, [n]))

    # Remove None values potentially added by find_col if column wasn't found
    base_exclude_list = [col for col in base_exclude_list if col is not None]

    all_numeric_features_init, ols3_subset_features_init, _ = utils.define_features(
        df_loaded, config.OLS3_FEATURE_NAMES, base_exclude_list
    )
    if not all_numeric_features_init: exit("Avslutter: Ingen features definert etter lasting.")

    # === 3: Rank Standardize Features ===
    # Abort when the helper signals failure by returning None.
    df_std = utils.rank_standardize_features(df_loaded, all_numeric_features_init)
    if df_std is None:
        exit("Avslutter: Standardisering feilet.")

    # === 4: Clean Data (Post-Standardization) ===
    nan_inf_check_cols = all_numeric_features_init        # features to check for NaN/inf
    dropna_cols = config.ESSENTIAL_COLS_FOR_DROPNA        # columns where NaN forces row drop
    positive_value_col = config.MARKET_CAP_ORIG_VARIABLE  # column to check for > 0
    df_clean = utils.clean_data(df_std, nan_inf_check_cols, dropna_cols, positive_value_col)
    if df_clean is None or df_clean.empty:
        exit("Avslutter: Dataframe tom etter rensing.")

    # === Final Feature Definition and Model Assignment ===
    # Re-derive the feature lists on the cleaned frame, reusing the same exclusion list.
    final_feature_sets = utils.define_features(df_clean, config.OLS3_FEATURE_NAMES, base_exclude_list)
    all_numeric_features, ols3_subset_features, _ = final_feature_sets
    if not all_numeric_features:
        exit("FEIL: Ingen numeriske features igjen etter rensing.")

    # OLS3H stays enabled only when every configured OLS3 feature was found.
    ols3_required_count = len(config.OLS3_FEATURE_NAMES)
    ols3_complete = bool(ols3_subset_features) and len(ols3_subset_features) >= ols3_required_count
    ols3_requested = config.RUN_MODELS.get('OLS3H', False)
    if ols3_complete:
        if ols3_requested:
            print(f"\nINFO: Alle OLS3 features funnet ({ols3_subset_features}). OLS3H er aktiv.")
    else:
        if ols3_requested:
            print(f"\nADVARSEL: Ikke alle {ols3_required_count} OLS3 features funnet ({ols3_subset_features}). OLS3H deaktiveres.")
        config.RUN_MODELS['OLS3H'] = False

    # Resolve each model's configured feature-set key to a concrete column list.
    feature_map = {}
    ols3_enabled = config.RUN_MODELS.get('OLS3H', False)
    for model, fset_key in config.MODEL_FEATURE_MAP.items():
        if fset_key == 'ols3_features':
            feature_map[model] = ols3_subset_features if ols3_enabled else []
            continue
        if fset_key != 'all_numeric':
            # Unknown keys fall back to the full numeric feature list.
            print(f"Advarsel: Ukjent feature set key '{fset_key}' for {model}.")
        feature_map[model] = all_numeric_features

    # === Initialize Results Storage ===
    # all_metrics[subset][model][metric] -> list with one entry per window
    all_metrics = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # all_vi[subset][model] -> list of per-window variable-importance frames
    all_vi = defaultdict(lambda: defaultdict(list))
    # all_vi_avg[subset][model] -> final variable-importance frame
    all_vi_avg = defaultdict(dict)
    # all_portfolios[subset] -> dict of portfolio tables
    all_portfolios = defaultdict(dict)
    all_summaries = {}

    # --- Define Actual Column Names (using config now) ---
    date_col_actual_name = 'Date'
    id_col_actual_name = 'Instrument'
    target_col_actual_name = config.TARGET_VARIABLE
    next_ret_col_actual_name = config.NEXT_RETURN_VARIABLE
    mcap_orig_col_actual_name = config.MARKET_CAP_ORIG_VARIABLE

    # Stop early when any column the loops below rely on is missing.
    essential_check_list = [
        date_col_actual_name,
        id_col_actual_name,
        target_col_actual_name,
        next_ret_col_actual_name,
        mcap_orig_col_actual_name,
    ]
    missing_essentials = []
    for required_col in essential_check_list:
        if required_col not in df_clean.columns:
            missing_essentials.append(required_col)
    if missing_essentials:
        exit(f"FEIL: Nødvendige kolonner mangler i df_clean: {missing_essentials}.")

    # === 5: Outer Loop: Subsets ===
    print(f"\n--- Starter Subset Loop: {config.SUBSETS_TO_RUN} ---")
    for subset in config.SUBSETS_TO_RUN:
        subset_start_time = datetime.datetime.now()
        print(f"\n{'='*30} Starter Subset: {subset.upper()} {'='*30}")

        # --- Create Subset Data ---
        df_subset = pd.DataFrame()
        df_mc_temp = df_clean.dropna(subset=[date_col_actual_name, mcap_orig_col_actual_name]).copy()
        if df_mc_temp.empty: print(f"FEIL: Ingen data for subsetting i {subset}."); continue
        if subset == 'all': df_subset = df_clean.copy(); print("  Bruker fullt datasett.")
        else:
            print(f"  Definerer subset basert på {mcap_orig_col_actual_name} persentiler...")
            df_mc_temp['MonthYear'] = pd.to_datetime(df_mc_temp[date_col_actual_name]).dt.to_period('M')
            if subset == 'big':
                cutoff_quantile = 1.0 - (config.BIG_FIRM_TOP_PERCENT / 100.0)
                size_cutoffs = df_mc_temp.groupby('MonthYear')[mcap_orig_col_actual_name].quantile(cutoff_quantile)
                print(f"  -> Topp {config.BIG_FIRM_TOP_PERCENT}% (Quantile: {cutoff_quantile:.2f})")
                df_mc_temp = df_mc_temp.join(size_cutoffs.rename('cutoff'), on='MonthYear')
                df_subset = df_mc_temp[df_mc_temp[mcap_orig_col_actual_name] >= df_mc_temp['cutoff']].copy()
            elif subset == 'small':
                cutoff_quantile = config.SMALL_FIRM_BOTTOM_PERCENT / 100.0
                size_cutoffs = df_mc_temp.groupby('MonthYear')[mcap_orig_col_actual_name].quantile(cutoff_quantile)
                print(f"  -> Bunn {config.SMALL_FIRM_BOTTOM_PERCENT}% (Quantile: {cutoff_quantile:.2f})")
                df_mc_temp = df_mc_temp.join(size_cutoffs.rename('cutoff'), on='MonthYear')
                df_subset = df_mc_temp[df_mc_temp[mcap_orig_col_actual_name] <= df_mc_temp['cutoff']].copy()
            else: print(f"FEIL: Ukjent subset '{subset}'."); continue
            df_subset = df_subset.drop(columns=['MonthYear', 'cutoff'], errors='ignore')
        if df_subset.empty: print(f"FEIL: Subset '{subset}' er tomt."); continue
        df_subset = df_subset.sort_values(by=[date_col_actual_name, id_col_actual_name]).reset_index(drop=True)
        print(f"  Subset '{subset}' klar. Form: {df_subset.shape}")

        # === 6: Inner Loop: Rolling Windows ===
        try: splits = list(utils.get_yearly_rolling_splits(df_subset, config.INITIAL_TRAIN_YEARS, config.VALIDATION_YEARS, config.TEST_YEARS_PER_WINDOW))
        except ValueError as e: print(f"FEIL split gen for '{subset}': {e}"); continue
        except Exception as e_gen: print(f"Uventet FEIL split gen for '{subset}': {e_gen}"); traceback.print_exc(); continue
        if not splits: print(f"Ingen vinduer for '{subset}'."); continue
        num_windows = len(splits)
        print(f"\n--- Starter Rullerende Vindu Loop for Subset: {subset} ({num_windows} vinduer) ---")
        window_preds_list = []; last_train_idx, last_val_idx = None, None; last_models_fit = {}

        for window_idx, (train_idx, val_idx, test_idx, train_dates, val_dates, test_dates) in enumerate(splits):
            window_num = window_idx + 1; window_start_time = time.time()
            print(f"\n-- Vindu {window_num}/{num_windows} ({subset}) --")
            # --- Use CORRECTED check for date ranges ---
            if train_dates is not None: print(f"  Train: {train_dates['min'].date()} -> {train_dates['max'].date()} ({len(train_idx)} obs)")
            if val_dates is not None:   print(f"  Val:   {val_dates['min'].date()} -> {val_dates['max'].date()} ({len(val_idx)} obs)")
            if test_dates is not None:  print(f"  Test:  {test_dates['min'].date()} -> {test_dates['max'].date()} ({len(test_idx)} obs)")
            # --- End corrected check ---
            if test_idx.empty or train_idx.empty: print("  Advarsel: Tomt train/test sett."); continue
            needs_val_set = lambda name: name not in ['OLS', 'OLS3H', 'ENET']
            if val_idx.empty and any(config.RUN_MODELS[m] and needs_val_set(m) for m in config.RUN_MODELS): print("  Advarsel: Tomt val set, men trengs."); continue

            y_train = df_subset.loc[train_idx, target_col_actual_name].values
            y_val = df_subset.loc[val_idx, target_col_actual_name].values if not val_idx.empty else np.array([])
            y_test = df_subset.loc[test_idx, target_col_actual_name].values
            y_train_val = np.concatenate((y_train, y_val)) if not val_idx.empty else y_train

            window_results = {date_col_actual_name: df_subset.loc[test_idx, date_col_actual_name].values, id_col_actual_name: df_subset.loc[test_idx, id_col_actual_name].values, target_col_actual_name: y_test}
            nan_preds = np.full(len(test_idx), np.nan)
            for model_name_init, run_flag in config.RUN_MODELS.items():
                if run_flag: window_results[f'yhat_{model_name_init.lower()}'] = nan_preds.copy()
            window_models_fitted_this_run = {}

            # === 7: Innermost Loop: Models ===
            # For every enabled model: select its feature columns, build the design
            # matrices for this window, dispatch to its train_evaluate_* function,
            # then record metrics, predictions and (optionally) per-window
            # variable importance. Exactly one metrics entry is recorded per
            # model and window once training has been attempted.
            for model_name, do_run in config.RUN_MODELS.items():
                if not do_run:
                    continue
                if model_name == 'OLS3H' and (not STATSMODELS_AVAILABLE or not config.RUN_MODELS['OLS3H']):
                    continue
                if model_name.startswith('NN') and not TENSORFLOW_AVAILABLE:
                    continue
                print(f"  -> Trener/Evaluerer: {model_name}...")
                model_start_time = time.time()
                fitted_model, preds_oos, preds_is, optimal_hyperparams = None, np.array([]), np.array([]), {}
                y_is_target = np.array([])

                # A model without a feature_map entry is treated as having no
                # features (warning + skip) instead of raising TypeError on None.
                current_features = [f for f in (feature_map.get(model_name) or []) if f in df_subset.columns]
                if not current_features:
                    print(f"    Advarsel: Ingen features for {model_name}.")
                    continue

                X_train = df_subset.loc[train_idx, current_features].values
                X_val = df_subset.loc[val_idx, current_features].values if not val_idx.empty else np.empty((0, len(current_features)))
                X_test = df_subset.loc[test_idx, current_features].values
                X_train_val = np.vstack((X_train, X_val)) if not val_idx.empty else X_train

                # OLS3H requires more training rows than feature columns; the others need two rows.
                min_obs_train = max(2, X_train.shape[1] + 1) if model_name == 'OLS3H' else 2
                if X_train.shape[0] < min_obs_train or np.isnan(y_train).all():
                    print(f"    Advarsel: Utilstrekkelig train data.")
                    continue
                if val_idx.empty and needs_val_set(model_name):
                    print(f"    Advarsel: Tomt val set.")
                    continue
                if not val_idx.empty and (X_val.shape[0] < 2 or np.isnan(y_val).all()) and needs_val_set(model_name):
                    print(f"    Advarsel: Utilstrekkelig val data.")
                    continue

                nan_metric_keys = ('oos_r2', 'is_r2_train_val', 'oos_sharpe')
                # Set once this window's metrics are stored, so a later failure
                # (e.g. in variable importance) does not append a second NaN row.
                metrics_recorded = False
                try:
                    train_func_name = f"train_evaluate_{model_name.lower().replace('-', '').replace('+', '_')}"
                    # locals() only sees names bound in the enclosing function;
                    # fall back to module globals for training functions defined
                    # or imported at module level.
                    train_function = locals().get(train_func_name) or globals().get(train_func_name)
                    if not train_function:
                        print(f"    FEIL: Treningsfunksjon '{train_func_name}' ikke funnet.")
                        continue

                    model_config_params = config.MODEL_PARAMS.get(model_name, {})
                    if model_name in ['OLS', 'OLS3H']:
                        # Fitted on train+val; in-sample target is train+val.
                        fitted_model, preds_oos, preds_is, optimal_hyperparams = train_function(X_train_val, y_train_val, X_test, model_config_params)
                        y_is_target = y_train_val
                    elif model_name == 'ENET':
                        # Receives the training split only; in-sample target is train.
                        fitted_model, preds_oos, preds_is, optimal_hyperparams = train_function(X_train, y_train, X_test, model_config_params)
                        y_is_target = y_train
                    elif model_name.startswith('NN'):
                        # TENSORFLOW_AVAILABLE is guaranteed here by the guard at the top of the loop.
                        nn_specific_config = config.MODEL_PARAMS.get(model_name, {})
                        fitted_model, preds_oos, preds_is, optimal_hyperparams = train_function(X_train, y_train, X_val, y_val, X_test, config.MODEL_PARAMS, nn_specific_config)
                        y_is_target = y_train_val
                    else:
                        fitted_model, preds_oos, preds_is, optimal_hyperparams = train_function(X_train, y_train, X_val, y_val, X_test, model_config_params)
                        y_is_target = y_train_val

                    if preds_oos is not None and preds_is is not None and len(preds_oos) == len(y_test):
                        # Score only on finite predictions, with targets aligned to them.
                        oos_finite_mask = np.isfinite(preds_oos)
                        is_finite_mask = np.isfinite(preds_is)
                        preds_oos_finite = preds_oos[oos_finite_mask]
                        y_test_aligned_oos = y_test[oos_finite_mask]
                        preds_is_finite = preds_is[is_finite_mask]
                        y_is_target_aligned_is = y_is_target[is_finite_mask]

                        r2_oos = utils.calculate_oos_r2(y_test_aligned_oos, preds_oos_finite) if len(y_test_aligned_oos) >= 2 else np.nan
                        r2_is = utils.calculate_oos_r2(y_is_target_aligned_is, preds_is_finite) if len(y_is_target_aligned_is) >= 2 else np.nan
                        sharpe_oos = utils.calculate_sharpe_of_predictions(preds_oos_finite) if len(preds_oos_finite) >= 2 else np.nan

                        model_metrics = all_metrics[subset][model_name]
                        model_metrics['oos_r2'].append(r2_oos)
                        model_metrics['is_r2_train_val'].append(r2_is)
                        model_metrics['oos_sharpe'].append(sharpe_oos)
                        metrics_recorded = True
                        for param_name, param_value in optimal_hyperparams.items():
                            model_metrics[f'optim_{param_name}'].append(param_value)

                        pred_col_name = f'yhat_{model_name.lower()}'
                        window_results[pred_col_name] = preds_oos
                        if fitted_model is not None:
                            window_models_fitted_this_run[model_name] = fitted_model
                        print(f"    {model_name}: OOS R²={r2_oos:.4f}, IS R²={r2_is:.4f}, Sharpe={sharpe_oos:.3f} ({time.time()-model_start_time:.1f}s)")

                        if config.CALCULATE_VI and fitted_model is not None and config.MODEL_VI_STRATEGY.get(model_name) == 'per_window':
                            if pd.notna(r2_is):
                                X_eval_vi = X_train_val if model_name not in ['ENET'] else X_train
                                y_eval_vi = y_is_target
                                if model_name == 'OLS3H':
                                    # Drop rows with non-finite targets. The mask is built and
                                    # applied only here, so other models never read an
                                    # undefined or stale mask.
                                    finite_mask_vi = np.isfinite(y_eval_vi)
                                    if not np.all(finite_mask_vi):
                                        X_eval_vi = X_eval_vi[finite_mask_vi]
                                        y_eval_vi = y_eval_vi[finite_mask_vi]
                                if X_eval_vi.shape[0] > 0 and y_eval_vi.shape[0] > 0:
                                    vi_df = utils.calculate_variable_importance(model_name, fitted_model, X_eval_vi, y_eval_vi, current_features, r2_is, config.VI_METHOD, optimal_hyperparams)
                                    if vi_df is not None and not vi_df.empty:
                                        all_vi[subset][model_name].append(vi_df)
                                else:
                                    print(f"      Advarsel: Ingen data for VI for {model_name}.")
                            else:
                                print(f"    Advarsel: Hopper VI for {model_name} pga. NaN IS R2.")
                    else:
                        print(f"    Advarsel: {model_name} ingen/feil pred.")
                        for metric_key in nan_metric_keys:
                            all_metrics[subset][model_name][metric_key].append(np.nan)
                        metrics_recorded = True
                except Exception as e_train:
                    print(f"    !!! KRITISK FEIL {model_name}: {e_train}")
                    traceback.print_exc()
                    if not metrics_recorded:
                        for metric_key in nan_metric_keys:
                            all_metrics[subset][model_name][metric_key].append(np.nan)

            # End Model Loop
            window_preds_list.append(pd.DataFrame(window_results))
            is_last_window = window_idx == num_windows - 1
            if is_last_window:
                # Keep the final window's indices and fitted models for later use.
                last_train_idx = train_idx.copy()
                last_val_idx = None if val_idx.empty else val_idx.copy()
                last_models_fit = window_models_fitted_this_run.copy()
            print(f"-- Vindu {window_num} ({subset}) fullført ({time.time() - window_start_time:.1f}s) --")
        # End Window Loop

        # === 8-10: Post-Window Analysis for the Subset ===
        if not window_preds_list: print(f"\nFEIL: Ingen vindusprediksjoner for '{subset}'."); continue
        print(f"\n--- Analyserer resultater for Subset: {subset} ---"); results_df_subset=pd.concat(window_preds_list).reset_index(drop=True)
        prediction_cols_subset = [c for c in results_df_subset.columns if c.startswith('yhat_')]
        if not prediction_cols_subset: print(f"FEIL: Ingen prediksjonskolonner for '{subset}'."); continue

        # Calculate Overall OOS R2 (Gu Definition)
        print(f"\n--- Overall OOS R² (Gu-stil) for Subset: {subset} ---"); y_true_overall=results_df_subset[target_col_actual_name]; y_true_finite_overall=y_true_overall[np.isfinite(y_true_overall)]; ss_tot_overall=np.sum(y_true_finite_overall**2)
        if len(y_true_finite_overall) > 1 and ss_tot_overall > 1e-15:
            for pred_col in prediction_cols_subset:
                model_name_oos=pred_col.replace('yhat_', '').upper(); y_pred_overall=results_df_subset[pred_col]; mask_overall=np.isfinite(y_true_overall)&np.isfinite(y_pred_overall); y_t_o=y_true_overall[mask_overall]; y_p_o=y_pred_overall[mask_overall]
                if len(y_t_o) >= 2: r2_overall_gu=utils.calculate_oos_r2(y_t_o, y_p_o); all_metrics[subset][model_name_oos]['oos_r2_overall_gu']=r2_overall_gu; print(f"  {model_name_oos}: {r2_overall_gu:.6f}")
                else: print(f"  {model_name_oos}: N/A"); all_metrics[subset][model_name_oos]['oos_r2_overall_gu']=np.nan
        else: print("  Kan ikke beregne overall OOS R2.")

        # Perform Portfolio Analysis
        print(f"\n--- Starter Detaljert Porteføljeanalyse for Subset: {subset} ---")
        portfolio_tables = utils.perform_detailed_portfolio_analysis(
            results_df_subset,
            df_clean,
            prediction_cols_subset,
            mcap_orig_col_actual_name,
            next_ret_col_actual_name,
            config.FILTER_SMALL_CAPS_PORTFOLIO,
            config.ANNUALIZATION_FACTOR,
            config.BENCHMARK_FILE,
            config.FF_FACTOR_FILE,
        )
        decile_tables, hl_risk_tables, long_risk_tables = portfolio_tables
        # Store the three table collections under this subset.
        all_portfolios[subset] = {
            'decile_tables': decile_tables,
            'hl_risk_tables': hl_risk_tables,
            'long_risk_tables': long_risk_tables,
        }

        # Calculate Average/Last Window Variable Importance
        # 'per_window' models: average the per-window importances collected during
        # training and renormalize them to sum to one.
        # 'last_window' models: compute importance once, using the model fitted in
        # the final window and that window's data.
        if config.CALCULATE_VI:
            print(f"\n--- Beregner Variabel Viktighet (VI) for Subset: {subset} ---")
            for model_name, do_run in config.RUN_MODELS.items():
                if not do_run:
                    continue
                if model_name == 'OLS3H' and (not STATSMODELS_AVAILABLE or not config.RUN_MODELS['OLS3H']):
                    continue
                if model_name.startswith('NN') and not TENSORFLOW_AVAILABLE:
                    continue
                vi_strategy = config.MODEL_VI_STRATEGY.get(model_name)
                # A model without a feature_map entry yields an empty list (and is
                # skipped) rather than raising TypeError on None.
                current_features_vi = [f for f in (feature_map.get(model_name) or []) if f in df_subset.columns]
                if not current_features_vi:
                    continue

                if vi_strategy == 'per_window':
                    vi_list_model = all_vi[subset].get(model_name, [])
                    if not vi_list_model:
                        print(f"  Ingen 'per_window' VI data for {model_name}.")
                        continue
                    try:
                        avg_vi_df = pd.concat(vi_list_model).groupby('Feature')['Importance'].mean().reset_index()
                        total_avg_importance = avg_vi_df['Importance'].sum()
                        # Renormalize; a (near-)zero total maps every importance to 0.0.
                        avg_vi_df['Importance'] = avg_vi_df['Importance'] / total_avg_importance if total_avg_importance > 1e-9 else 0.0
                        all_vi_avg[subset][model_name] = avg_vi_df.sort_values('Importance', ascending=False).reset_index(drop=True)
                        print(f"  VI (Avg/Window) beregnet for {model_name}.")
                    except Exception as e_vi_avg:
                        print(f"  FEIL VI avg for {model_name}: {e_vi_avg}")
                elif vi_strategy == 'last_window':
                    print(f"  Beregner 'last_window' VI for {model_name}...")
                    if last_train_idx is None or model_name not in last_models_fit:
                        print(f"    Hoppet over (mangler data/modell).")
                        continue
                    last_model_instance = last_models_fit[model_name]
                    is_r2_history = all_metrics[subset][model_name]['is_r2_train_val']
                    last_is_r2 = is_r2_history[-1] if is_r2_history else np.nan
                    if pd.isna(last_is_r2):
                        print(f"    Advarsel: IS R2 siste vindu NaN.")
                        continue

                    # ENET is evaluated on the training rows only; every other
                    # model on train+val when a validation index exists.
                    if model_name == 'ENET':
                        last_eval_idx = last_train_idx
                    else:
                        last_eval_idx = last_train_idx.union(last_val_idx) if last_val_idx is not None else last_train_idx
                    X_eval_last = df_subset.loc[last_eval_idx, current_features_vi].values
                    y_eval_last = df_subset.loc[last_eval_idx, target_col_actual_name].values

                    # Most recent recorded value of each 'optim_*' hyperparameter series.
                    last_optimal_params = {k.replace('optim_', ''): v[-1] for k, v in all_metrics[subset][model_name].items() if k.startswith('optim_') and v}
                    vi_df_last = utils.calculate_variable_importance(model_name, last_model_instance, X_eval_last, y_eval_last, current_features_vi, last_is_r2, config.VI_METHOD, last_optimal_params)
                    if vi_df_last is not None and not vi_df_last.empty:
                        all_vi_avg[subset][model_name] = vi_df_last.sort_values('Importance', ascending=False).reset_index(drop=True)
                        print(f"    VI ({model_name}, siste vindu) beregnet.")
                    else:
                        print(f"    VI beregning siste vindu ({model_name}) mislyktes.")

        # Generate Summary Table, Plot Complexity, Plot VI
        subset_metrics = all_metrics[subset]
        all_summaries[subset] = utils.create_summary_table(subset_metrics, config.ANNUALIZATION_FACTOR)
        utils.plot_time_varying_complexity(subset_metrics, config.COMPLEXITY_PARAMS_TO_PLOT)
        if config.CALCULATE_VI and all_vi_avg[subset]:
            print(f"\n--- Plotter Variabel Viktighet for Subset: {subset} ---")
            for model_name, vi_df in all_vi_avg[subset].items():
                # Figure height grows with the number of rows, capped at VI_PLOT_TOP_N, never below 6.
                rows_for_height = min(len(vi_df), config.VI_PLOT_TOP_N)
                plt.figure(figsize=(10, max(6, rows_for_height * 0.3)))
                # Top-N non-negligible features, ordered ascending for the horizontal bars.
                non_negligible = vi_df[vi_df['Importance'] > 1e-6]
                plot_df = non_negligible.head(config.VI_PLOT_TOP_N).sort_values(by='Importance', ascending=True)
                if plot_df.empty:
                    print(f"  Ingen VI data å plotte for {model_name} ({subset}).")
                    plt.close()
                    continue
                plt.barh(plot_df['Feature'], plot_df['Importance'])
                plt.xlabel("Relativ Viktighet")
                plt.title(f"{model_name} VI ({subset} - Top {len(plot_df)})")
                plt.tight_layout()
                plt.show()

        # Save Results
        # Bundle everything produced for this subset and hand it to the saver.
        results_to_save = {
            'summary_metrics': all_summaries[subset],
            'portfolio_deciles': all_portfolios[subset].get('decile_tables', {}),
            'portfolio_hl_risk': all_portfolios[subset].get('hl_risk_tables', {}),
            'portfolio_long_risk': all_portfolios[subset].get('long_risk_tables', {}),
            'variable_importance_avg': all_vi_avg[subset],
        }
        utils.save_results(config.OUTPUT_DIR, subset, results_to_save)

        subset_end_time = datetime.datetime.now()
        print(f"\n{'='*30} Subset Fullført: {subset.upper()} (Tid: {subset_end_time - subset_start_time}) {'='*30}")
    # End Subset Loop

    # === Final Reporting ===
    print("\n\n" + "="*35 + " SLUTTSAMMENDRAG " + "="*35)

    # Collect the overall OOS R² per subset and model, scaled to percent.
    r2_final_data = defaultdict(dict)
    for sub in config.SUBSETS_TO_RUN:
        if sub not in all_metrics:
            continue
        for model, metrics in all_metrics[sub].items():
            r2_final_data[sub][model] = metrics.get('oos_r2_overall_gu', np.nan) * 100

    if not r2_final_data:
        print("\nIngen data for endelig OOS R2-oppsummering.")
    else:
        r2_summary_final = pd.DataFrame.from_dict(r2_final_data, orient='index')
        # Enabled models first, in config order; any remaining columns alphabetically.
        model_order_final = [m for m in config.RUN_MODELS if m in r2_summary_final.columns and config.RUN_MODELS[m]]
        other_models = sorted([m for m in r2_summary_final.columns if m not in model_order_final])
        r2_summary_final = r2_summary_final.reindex(columns=model_order_final + other_models, fill_value=np.nan)
        r2_summary_final.index.name = "Subset"
        r2_summary_final.columns.name = "Model"
        print("\n--- Tabell 1 Stil: Overall Monthly OOS R² (%) [Gu et al. Def] ---")
        print(r2_summary_final.to_string(float_format=lambda x: f"{x:.3f}" if pd.notna(x) else "N/A", na_rep="N/A"))
        utils.save_results(config.OUTPUT_DIR, "consolidated", {"R2_summary_table1_style": r2_summary_final})

    overall_end_time = datetime.datetime.now()
    print(f"\n--- Pipeline Fullført ---")
    print(f"--- Sluttidspunkt: {overall_end_time:%Y-%m-%d %H:%M:%S} ---")
    print(f"--- Total Kjøretid: {overall_end_time - overall_start_time} ---")
    print(f"--- Resultater lagret i: {config.OUTPUT_DIR} ---")