In [1]:
# --- main_runner.py ---
# Main orchestration script for running the ML asset pricing pipeline.
# Imports config and utils, defines model training logic, runs the pipeline loops.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import datetime
import time
import traceback
from collections import defaultdict
import random

# --- Import Configuration & Utilities ---
import config
import pipeline_utils as utils

# --- Import Model Specific Libraries ---
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # Use this for GBRT_H
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, KFold
from sklearn.metrics import mean_squared_error # Added explicitly

try: import statsmodels.api as sm; STATSMODELS_AVAILABLE = True
except ImportError: STATSMODELS_AVAILABLE = False; print("Statsmodels not found, OLS3H will be skipped.")
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, regularizers, callbacks, backend as K
    from tensorflow.keras.optimizers import Adam
    TENSORFLOW_AVAILABLE = True
    os.environ['PYTHONHASHSEED']=str(config.TF_SEED); os.environ['TF_CUDNN_DETERMINISTIC']='1'
    random.seed(config.TF_SEED); np.random.seed(config.TF_SEED); tf.random.set_seed(config.TF_SEED)
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in gpus]; print(f"GPUs found ({len(gpus)}), memory growth enabled.")
        except RuntimeError as e: print(e)
except ImportError: TENSORFLOW_AVAILABLE = False; print("TensorFlow/Keras not found, NN models will be skipped.")

# ==========================================================================
# --- MODEL TRAINING/EVALUATION FUNCTIONS (Per Window) ---
# ==========================================================================

# --- Linear Models ---
def train_evaluate_ols(X_train_val, y_train_val, X_test, model_params):
    try:
        model = LinearRegression(fit_intercept=True).fit(X_train_val, y_train_val)
        preds_oos = model.predict(X_test) if X_test.shape[0] > 0 else np.array([])
        preds_is = model.predict(X_train_val)
        return model, preds_oos, preds_is, {}
    except Exception as e:
        print(f" FEIL OLS: {e}")
        return None, np.array([]), np.array([]), {}

def train_evaluate_ols3h(X_train_val, y_train_val, X_test, model_params):
    if not STATSMODELS_AVAILABLE: return None, np.array([]), np.array([]), {}
    try:
        X_tv_c=sm.add_constant(X_train_val)
        X_te_c=sm.add_constant(X_test) if X_test.shape[0]>0 else None
        fitted=sm.RLM(y_train_val, X_tv_c, M=sm.robust.norms.HuberT()).fit(**model_params)
        preds_oos = fitted.predict(X_te_c) if X_te_c is not None else np.array([])
        preds_is = fitted.predict(X_tv_c)
        return fitted, preds_oos, preds_is, {'M':'HuberT'}
    except Exception as e:
        print(f" FEIL OLS3H: {e}")
        return None, np.array([]), np.array([]), {}

# --- Dimension Reduction ---
def _tune_simple_model(ModelClass, X_train, y_train, X_val, y_val, param_grid_dict):
    best_mse, best_p = np.inf, None
    p_name=list(param_grid_dict.keys())[0]; p_vals=param_grid_dict[p_name]
    max_c=min(p_vals[-1], X_train.shape[0], X_train.shape[1]); grid=[p for p in p_vals if 0<p<=max_c]; grid=grid if grid else [1]
    for p_val in grid:
        try:
            m_val=Pipeline([('pca',PCA(n_components=p_val)),('lr',LinearRegression())]) if ModelClass==Pipeline else ModelClass(**{p_name:p_val,'scale':False})
            m_val.fit(X_train,y_train)
            y_pred=m_val.predict(X_val).flatten();
            if not np.all(np.isfinite(y_pred)): continue
            mse=mean_squared_error(y_val,y_pred);
            if not np.isnan(mse) and mse<best_mse: best_mse=mse; best_p=p_val
        except Exception: continue # Catch errors for specific param combo
    return best_p

def train_evaluate_pls(X_train, y_train, X_val, y_val, X_test, model_params):
    optimal_params = {}
    try:
        # *** CORRECTED BLOCK ***
        best_n = _tune_simple_model(PLSRegression, X_train, y_train, X_val, y_val, {'n_components': model_params['n_components_grid']})
        if best_n is None:
            raise ValueError("PLS tuning failed")
        optimal_params = {'n_components': best_n}
        model = PLSRegression(n_components=best_n, scale=False)
        X_tv = np.vstack((X_train, X_val))
        y_tv = np.concatenate((y_train, y_val))
        model.fit(X_tv, y_tv)
        preds_oos = model.predict(X_test).flatten() if X_test.shape[0] > 0 else np.array([])
        preds_is = model.predict(X_tv).flatten()
        return model, preds_oos, preds_is, optimal_params
        # *** END CORRECTION ***
    except Exception as e:
        print(f" FEIL PLS: {e}")
        return None, np.array([]), np.array([]), {}

def train_evaluate_pcr(X_train, y_train, X_val, y_val, X_test, model_params):
    optimal_params = {}
    try:
        # *** CORRECTED BLOCK ***
        best_n = _tune_simple_model(Pipeline, X_train, y_train, X_val, y_val, {'n_components': model_params['n_components_grid']})
        if best_n is None:
            raise ValueError("PCR tuning failed")
        optimal_params = {'n_components': best_n}
        model = Pipeline([('pca', PCA(n_components=best_n)), ('lr', LinearRegression())])
        X_tv = np.vstack((X_train, X_val))
        y_tv = np.concatenate((y_train, y_val))
        model.fit(X_tv, y_tv)
        preds_oos = model.predict(X_test) if X_test.shape[0] > 0 else np.array([])
        preds_is = model.predict(X_tv) # Predict on Train+Val
        return model, preds_oos, preds_is, optimal_params
        # *** END CORRECTION ***
    except Exception as e:
        print(f" FEIL PCR: {e}")
        return None, np.array([]), np.array([]), {}

# --- Penalized Linear ---
def train_evaluate_enet(X_train, y_train, X_test, model_params):
    optimal_params = {}
    try:
        cv_strategy = KFold(n_splits=model_params['cv_folds'], shuffle=True, random_state=config.GENERAL_SEED)
        model = ElasticNetCV(alphas=model_params['alphas'], l1_ratio=model_params['l1_ratio'],
                             fit_intercept=True, cv=cv_strategy, n_jobs=model_params.get('n_jobs',-1),
                             max_iter=model_params.get('max_iter',1000), tol=model_params.get('tol',0.001),
                             random_state=config.GENERAL_SEED)
        model.fit(X_train, y_train)
        optimal_params = {'alpha': model.alpha_, 'l1_ratio': model.l1_ratio_}
        preds_oos = model.predict(X_test) if X_test.shape[0] > 0 else np.array([])
        preds_is = model.predict(X_train) # IS Prediction only on Training data for CV models
        return model, preds_oos, preds_is, optimal_params
    except Exception as e:
        print(f" FEIL ENET: {e}")
        return None, np.array([]), np.array([]), {}

def train_evaluate_glm_h(X_train, y_train, X_val, y_val, X_test, model_params):
    optimal_params = {}; best_mse = np.inf; optim_found = None
    grid = list(ParameterGrid(model_params['param_grid'])); max_iter=model_params.get('max_iter',300)
    for params in grid: # Tune
        try:
            m_v=HuberRegressor(fit_intercept=True,**params,max_iter=max_iter).fit(X_train,y_train)
            y_pred_v=m_v.predict(X_val)
            if not np.all(np.isfinite(y_pred_v)): continue
            mse=mean_squared_error(y_val,y_pred_v)
            if not np.isnan(mse) and mse<best_mse: best_mse=mse; optim_found=params
        except Exception: continue
    if optim_found is None: print(" FEIL GLM_H Tuning"); return None,np.array([]),np.array([]),{}
    optimal_params=optim_found.copy()
    try: # Final fit
        # *** CORRECTED BLOCK ***
        X_tv=np.vstack((X_train,X_val)); y_tv=np.concatenate((y_train,y_val))
        model=HuberRegressor(fit_intercept=True,**optimal_params,max_iter=max_iter).fit(X_tv,y_tv)
        preds_oos=model.predict(X_test) if X_test.shape[0]>0 else np.array([])
        preds_is=model.predict(X_tv) # Predict on Train+Val
        return model,preds_oos,preds_is,optimal_params
        # *** END CORRECTION ***
    except Exception as e:
        print(f" FEIL GLM_H Final: {e}")
        return None, np.array([]), np.array([]), {}

# --- Tree Models ---
def _tune_tree_model(ModelClass, X_train, y_train, X_val, y_val, model_params):
    best_mse, best_params = np.inf, None; param_grid=list(ParameterGrid(model_params['param_grid'])); base_params={k:v for k,v in model_params.items() if k!='param_grid'}
    for params in param_grid:
        try:
            current_params={**base_params, **params}
            # Handle n_jobs for RandomForest specifically
            if isinstance(ModelClass, RandomForestRegressor) and 'n_jobs' not in current_params: current_params['n_jobs'] = -1
            model_val=ModelClass(**current_params).fit(X_train, y_train)
            y_pred_val=model_val.predict(X_val)
            if not np.all(np.isfinite(y_pred_val)): continue
            mse=mean_squared_error(y_val,y_pred_val)
            if not np.isnan(mse) and mse<best_mse: best_mse=mse; best_params=params
        except Exception: continue
    return best_params

def train_evaluate_rf(X_train, y_train, X_val, y_val, X_test, model_params):
    optimal_params = {}
    try:
        # *** CORRECTED BLOCK ***
        best_p = _tune_tree_model(RandomForestRegressor, X_train, y_train, X_val, y_val, model_params)
        if best_p is None:
            raise ValueError("RF tuning failed")
        optimal_params = best_p.copy()
        final_params = {**{k:v for k,v in model_params.items() if k!='param_grid'}, **optimal_params}
        # Ensure n_jobs and random_state are correctly passed if not in tuned params
        if 'n_jobs' not in final_params: final_params['n_jobs'] = model_params.get('n_jobs', -1)
        if 'random_state' not in final_params: final_params['random_state'] = model_params.get('random_state', 42)

        X_tv=np.vstack((X_train,X_val)); y_tv=np.concatenate((y_train,y_val))
        model=RandomForestRegressor(**final_params).fit(X_tv, y_tv)
        preds_oos = model.predict(X_test) if X_test.shape[0]>0 else np.array([])
        preds_is = model.predict(X_tv) # Predict on Train+Val
        return model, preds_oos, preds_is, optimal_params
        # *** END CORRECTION ***
    except Exception as e:
        print(f" FEIL RF: {e}")
        return None, np.array([]), np.array([]), {}

def train_evaluate_gbrt_h(X_train, y_train, X_val, y_val, X_test, model_params):
    optimal_params = {}
    try:
        # *** CORRECTED BLOCK ***
        best_p = _tune_tree_model(GradientBoostingRegressor, X_train, y_train, X_val, y_val, model_params)
        if best_p is None:
            raise ValueError("GBRT tuning failed")
        optimal_params = best_p.copy()
        final_params = {**{k:v for k,v in model_params.items() if k!='param_grid'}, **optimal_params}
        final_params['loss']='huber' # Ensure Huber loss is set
        if 'random_state' not in final_params: final_params['random_state'] = model_params.get('random_state', 42)

        X_tv=np.vstack((X_train,X_val)); y_tv=np.concatenate((y_train,y_val))
        model=GradientBoostingRegressor(**final_params).fit(X_tv, y_tv)
        preds_oos = model.predict(X_test) if X_test.shape[0]>0 else np.array([])
        preds_is = model.predict(X_tv) # Predict on Train+Val
        return model, preds_oos, preds_is, optimal_params
        # *** END CORRECTION ***
    except Exception as e:
        print(f" FEIL GBRT_H: {e}")
        return None, np.array([]), np.array([]), {}

# --- Neural Networks ---
if TENSORFLOW_AVAILABLE:
    def build_nn_model(input_shape, nn_config, lambda1):
        model=keras.Sequential(name=nn_config['name']); model.add(layers.Input(shape=(input_shape,)))
        for u in nn_config['hidden_units']: model.add(layers.Dense(u,activation='relu',kernel_regularizer=regularizers.l1(lambda1)))
        model.add(layers.Dense(1,activation='linear')); return model

    def train_evaluate_nn(X_train, y_train, X_val, y_val, X_test, model_params, nn_specific_config):
        opt_p={}; best_mse=np.inf; optim_found=None; shape=X_train.shape[1]; shared=model_params['NN_SHARED']; grid=list(ParameterGrid(shared['param_grid'])); epochs=shared['epochs']; batch=shared['batch_size']; patience=shared['patience']; ens=shared['ensemble_size']; seed=shared['random_seed_base']; name=nn_specific_config['name']; cb_stop=callbacks.EarlyStopping(monitor='val_loss',patience=patience,restore_best_weights=True,verbose=0); cb_nan=callbacks.TerminateOnNaN()
        for params in grid: # Tune
            l1=params['lambda1']; lr=params['learning_rate']; val_preds_ens=[];
            try:
                for i in range(ens): K.clear_session(); tf.random.set_seed(seed+i); m=build_nn_model(shape,nn_specific_config,l1); m.compile(optimizer=Adam(learning_rate=lr),loss='mse'); hist=m.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=epochs,batch_size=batch,callbacks=[cb_stop,cb_nan],verbose=0);
                if not np.isnan(hist.history['val_loss']).any(): val_preds_ens.append(m.predict(X_val,batch_size=batch).flatten())
                else: val_preds_ens=[]; break
                if not val_preds_ens: continue; avg_v_preds=np.mean(np.array(val_preds_ens),axis=0); mse=mean_squared_error(y_val[np.isfinite(avg_v_preds)],avg_v_preds[np.isfinite(avg_v_preds)]);
                if not np.isnan(mse) and mse<best_mse: best_mse=mse; optim_found=params
            except Exception: continue
        if optim_found is None: print(f" FEIL {name} Tuning"); return None,np.array([]),np.array([]),{}
        opt_p=optim_found.copy(); opt_l1=opt_p['lambda1']; opt_lr=opt_p['learning_rate']
        # Final Train
        final_m=None; test_preds_ens, is_preds_ens=[],[]; X_tv=np.vstack((X_train,X_val)); y_tv=np.concatenate((y_train,y_val))
        try:
            # *** CORRECTED BLOCK ***
            for i in range(ens):
                K.clear_session(); tf.random.set_seed(seed+i+ens)
                m=build_nn_model(shape,nn_specific_config,opt_l1)
                m.compile(optimizer=Adam(learning_rate=opt_lr),loss='mse') # Use learning_rate here
                hist_f=m.fit(X_tv,y_tv,epochs=epochs,batch_size=batch,callbacks=[cb_nan],verbose=0);
                if not np.isnan(hist_f.history['loss']).any():
                    if X_test.shape[0]>0: test_preds_ens.append(m.predict(X_test,batch_size=batch).flatten())
                    is_preds_ens.append(m.predict(X_tv,batch_size=batch).flatten()); # IS on Train+Val
                    if i==0: final_m=m # Store one instance
                else: test_preds_ens=[]; is_preds_ens=[]; break # Fail fast

            if X_test.shape[0]>0 and not test_preds_ens: raise ValueError(f"{name}: OOS pred failed")
            if not is_preds_ens: raise ValueError(f"{name}: IS pred failed")

            p_oos=np.mean(np.array(test_preds_ens),axis=0) if X_test.shape[0]>0 else np.array([])
            p_is=np.mean(np.array(is_preds_ens),axis=0);
            return final_m,p_oos,p_is,opt_p
            # *** END CORRECTION ***
        except Exception as e:
             print(f" FEIL {name} Final: {e}");
             return None, np.array([]), np.array([]), {}

# ==========================================================================
# --- MAIN EXECUTION SCRIPT ---
# ==========================================================================
if __name__ == "__main__":
    overall_start_time = datetime.datetime.now()
    print(f"--- Start: {overall_start_time:%Y-%m-%d %H:%M:%S} ---")

    # === 1-4: Load, Prep, Features, Standardize, Clean ===
    df_prep = utils.load_prepare_data(config.DATA_FILE, config.COLUMN_CONFIG, config.VARS_TO_LOG, config.WINSORIZE_LIMITS, config.TARGET_VARIABLE, config.NEXT_RETURN_VARIABLE, config.MARKET_CAP_ORIG_VARIABLE)
    if df_prep is None: exit()
    all_num, ols3_f, all_needed = utils.define_features(df_prep, config.OLS3_FEATURE_NAMES, [config.TARGET_VARIABLE, config.NEXT_RETURN_VARIABLE, config.MARKET_CAP_ORIG_VARIABLE, 'Instrument', 'Date', 'MonthlyReturn_t', 'MonthlyRiskFreeRate_t'])
    if not all_needed: exit()
    df_std = utils.rank_standardize_features(df_prep, all_needed)
    df_clean = utils.clean_data(df_std, all_needed, config.ESSENTIAL_COLS_FOR_DROPNA, config.MARKET_CAP_ORIG_VARIABLE)
    if df_clean is None or df_clean.empty: exit()
    all_num, ols3_f, _ = utils.define_features(df_clean, config.OLS3_FEATURE_NAMES, [config.TARGET_VARIABLE, config.NEXT_RETURN_VARIABLE, config.MARKET_CAP_ORIG_VARIABLE, 'Instrument', 'Date', 'MonthlyReturn_t', 'MonthlyRiskFreeRate_t'])
    if not ols3_f: config.RUN_MODELS['OLS3H'] = False; print("ADVARSEL: OLS3H deaktivert.")
    if not all_num: print("FEIL: Ingen numeriske features igjen."); exit()

    # === Initialize Results Storage ===
    all_metrics = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    all_vi_avg = defaultdict(dict)
    all_portfolios = defaultdict(dict)
    all_summaries = {}

    # === 5: Outer Loop: Subsets ===
    for subset in config.SUBSETS_TO_RUN:
        subset_start = datetime.datetime.now()
        print(f"\n{'='*25} Starter Subset: {subset.upper()} {'='*25}")
        # --- Create Subset (Using PERCENTILES) ---
        df_sub = pd.DataFrame(); df_mc = df_clean.dropna(subset=[config.MARKET_CAP_ORIG_VARIABLE, 'Date']).copy()
        if df_mc.empty: print(f"FEIL: Ingen data for subsetting i {subset}."); continue
        if subset=='all': df_sub = df_clean.copy()
        else:
            df_mc['MonthYear'] = df_mc['Date'].dt.to_period('M')
            if subset=='big': cutoff_perc=1-(config.BIG_FIRM_TOP_PERCENT/100.0); size_cutoffs=df_mc.groupby('MonthYear')[config.MARKET_CAP_ORIG_VARIABLE].quantile(cutoff_perc); print(f"(Definert som Topp {config.BIG_FIRM_TOP_PERCENT}%)")
            elif subset=='small': cutoff_perc=config.SMALL_FIRM_BOTTOM_PERCENT/100.0; size_cutoffs=df_mc.groupby('MonthYear')[config.MARKET_CAP_ORIG_VARIABLE].quantile(cutoff_perc); print(f"(Definert som Bunn {config.SMALL_FIRM_BOTTOM_PERCENT}%)")
            else: print("FEIL: Ukjent subset type"); continue
            df_sub = df_mc.join(size_cutoffs.rename('cutoff'), on='MonthYear')
            df_sub = df_sub[df_sub[config.MARKET_CAP_ORIG_VARIABLE] >= df_sub['cutoff']] if subset == 'big' else df_sub[df_sub[config.MARKET_CAP_ORIG_VARIABLE] <= df_sub['cutoff']]
            df_sub = df_sub.drop(columns=['MonthYear', 'cutoff'], errors='ignore')
        if df_sub.empty: print(f"FEIL: Tomt subset {subset}."); continue
        df_sub = df_sub.sort_values(["Date", "Instrument"]).reset_index(drop=True); print(f"Subset form: {df_sub.shape}")

        # === 6: Inner Loop: Rolling Windows ===
        try: splits = list(utils.get_yearly_rolling_splits(df_sub, config.INITIAL_TRAIN_YEARS, config.VALIDATION_YEARS, config.TEST_YEARS_PER_WINDOW))
        except ValueError as e: print(f"FEIL splits {subset}: {e}"); continue
        if not splits: print(f"Ingen vinduer for {subset}."); continue
        window_preds_list = []; last_train_idx, last_val_idx, last_models = None, None, {}
        all_vi_window = defaultdict(list)

        for window, (train_idx, val_idx, test_idx, _, _, _) in enumerate(splits):
            win_num = window + 1; win_start = time.time(); print(f"-- Vindu {win_num}/{len(splits)} --")
            if test_idx.empty or val_idx.empty or train_idx.empty: print(" Tomt sett."); continue
            y_train=df_sub.loc[train_idx,config.TARGET_VARIABLE].values; y_val=df_sub.loc[val_idx,config.TARGET_VARIABLE].values; y_test=df_sub.loc[test_idx,config.TARGET_VARIABLE].values; y_train_val=np.concatenate((y_train,y_val))
            window_preds={'Date':df_sub.loc[test_idx,'Date'].values,'Instrument':df_sub.loc[test_idx,'Instrument'].values,config.TARGET_VARIABLE:y_test}
            window_models_fit={}

            # === 7: Innermost Loop: Models ===
            for model_name, do_run in config.RUN_MODELS.items():
                if not do_run: continue
                print(f"  -> {model_name}...")
                m_start=time.time(); fitted_model,p_oos,p_is,opt_p = None,np.array([]),np.array([]),{}
                f_key=config.MODEL_FEATURE_MAP.get(model_name); current_f=ols3_f if f_key=='ols3_features' else all_num
                m_params=config.MODEL_PARAMS.get(model_name,{});
                if not current_f: print(f"    FEIL: Mangler features."); continue
                X_train=df_sub.loc[train_idx,current_f].values; X_val=df_sub.loc[val_idx,current_f].values; X_test=df_sub.loc[test_idx,current_f].values; X_train_val=np.vstack((X_train,X_val))
                min_obs_train=max(2,X_train.shape[1]+1 if model_name=='OLS3H' else 2)
                if np.isnan(y_train).all() or X_train.shape[0]<min_obs_train: print(f"    Utilstrekkelig train data."); continue
                if np.isnan(y_val).all() or X_val.shape[0]<2:
                    if model_name not in ['OLS','OLS3H','ENET']: print(f"    Utilstrekkelig val data."); continue

                try: # Call training function
                    train_func_name = f"train_evaluate_{model_name.lower().replace('-','').replace('+','_')}"
                    train_func = locals().get(train_func_name)
                    if train_func:
                        if model_name in ['OLS','OLS3H']: fitted_model,p_oos,p_is,opt_p = train_func(X_train_val,y_train_val,X_test,m_params); y_is_target=y_train_val
                        elif model_name == 'ENET': fitted_model,p_oos,p_is,opt_p = train_func(X_train,y_train,X_test,m_params); y_is_target=y_train
                        elif model_name.startswith('NN'):
                             if TENSORFLOW_AVAILABLE: fitted_model,p_oos,p_is,opt_p = train_func(X_train,y_train,X_val,y_val,X_test,m_params,config.MODEL_PARAMS[model_name]); y_is_target=y_train_val
                             else: continue
                        else: fitted_model,p_oos,p_is,opt_p = train_func(X_train,y_train,X_val,y_val,X_test,m_params); y_is_target=y_train_val
                    else: print(f"    FEIL: Fant ikke {train_func_name}."); continue
                    # Metrics & Storage
                    r2_oos=utils.calculate_oos_r2(y_test,p_oos); r2_is=utils.calculate_oos_r2(y_is_target,p_is); sharpe_oos=utils.calculate_sharpe_of_predictions(p_oos)
                    all_metrics[subset][model_name]['oos_r2'].append(r2_oos); all_metrics[subset][model_name]['is_r2_train_val'].append(r2_is); all_metrics[subset][model_name]['oos_sharpe'].append(sharpe_oos)
                    for pname,pval in opt_p.items(): all_metrics[subset][model_name][f'optim_{pname}'].append(pval)
                    window_preds[f'yhat_{model_name.lower()}']=p_oos if p_oos is not None else np.nan; window_models_fit[model_name]=fitted_model
                    print(f"    {model_name}: R2={r2_oos:.4f} ({time.time()-m_start:.1f}s)")
                    # Per-window VI
                    if config.CALCULATE_VI and config.MODEL_VI_STRATEGY.get(model_name)=='per_window':
                         vi_start=time.time(); vi_df=utils.calculate_variable_importance(model_name,fitted_model,X_train_val,y_train_val,current_f,r2_is,config.VI_METHOD,opt_p)
                         if vi_df is not None and not vi_df.empty: all_vi_window[model_name].append(vi_df)
                except Exception as e: print(f"    !!! FEIL {model_name}: {e}"); traceback.print_exc(); all_metrics[subset][model_name]['oos_r2'].append(np.nan); window_preds[f'yhat_{model_name.lower()}']=np.nan

            window_preds_list.append(pd.DataFrame(window_preds))
            if window==len(splits)-1: last_train_idx,last_val_idx,last_models = train_idx.copy(),val_idx.copy(),window_models_fit.copy()
            print(f"-- Vindu {win_num} ferdig ({time.time()-win_start:.1f}s) --")
        # End window loop

        # Aggregate per-window VI
        if config.CALCULATE_VI:
            for model_name, vi_list in all_vi_window.items():
                 if vi_list: all_vi[subset][model_name].extend(vi_list)

        # === 8-10: Post-Window Analysis ===
        if not window_preds_list: print(f"Ingen resultater for {subset}."); continue
        results_df_sub=pd.concat(window_preds_list).reset_index(drop=True); pred_cols_sub=[c for c in results_df_sub.columns if c.startswith('yhat_')]
        if not pred_cols_sub: continue

        # Overall OOS R2
        print(f"\n--- Overall OOS R2 ({subset}) ---")
        y_true_s=results_df_sub[config.TARGET_VARIABLE].dropna(); ss_tot_s=np.sum(y_true_s**2)
        if len(y_true_s)>1 and ss_tot_s>1e-15:
            for pc in pred_cols_sub: mn=pc.replace('yhat_','').upper(); y_pred_s=results_df_sub[pc]; r2_o=utils.calculate_oos_r2(y_true_s,y_pred_s); all_metrics[subset][mn]['oos_r2_overall_gu']=r2_o; print(f"  {mn}: {r2_o:.6f}")

        # Portfolio Analysis
        decile_t, hl_risk_t, long_risk_t = utils.perform_detailed_portfolio_analysis(results_df_sub, df_clean, pred_cols_sub, config.MARKET_CAP_ORIG_VARIABLE, config.NEXT_RETURN_VARIABLE, 'MonthlyRiskFreeRate_t', config.FILTER_SMALL_CAPS_PORTFOLIO, config.ANNUALIZATION_FACTOR, config.BENCHMARK_FILE, config.FF_FACTOR_FILE)
        all_portfolios[subset]={'decile_tables':decile_t, 'hl_risk_tables':hl_risk_t, 'long_risk_tables':long_risk_t}

        # Variable Importance (Final)
        if config.CALCULATE_VI:
            print(f"\n--- Variabel Viktighet ({subset}) ---")
            for model_name, do_run in config.RUN_MODELS.items():
                if not do_run: continue
                vi_strat=config.MODEL_VI_STRATEGY.get(model_name); f_key=config.MODEL_FEATURE_MAP.get(model_name); current_f=ols3_f if f_key=='ols3_features' else all_num
                if not current_f: continue
                if vi_strat=='per_window':
                    vi_list=all_vi[subset].get(model_name,[])
                    if vi_list: avg_vi=pd.concat(vi_list).groupby('Feature')['Importance'].mean().reset_index(); tot_avg=avg_vi['Importance'].sum(); avg_vi['Importance']=avg_vi['Importance']/tot_avg if tot_avg>1e-9 else 0.0; all_vi_avg[subset][model_name]=avg_vi.sort_values('Importance',ascending=False).reset_index(drop=True); print(f"  VI (Avg) for {model_name} beregnet.")
                elif vi_strat=='last_window':
                    if last_train_idx is None or model_name not in last_models: print(f"  Skipping last_window VI for {model_name}."); continue
                    print(f"  Beregner last_window VI for {model_name}..."); vi_start=time.time(); last_model=last_models[model_name]
                    last_is_r2=all_metrics[subset][model_name]['is_r2_train_val'][-1] if all_metrics[subset][model_name]['is_r2_train_val'] else np.nan
                    X_tv_last=df_sub.loc[last_train_idx.union(last_val_idx),current_f].values; y_tv_last=df_sub.loc[last_train_idx.union(last_val_idx),config.TARGET_VARIABLE].values
                    last_opt_params={k.replace('optim_',''):v[-1] for k,v in all_metrics[subset][model_name].items() if k.startswith('optim_') and v}
                    vi_df=utils.calculate_variable_importance(model_name,last_model,X_tv_last,y_tv_last,current_f,last_is_r2,config.VI_METHOD,last_opt_params)
                    if vi_df is not None and not vi_df.empty: all_vi_avg[subset][model_name]=vi_df.sort_values('Importance',ascending=False).reset_index(drop=True); print(f"  VI ({model_name}) ferdig ({time.time()-vi_start:.1f}s)")

        # Summary Table & Plots
        all_summaries[subset] = utils.create_summary_table(all_metrics[subset], config.ANNUALIZATION_FACTOR)
        utils.plot_time_varying_complexity(all_metrics[subset], config.COMPLEXITY_PARAMS_TO_PLOT)
        if config.CALCULATE_VI and all_vi_avg[subset]:
             print(f"\n--- Plotter VI for Subset: {subset} ---")
             for model_name, vi_df in all_vi_avg[subset].items():
                 plt.figure(figsize=(10,max(6, config.VI_PLOT_TOP_N*0.3))); plot_df=vi_df[vi_df['Importance']>1e-6].head(config.VI_PLOT_TOP_N).sort_values(by='Importance',ascending=True)
                 if not plot_df.empty: plt.barh(plot_df['Feature'],plot_df['Importance']); plt.xlabel("Relativ Viktighet"); plt.title(f"{model_name} VI ({subset} - Top {config.VI_PLOT_TOP_N})"); plt.tight_layout(); plt.show()

        # Save Results
        results_to_save={'summary_metrics':all_summaries[subset],'portfolio_deciles':all_portfolios[subset].get('decile_tables',{}),'portfolio_hl_risk':all_portfolios[subset].get('hl_risk_tables',{}),'portfolio_long_risk':all_portfolios[subset].get('long_risk_tables',{}),'variable_importance_avg':all_vi_avg[subset]}
        utils.save_results(config.OUTPUT_DIR, subset, results_to_save)
        print(f"\n{'='*25} Subset Fullført: {subset.upper()} (Tid: {datetime.datetime.now() - subset_start}) {'='*25}")
    # End subset loop

    # === Final Reporting ===
    print("\n\n" + "="*30 + " SLUTTSAMMENDRAG " + "="*30)
    r2_final = defaultdict(dict)
    for sub in config.SUBSETS_TO_RUN:
        for model, mets in all_metrics[sub].items(): r2_final[sub][model] = mets.get('oos_r2_overall_gu', np.nan) * 100
    r2_summary_final = pd.DataFrame.from_dict(r2_final, orient='index')
    model_order=['OLS','OLS3H','PLS','PCR','ENET','GLM_H','RF','GBRT_H','NN1','NN2','NN3','NN4','NN5']
    cols_ordered=[m for m in model_order if m in r2_summary_final.columns]+[m for m in r2_summary_final.columns if m not in model_order]
    r2_summary_final = r2_summary_final[cols_ordered]; r2_summary_final.index.name="Subset"; r2_summary_final.columns.name="Model"
    print("--- Tabell 1 Stil: Overall Monthly OOS R2 (%) [Gu et al. Def] ---"); print(r2_summary_final.round(4))
    utils.save_results(config.OUTPUT_DIR, "consolidated", {"R2_summary_table1_style": r2_summary_final})

    print(f"\n--- Pipeline Fullført --- ({datetime.datetime.now():%Y-%m-%d %H:%M:%S})")
    print(f"Total kjøretid: {datetime.datetime.now() - overall_start_time}")
    print(f"Resultater lagret i: {config.OUTPUT_DIR}")

Configuration loaded from config.py


2025-04-02 16:00:25.218943: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


--- Start: 2025-04-02 16:00:29 ---

--- 1. Laster & Forbereder Data ---
Laster data fra: Cleaned_OSEFX_Market_Macro_Data.csv
Data lastet inn. Form: (34476, 34)
Kolonnenavn standardisert.
FEIL: Date/Instrument mangler.

--- 2. Definerer Features ---
 FEIL: DataFrame tom.

--- 3. Rank Standardiserer Features ---


AttributeError: 'NoneType' object has no attribute 'columns'