# GBRT + H 

In [2]:
# --- IMPORTS ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize
from sklearn.metrics import r2_score, mean_squared_error
# *** Import GBRT ***
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import ParameterGrid
# *** Import for Factor Regressions (commented out until needed) ***
# import statsmodels.api as sm
import datetime
import warnings
import traceback
from collections import defaultdict
import os
import time # For timing VI

# --- WARNINGS CONFIGURATION ---
warnings.filterwarnings("ignore", category=DeprecationWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="Mean of empty slice")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in log")
warnings.filterwarnings("ignore", category=RuntimeWarning, message="Maximum number of iterations reached.*")
pd.options.mode.chained_assignment = None

# --------------------------------------------------------------------------
# FUNCTION DEFINITIONS
# --------------------------------------------------------------------------

# Step 1: Load and Prepare Dataset
def load_prepare_data(file_path):
    """ Loads, cleans, calculates returns/market cap, handles dates/IDs. """
    print(f"Laster data fra: {file_path}")
    try:
        df = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError: print(f"FEIL: Fil '{file_path}' ikke funnet."); return None
    print(f"Data lastet inn. Form: {df.shape}")

    date_col = 'Date' if 'Date' in df.columns else 'eom'
    id_col = 'Instrument' if 'Instrument' in df.columns else 'id'
    price_col = 'ClosePrice' if 'ClosePrice' in df.columns else 'prc'
    shares_col = 'CommonSharesOutstanding'
    rf_col = 'NorgesBank10Y'
    sector_col = 'EconomicSector'

    if date_col not in df.columns: print("FEIL: Dato-kolonne mangler."); return None
    if id_col not in df.columns: print("FEIL: Instrument-ID mangler."); return None
    if price_col not in df.columns: print(f"FEIL: Pris-kolonne ('{price_col}') mangler."); return None

    df = df.rename(columns={date_col: 'Date', id_col: 'Instrument'})
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=["Instrument", "Date"]).reset_index(drop=True)
    print("Dato konvertert og data sortert.")

    df["MonthlyReturn"] = df.groupby("Instrument")[price_col].pct_change()
    df["MonthlyReturn"].fillna(0, inplace=True)
    df["MonthlyReturn"] = winsorize(df["MonthlyReturn"].values, limits=[0.01, 0.01])
    print("Månedlig avkastning ('MonthlyReturn') beregnet/winsorisert.")

    if rf_col not in df.columns: df[rf_col] = 0; print(f"ADVARSEL: '{rf_col}' mangler, bruker 0.")
    df["MonthlyRiskFreeRate_t"] = df[rf_col] / 12 / 100 if df[rf_col].abs().max() > 1 else df[rf_col] / 12
    df["TargetReturn_t"] = df["MonthlyReturn"] - df["MonthlyRiskFreeRate_t"]
    print("Risikojustert avkastning ('TargetReturn_t') beregnet (modellens y).")

    df['NextMonthlyReturn_t+1'] = df.groupby('Instrument')['MonthlyReturn'].shift(-1)
    print("Neste måneds rå avkastning ('NextMonthlyReturn_t+1') beregnet.")

    if shares_col not in df.columns: print(f"FEIL: '{shares_col}' mangler for MarketCap."); return None
    df["MarketCap"] = df[price_col] * df[shares_col]
    df['MarketCap_orig'] = df['MarketCap'].copy()
    df['MarketCap'] = df['MarketCap'].fillna(0)
    print("Markedsverdi ('MarketCap') beregnet.")

    if sector_col in df.columns:
        df = pd.get_dummies(df, columns=[sector_col], prefix="Sector", dtype=int)
        print("Sektor dummy-variabler opprettet.")

    df.columns = df.columns.str.replace(" ", "_").str.replace("-", "_")
    print("Kolonnenavn renset.")

    print("Log-transformerer spesifikke variabler...")
    vars_to_log = ["MarketCap", "BM", "ClosePrice", "Volume", "CommonSharesOutstanding"]
    vars_to_log = [v if v in df.columns else v.lower() for v in vars_to_log]
    vars_to_log = [v if v in df.columns else 'prc' if v == 'closeprice' else v for v in vars_to_log]
    vars_to_log = [col for col in vars_to_log if col in df.columns]
    for var in vars_to_log:
        if var in df.columns and pd.api.types.is_numeric_dtype(df[var]):
             df[f"{var}_positive"] = df[var].where(df[var] > 1e-9, np.nan)
             df[f"log_{var}"] = np.log(df[f"{var}_positive"])
             log_median = df[f"log_{var}"].median()
             df[f"log_{var}"] = df[f"log_{var}"].fillna(log_median)
             if pd.isna(log_median): df[f"log_{var}"] = df[f"log_{var}"].fillna(0)
             df.drop(columns=[f"{var}_positive"], inplace=True)
    print("Log-transformasjon fullført.")

    if 'MarketCap_orig' not in df.columns and 'MarketCap' in df.columns:
         df['MarketCap_orig'] = df['MarketCap'].copy()

    return df

# Step 1.5: Rank Standardization
def rank_standardize_features(df, features_to_standardize):
    print(f"Rank standardiserer {len(features_to_standardize)} features...")
    if 'Date' not in df.columns: print("FEIL: 'Date' mangler."); return df
    features_present = [f for f in features_to_standardize if f in df.columns]
    if len(features_present) < len(features_to_standardize):
        missing = [f for f in features_to_standardize if f not in features_present]; print(f"  ADVARSEL: Features manglet for standardisering: {missing}")
    if not features_present: print("  Ingen features å standardisere."); return df
    def rank_transform(x): x_numeric = pd.to_numeric(x, errors='coerce'); ranks = x_numeric.rank(pct=True); return ranks * 2 - 1
    try: ranked_cols = df.groupby('Date')[features_present].transform(rank_transform); df[features_present] = ranked_cols
    except Exception as e:
        print(f"  ADVARSEL rank (transform): {e}. Prøver apply.")
        try: df_std = df.set_index('Date'); [df_std.groupby(level=0)[col].apply(rank_transform) for col in features_present]; df = df_std.reset_index()
        except Exception as e2: print(f"  FEIL: Også alternativ standardisering feilet: {e2}"); return df
    print("Rank standardisering fullført.")
    return df

# Step 2: Define Feature Sets
def define_features(df):
    print("Identifiserer numeriske features...")
    if df is None or df.empty: print("  FEIL: DataFrame er tom."); return []
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    print(f"  Funnet {len(numeric_cols)} numeriske kolonner totalt.")
    cols_to_exclude = ['Instrument', 'Date', 'level_0', 'index', 'Year', 'MonthYear', 'TargetReturn_t', 'NextMonthlyReturn_t+1', 'MonthlyReturn', 'MonthlyRiskFreeRate_t', 'MarketCap_orig', 'rank', 'DecileRank', 'eq_weights', 'me_weights']
    cols_to_exclude.extend([col for col in df.columns if 'return_stock' in col or 'return_portfolio' in col])
    log_cols = [col for col in numeric_cols if col.startswith('log_')]; originals_of_log = [col.replace('log_','') for col in log_cols]; common_originals = ['MarketCap', 'ClosePrice', 'prc', 'Volume', 'CommonSharesOutstanding', 'BM']; originals_to_exclude = [orig for orig in originals_of_log if orig in df.columns and orig in common_originals]; cols_to_exclude.extend(originals_to_exclude); cols_to_exclude = list(set(cols_to_exclude))
    potential_features = [col for col in numeric_cols if col not in cols_to_exclude]
    final_features = [col for col in potential_features if col in df.columns and df[col].nunique(dropna=True) > 1 and df[col].std(ddof=0, skipna=True) > 1e-9]
    final_features = sorted(list(set(final_features))); print(f"  Identifisert {len(final_features)} features for bruk i modellen.")
    return final_features

# Step 3: Handle Missing / Infinite Values
def clean_data(df, numeric_features_to_impute, essential_cols_for_dropna, target="TargetReturn_t"):
    print("Starter datarensing (missing/inf)..."); initial_rows = len(df)
    features_present = [f for f in numeric_features_to_impute if f in df.columns]
    if features_present:
        inf_mask = df[features_present].isin([np.inf, -np.inf]);
        if inf_mask.any().any(): print(f"  Erstatter inf med NaN i {inf_mask.any(axis=0).sum()} feature kolonner..."); df[features_present] = df[features_present].replace([np.inf, -np.inf], np.nan)
        nan_counts_before = df[features_present].isnull().sum(); medians = df[features_present].median(skipna=True); df[features_present] = df[features_present].fillna(medians); nan_counts_after = df[features_present].isnull().sum()
        imputed_cols = (nan_counts_before - nan_counts_after); print(f"  NaNs imputert med median i {imputed_cols[imputed_cols > 0].count()} feature kolonner.")
        if medians.isnull().any(): cols_nan_median = medians[medians.isnull()].index.tolist(); print(f"  ADVARSEL: Median NaN for: {cols_nan_median}. Fyller med 0."); df[cols_nan_median] = df[cols_nan_median].fillna(0)
    essential_cols_present = [col for col in essential_cols_for_dropna if col in df.columns]; [essential_cols_present.append(col) for col in [target,'NextMonthlyReturn_t+1','MarketCap_orig'] if col in df.columns and col not in essential_cols_present]
    unique_essential_cols = sorted(list(set(essential_cols_present)))
    if unique_essential_cols: rows_before_dropna = len(df); df = df.dropna(subset=unique_essential_cols); rows_dropped = rows_before_dropna - len(df);
    if rows_dropped > 0: print(f"  Fjernet {rows_dropped} rader pga. NaN i essensielle kolonner: {unique_essential_cols}")
    mc_orig_col = 'MarketCap_orig'
    if mc_orig_col in df.columns: rows_before_mc_filter = len(df); df = df[df[mc_orig_col] > 0]; rows_dropped_mc = rows_before_mc_filter - len(df);
    if rows_dropped_mc > 0: print(f"  Fjernet {rows_dropped_mc} rader der {mc_orig_col} <= 0.")
    final_rows = len(df); print(f"Datarensing fullført. Form: {df.shape}. Fjernet totalt {initial_rows - final_rows} rader.");
    if df.empty: print("FEIL: Ingen data igjen etter rensing.")
    return df

# Step 4: Yearly Rolling Window Splits
def get_yearly_rolling_splits(df, initial_train_years, val_years, test_years=1):
    if "Date" not in df.columns: raise ValueError("'Date'-kolonnen mangler.")
    df['Year'] = df["Date"].dt.year; unique_years = sorted(df["Year"].unique()); n_unique_years = len(unique_years); print(f"Unike år i data: {n_unique_years} ({unique_years[0]} - {unique_years[-1]})")
    if n_unique_years < initial_train_years + val_years + test_years: df.drop(columns=['Year'], inplace=True, errors='ignore'); raise ValueError(f"Ikke nok unike år ({n_unique_years}) for split.")
    first_test_year_index = initial_train_years + val_years;
    if first_test_year_index >= n_unique_years: df.drop(columns=['Year'], inplace=True, errors='ignore'); raise ValueError("Train+Val år for lange.")
    first_test_year = unique_years[first_test_year_index]; last_test_year = unique_years[-test_years]; num_windows = last_test_year - first_test_year + 1
    if num_windows <= 0: df.drop(columns=['Year'], inplace=True, errors='ignore'); raise ValueError("Negativt antall vinduer.")
    print(f"Genererer {num_windows} årlige rullerende vinduer... (Første testår: {first_test_year}, Siste: {last_test_year})")
    for i in range(num_windows):
        current_test_start_year = first_test_year + i; current_test_end_year = current_test_start_year + test_years - 1
        current_val_end_year = current_test_start_year - 1; current_val_start_year = current_val_end_year - val_years + 1
        current_train_end_year = current_val_start_year - 1; current_train_start_year = unique_years[0]
        train_idx = df[(df['Year'] >= current_train_start_year) & (df['Year'] <= current_train_end_year)].index
        val_idx = df[(df['Year'] >= current_val_start_year) & (df['Year'] <= current_val_end_year)].index
        test_idx = df[(df['Year'] >= current_test_start_year) & (df['Year'] <= current_test_end_year)].index
        train_dates = df.loc[train_idx, "Date"].agg(['min', 'max']) if not train_idx.empty else None
        val_dates = df.loc[val_idx, "Date"].agg(['min', 'max']) if not val_idx.empty else None
        test_dates = df.loc[test_idx, "Date"].agg(['min', 'max']) if not test_idx.empty else None
        print(f"\n  Vindu {i+1}/{num_windows}: Train {current_train_start_year}-{current_train_end_year} ({len(train_idx)}), Val {current_val_start_year}-{current_val_end_year} ({len(val_idx)}), Test {current_test_start_year}-{current_test_end_year} ({len(test_idx)})")
        yield train_idx, val_idx, test_idx, train_dates, val_dates, test_dates
    df.drop(columns=['Year'], inplace=True, errors='ignore')


# --------------------------------------------------------------------------
# Step 5: Run GBRT with Huber Loss on a Single Rolling Window (MODIFIED)
# --------------------------------------------------------------------------
def run_gbrt_h_on_window(X_train, y_train, X_val, y_val, X_test, y_test, param_grid=None):
    """
    Trains GBRT with Huber loss (alpha=0.999 fixed), tunes other hyperparameters via validation MSE.
    Returns model, predictions, metrics, optimal max_depth, and optimal parameter dictionary.
    """
    model = None
    optim_param_found = None
    optimal_max_depth = np.nan
    preds_oos = np.full(y_test.shape[0], np.nan)
    preds_is_train_val = np.full(y_train.shape[0] + y_val.shape[0], np.nan)
    r2_oos, mse_oos, sharpe_oos, r2_is_train_val = (np.nan,) * 4

    # Default GBRT-H Hyperparameter Grid (Inspired by article/example, fixed alpha)
    if param_grid is None:
         param_grid = {
             'n_estimators': [100], # Fixed number of trees (as in German example) - Can be tuned with early stopping if needed
             'learning_rate': [0.1, 0.01], # As per Table A.5
             'max_depth': [1, 2], # As per Table A.5
             # Adding reasonable regularization based on German example
             'min_samples_split': [1000, 5000], # Adjust based on dataset size
             'min_samples_leaf': [500, 1000],   # Adjust based on dataset size
             'max_features': ['sqrt'] # Common choice
             # 'alpha' is NOT tuned here, fixed at 0.999 later based on Table A.5
         }

    grid = list(ParameterGrid(param_grid))
    best_mse_val = np.inf

    if X_val.shape[0] < 2:
        print("    ADVARSEL: Valideringssettet for lite (<2 obs). Hopper over tuning/trening.")
        return model, preds_oos, r2_oos, mse_oos, sharpe_oos, preds_is_train_val, r2_is_train_val, optimal_max_depth, optim_param_found

    # --- Hyperparameter Tuning Loop (Minimizing Validation MSE) ---
    # print(f"    Tuner GBRT-H ({len(grid)} kombinasjoner via Val MSE)...") # Verbose
    for i, params in enumerate(grid):
        try:
             # *** Use loss='huber' and fix alpha=0.999 ***
             gbrt_val = GradientBoostingRegressor(loss='huber', alpha=0.999, random_state=42, **params)
             gbrt_val.fit(X_train, y_train)
             y_val_pred = gbrt_val.predict(X_val)

             if not np.all(np.isfinite(y_val_pred)): continue # Skip if prediction failed

             # *** Use MSE for validation objective ***
             current_mse_val = mean_squared_error(y_val, y_val_pred)

             if not np.isnan(current_mse_val) and current_mse_val < best_mse_val:
                 best_mse_val = current_mse_val
                 optim_param_found = params

        except Exception as e:
             # print(f"      Tuning error for params {params}: {e}") # Optional debug
             continue # Continue tuning even if one combination fails

    if optim_param_found is None:
        print("    FEIL: GBRT-H Tuning feilet (ingen gyldig parameter funnet basert på Val MSE). Hopper over trening.")
        return model, preds_oos, r2_oos, mse_oos, sharpe_oos, preds_is_train_val, r2_is_train_val, optimal_max_depth, optim_param_found

    optimal_max_depth = optim_param_found.get('max_depth', np.nan)
    # print(f"    Optimal GBRT-H params funnet (basert på Val MSE): {optim_param_found}") # Verbose

    # --- Final Training (using Train + Validation set) ---
    try:
        # print("    Trener endelig GBRT-H på Train+Val...") # Verbose
        X_train_val = np.vstack((X_train, X_val)); y_train_val = np.concatenate((y_train, y_val))

        # *** Instantiate final model with fixed alpha and optimal params ***
        model = GradientBoostingRegressor(loss='huber', alpha=0.999, random_state=42, **optim_param_found)
        model.fit(X_train_val, y_train_val)

        # --- OOS Evaluation ---
        if X_test.shape[0] > 0:
            preds_oos = model.predict(X_test); nan_preds_oos_mask = ~np.isfinite(preds_oos)
            if nan_preds_oos_mask.any(): preds_oos[nan_preds_oos_mask] = 0 # Replace non-finite
            valid_oos_mask = np.isfinite(y_test) & np.isfinite(preds_oos); y_test_valid = y_test[valid_oos_mask]; preds_oos_valid = preds_oos[valid_oos_mask]
            if len(preds_oos_valid) > 1:
                ss_res_oos = np.sum((y_test_valid - preds_oos_valid)**2); ss_tot_oos = np.sum(y_test_valid**2)
                r2_oos = 1 - (ss_res_oos / ss_tot_oos) if ss_tot_oos > 1e-9 else np.nan
                mse_oos = mean_squared_error(y_test_valid, preds_oos_valid)
                pred_std_oos = np.std(preds_oos_valid); sharpe_oos = (np.mean(preds_oos_valid)/pred_std_oos)*np.sqrt(12) if pred_std_oos > 1e-9 else np.nan

        # --- IS Evaluation (on Train+Val set for VI baseline) ---
        preds_is_train_val = model.predict(X_train_val); nan_preds_is_mask = ~np.isfinite(preds_is_train_val)
        if nan_preds_is_mask.any(): preds_is_train_val[nan_preds_is_mask] = 0
        valid_is_mask = np.isfinite(y_train_val) & np.isfinite(preds_is_train_val); y_train_val_valid = y_train_val[valid_is_mask]; preds_is_valid = preds_is_train_val[valid_is_mask]
        if len(preds_is_valid) > 1:
            ss_res_is = np.sum((y_train_val_valid - preds_is_valid)**2); ss_tot_is = np.sum(y_train_val_valid**2)
            r2_is_train_val = 1 - (ss_res_is / ss_tot_is) if ss_tot_is > 1e-9 else np.nan

    except Exception as e:
        print(f"  FEIL under endelig GBRT-H trening/prediksjon: {e}")
        model = None; optimal_max_depth = np.nan; preds_oos.fill(np.nan); preds_is_train_val.fill(np.nan)
        r2_oos, mse_oos, sharpe_oos, r2_is_train_val = (np.nan,) * 4
        optim_param_found = None # Reset params if training failed

    # Return optimal_max_depth and the full parameter dictionary
    return model, preds_oos, r2_oos, mse_oos, sharpe_oos, preds_is_train_val, r2_is_train_val, optimal_max_depth, optim_param_found


# Step 6.5: Detailed Portfolio Analysis Function
def MDD(returns):
    """ Calculates Maximum Drawdown using arithmetic returns for NAV. """
    returns = pd.Series(returns).fillna(0)
    if returns.empty: return np.nan
    nav = (1 + returns).cumprod(); hwm = nav.cummax(); dd = nav / hwm - 1
    return dd.min() if not dd.empty else np.nan

def perform_detailed_portfolio_analysis(results_df, original_df_subset, benchmark_file=None, ff_factor_file=None, filter_small_caps=False, model_name_label="GBRT-H"): # Default Label Changed
    """ Performs detailed portfolio analysis, generates tables and plots for the given model label. """
    print("\n--- Starter Detaljert Porteføljeanalyse (Prediction-Sorted Deciles) ---")
    pred_col = f'yhat_{model_name_label.lower().replace("-","_").replace("+","h")}' # e.g., yhat_gbrt_h
    ew_table, vw_table, ew_chart_hl, vw_chart_hl, ew_chart_long, vw_chart_long = (pd.DataFrame(),)*6

    if pred_col not in results_df.columns: print(f"FEIL: Kolonne '{pred_col}' mangler."); return (pd.DataFrame(),)*6
    required_cols = ['Date', 'Instrument', 'MarketCap_orig', 'NextMonthlyReturn_t+1', 'MonthlyReturn', 'MonthlyRiskFreeRate_t']
    if not all(c in original_df_subset.columns for c in required_cols): print("FEIL: Mangler kolonner i original subset."); return (pd.DataFrame(),)*6

    portfolio_data = pd.merge(results_df[['Date', 'Instrument', 'TargetReturn_t', pred_col]], original_df_subset[required_cols], on=['Date', 'Instrument'], how='inner')
    portfolio_data = portfolio_data.rename(columns={'TargetReturn_t': 'y_true_t', pred_col: 'yhat_t+1', 'MarketCap_orig': 'me', 'NextMonthlyReturn_t+1': 'ret_t+1'})
    portfolio_data['MonthYear'] = portfolio_data['Date'].dt.to_period('M')
    monthly_rf_map = portfolio_data.groupby('MonthYear')['MonthlyRiskFreeRate_t'].mean().shift(-1)
    portfolio_data['NextMonthRiskFreeRate_t+1'] = portfolio_data['MonthYear'].map(monthly_rf_map)
    cols_for_eval_dropna = ['yhat_t+1', 'ret_t+1', 'me', 'NextMonthRiskFreeRate_t+1']
    portfolio_data = portfolio_data.dropna(subset=cols_for_eval_dropna)
    if portfolio_data.empty: print("  FEIL: Ingen data igjen for analyse."); return (pd.DataFrame(),)*6
    portfolio_data['excess_ret_t+1'] = portfolio_data['ret_t+1'] - portfolio_data['NextMonthRiskFreeRate_t+1']

    print("  Sorterer i desiler og beregner vekter...")
    monthly_data_dict = {}; all_decile_dfs = {i: [] for i in range(10)}
    unique_months = sorted(portfolio_data['MonthYear'].unique())
    for month in unique_months:
        monthly_df = portfolio_data[portfolio_data['MonthYear'] == month].copy()
        if filter_small_caps:
            if 'me' in monthly_df.columns and len(monthly_df) > 10: mc_cutoff = monthly_df['me'].quantile(0.10); monthly_df = monthly_df[monthly_df['me'] >= mc_cutoff].copy()
        if len(monthly_df) < 10: continue
        monthly_df = monthly_df.sort_values('yhat_t+1')
        try:
            monthly_df['rank'] = monthly_df['yhat_t+1'].rank(method='first'); monthly_df['DecileRank'] = pd.qcut(monthly_df['rank'], 10, labels=False, duplicates='drop')
            if monthly_df['DecileRank'].nunique() < 10: continue
        except ValueError: continue
        monthly_df = monthly_df.drop(columns=['rank']); monthly_df["eq_weights"] = 1 / monthly_df.groupby('DecileRank')["Instrument"].transform('size'); monthly_df["me_weights"] = monthly_df["me"] / monthly_df.groupby('DecileRank')["me"].transform('sum'); monthly_df["me_weights"] = monthly_df["me_weights"].fillna(0); monthly_data_dict[month] = monthly_df
        for decile_rank, group_df in monthly_df.groupby('DecileRank'):
            if decile_rank in all_decile_dfs: all_decile_dfs[decile_rank].append(group_df)
    decile_portfolios = {j: pd.concat(all_decile_dfs[j], ignore_index=True) if all_decile_dfs[j] else pd.DataFrame() for j in range(10)}
    if not any(not df.empty for df in decile_portfolios.values()): print("  FEIL: Ingen desilporteføljer konstruert."); return (pd.DataFrame(),)*6

    print("  Beregner desil-metrikker...")
    decile_results = []; monthly_agg_data = {}
    for j in range(10):
        rank_df = decile_portfolios.get(j, pd.DataFrame());
        if rank_df.empty: continue
        rank_df['excess_return_stock_ew']=rank_df["excess_ret_t+1"]*rank_df["eq_weights"]; rank_df['excess_return_stock_vw']=rank_df["excess_ret_t+1"]*rank_df["me_weights"]
        rank_df['pred_excess_return_stock_ew']=rank_df["yhat_t+1"]*rank_df["eq_weights"]; rank_df['pred_excess_return_stock_vw']=rank_df["yhat_t+1"]*rank_df["me_weights"]
        rank_df['return_stock_ew']=rank_df["ret_t+1"]*rank_df["eq_weights"]; rank_df['return_stock_vw']=rank_df["ret_t+1"]*rank_df["me_weights"]
        monthly_rank_j = rank_df.groupby('MonthYear').agg(excess_return_portfolio_ew=('excess_return_stock_ew','sum'), excess_return_portfolio_vw=('excess_return_stock_vw','sum'), pred_excess_return_portfolio_ew=('pred_excess_return_stock_ew','sum'), pred_excess_return_portfolio_vw=('pred_excess_return_stock_vw','sum'), return_portfolio_ew=('return_stock_ew','sum'), return_portfolio_vw=('return_stock_vw','sum')).reset_index()
        monthly_rank_j['DecileRank'] = j; monthly_agg_data[j] = monthly_rank_j
        ew_mean_ret=monthly_rank_j["excess_return_portfolio_ew"].mean(); vw_mean_ret=monthly_rank_j["excess_return_portfolio_vw"].mean()
        ew_mean_pred=monthly_rank_j["pred_excess_return_portfolio_ew"].mean(); vw_mean_pred=monthly_rank_j["pred_excess_return_portfolio_vw"].mean()
        std_ew_raw=monthly_rank_j["return_portfolio_ew"].std(); std_vw_raw=monthly_rank_j["return_portfolio_vw"].std()
        sharpe_ew=(ew_mean_ret/std_ew_raw)*np.sqrt(12) if std_ew_raw>1e-9 else np.nan; sharpe_vw=(vw_mean_ret/std_vw_raw)*np.sqrt(12) if std_vw_raw>1e-9 else np.nan
        decile_results.append({'DecileRank':j,'ew_mean_pred':ew_mean_pred,'ew_mean_ret':ew_mean_ret,'std_ew_ret':std_ew_raw,'sharpe_ew':sharpe_ew,'vw_mean_pred':vw_mean_pred,'vw_mean_ret':vw_mean_ret,'std_vw_ret':std_vw_raw,'sharpe_vw':sharpe_vw})

    zeronet_monthly = pd.DataFrame(); hl_calculated = False
    if 0 in monthly_agg_data and 9 in monthly_agg_data and not monthly_agg_data[0].empty and not monthly_agg_data[9].empty:
        long_monthly=monthly_agg_data[9].set_index('MonthYear'); short_monthly=monthly_agg_data[0].set_index('MonthYear'); common_index=long_monthly.index.intersection(short_monthly.index)
        if not common_index.empty:
             zeronet_monthly = long_monthly.loc[common_index].subtract(short_monthly.loc[common_index], fill_value=0).rename(columns=lambda x: x+'_HL')
             if 'DecileRank_HL' in zeronet_monthly.columns: zeronet_monthly = zeronet_monthly.drop(columns=['DecileRank_HL'],errors='ignore')
             zeronet_monthly = zeronet_monthly.reset_index(); hl_calculated = True
             ew_mean_ret_hl=zeronet_monthly["excess_return_portfolio_ew_HL"].mean(); vw_mean_ret_hl=zeronet_monthly["excess_return_portfolio_vw_HL"].mean(); ew_mean_pred_hl=zeronet_monthly["pred_excess_return_portfolio_ew_HL"].mean(); vw_mean_pred_hl=zeronet_monthly["pred_excess_return_portfolio_vw_HL"].mean()
             std_ew_raw_hl=zeronet_monthly["return_portfolio_ew_HL"].std(); std_vw_raw_hl=zeronet_monthly["return_portfolio_vw_HL"].std()
             sharpe_ew_hl=(ew_mean_ret_hl/std_ew_raw_hl)*np.sqrt(12) if std_ew_raw_hl>1e-9 else np.nan; sharpe_vw_hl=(vw_mean_ret_hl/std_vw_raw_hl)*np.sqrt(12) if std_vw_raw_hl>1e-9 else np.nan
             decile_results.append({'DecileRank':'H-L','ew_mean_pred':ew_mean_pred_hl,'ew_mean_ret':ew_mean_ret_hl,'std_ew_ret':std_ew_raw_hl,'sharpe_ew':sharpe_ew_hl,'vw_mean_pred':vw_mean_pred_hl,'vw_mean_ret':vw_mean_ret_hl,'std_vw_ret':std_vw_raw_hl,'sharpe_vw':sharpe_vw_hl})
        else: print("  ADVARSEL: Ingen overlappende måneder for H-L.")
    else: print("  ADVARSEL: Kan ikke beregne H-L.");

    if not decile_results: print("  FEIL: Ingen desilresultater."); return (pd.DataFrame(),)*6
    results_summary_df = pd.DataFrame(decile_results).set_index('DecileRank')

    def format_decile_table(summary_df, weight_scheme):
        cols_map={'ew_mean_pred':'Pred','ew_mean_ret':'Avg','std_ew_ret':'SD','sharpe_ew':'SR'} if weight_scheme=='EW' else {'vw_mean_pred':'Pred','vw_mean_ret':'Avg','std_vw_ret':'SD','sharpe_vw':'SR'}
        sub_df=summary_df[[k for k in cols_map.keys() if k in summary_df.columns]].rename(columns=cols_map)
        for col in ['Pred','Avg','SD']:
            if col in sub_df.columns: sub_df[col]=sub_df[col]*100
        def map_index(x):
            if x==0: return 'Low (L)';
            if x==9: return 'High (H)';
            if x=='H-L': return 'H-L'
            try: return str(int(x)+1)
            except: return str(x)
        sub_df.index=sub_df.index.map(map_index); desired_order=['Low (L)','2','3','4','5','6','7','8','9','High (H)','H-L']; sub_df=sub_df.reindex([idx for idx in desired_order if idx in sub_df.index])
        return sub_df[[col for col in ['Pred','Avg','SD','SR'] if col in sub_df.columns]]

    ew_table_unform = format_decile_table(results_summary_df, 'EW'); vw_table_unform = format_decile_table(results_summary_df, 'VW')
    ew_table = ew_table_unform.copy(); vw_table = vw_table_unform.copy()
    for df_tbl in [ew_table, vw_table]:
        for col in ['Pred', 'Avg', 'SD']:
             if col in df_tbl.columns: df_tbl[col] = df_tbl[col].map('{:.2f}'.format).replace('nan','N/A')
        if 'SR' in df_tbl.columns: df_tbl['SR'] = df_tbl['SR'].map('{:.2f}'.format).replace('nan','N/A')

    print(f"\n--- Ytelsestabell ({model_name_label} Desiler) - EW ---"); print(ew_table);
    print(f"\n--- Ytelsestabell ({model_name_label} Desiler) - VW ---"); print(vw_table)

    print("\n--- Analyse av H-L Portefølje (Turnover, Drawdown, Risk-Adj. Perf.) ---")
    turnover_ew, turnover_vw, maxDD_ew, maxDD_vw, max_loss_ew, max_loss_vw = (np.nan,) * 6; hl_metrics_calculated = False
    if hl_calculated and not zeronet_monthly.empty and monthly_data_dict:
        all_months_weights = pd.concat(monthly_data_dict.values(), ignore_index=True)
        if 0 in all_months_weights['DecileRank'].unique() and 9 in all_months_weights['DecileRank'].unique():
             long_weights = all_months_weights[all_months_weights['DecileRank']==9][['MonthYear','Instrument','eq_weights','me_weights']]; short_weights = all_months_weights[all_months_weights['DecileRank']==0][['MonthYear','Instrument','eq_weights','me_weights']]; short_weights[['eq_weights','me_weights']]*=-1
             hl_weights = pd.concat([long_weights,short_weights]).sort_values(['Instrument','MonthYear']); hl_weights['eq_weights_lead1'] = hl_weights.groupby('Instrument')['eq_weights'].shift(-1).fillna(0); hl_weights['me_weights_lead1'] = hl_weights.groupby('Instrument')['me_weights'].shift(-1).fillna(0); hl_weights['trade_ew'] = abs(hl_weights['eq_weights_lead1']-hl_weights['eq_weights']); hl_weights['trade_vw'] = abs(hl_weights['me_weights_lead1']-hl_weights['me_weights']); last_month_hl = hl_weights['MonthYear'].max(); monthly_turnover = hl_weights[hl_weights['MonthYear'] != last_month_hl].groupby('MonthYear').agg(sum_trade_ew=('trade_ew','sum'),sum_trade_vw=('trade_vw','sum'))
             if not monthly_turnover.empty: turnover_ew=monthly_turnover['sum_trade_ew'].mean()/2; turnover_vw=monthly_turnover['sum_trade_vw'].mean()/2; print(f"  Avg Turnover (H-L): EW={turnover_ew*100:.2f}%, VW={turnover_vw*100:.2f}%")
        maxDD_ew=MDD(zeronet_monthly['excess_return_portfolio_ew_HL']); maxDD_vw=MDD(zeronet_monthly['excess_return_portfolio_vw_HL']); print(f"  Max Drawdown (H-L, Excess Ret): EW={abs(maxDD_ew)*100:.2f}%, VW={abs(maxDD_vw)*100:.2f}%")
        max_loss_ew=zeronet_monthly['excess_return_portfolio_ew_HL'].min(); max_loss_vw=zeronet_monthly['excess_return_portfolio_vw_HL'].min(); print(f"  Max 1M Loss (H-L, Excess Ret): EW={max_loss_ew*100:.2f}%, VW={max_loss_vw*100:.2f}%")
        hl_metrics_calculated = True
    else: print("  Kan ikke utføre H-L turnover/drawdown analyse.");

    benchmark_name = "OSEBX"; print(f"\n--- Sammenligning med Benchmark ({benchmark_name}) og Faktor Modell ---")
    alpha_ew, t_alpha_ew, r2_reg_ew = np.nan, np.nan, np.nan; alpha_vw, t_alpha_vw, r2_reg_vw = np.nan, np.nan, np.nan; factor_data_loaded = False; factors = None; bench_data_loaded = False; bench_zeronet = pd.DataFrame(); mean_ret_bench_pct, std_bench_raw_pct, sr_bench, mdd_bench_pct = (np.nan,) * 4

    # --- Load Factor Data (COMMENTED OUT) ---
    # ... (factor loading code - unchanged) ...
    # --- Load Benchmark Data (COMMENTED OUT) ---
    # ... (benchmark loading code - unchanged) ...
    # --- Factor Regression for H-L Portfolio (COMMENTED OUT) ---
    # ... (factor regression code - unchanged) ...

    if hl_metrics_calculated:
        hl_res = results_summary_df.loc['H-L'] if 'H-L' in results_summary_df.index else pd.Series(dtype=float)
        index_perf_hl = ["Mean Excess Return [%]", 'Std Dev (Raw) [%]', "Ann. Sharpe Ratio", "Max Drawdown [%]", "Avg Monthly Turnover [%]", "FF5+Mom Alpha [%]", "t(Alpha)", "FF5+Mom Adj R2"]
        ew_chart_hl_data = {f'{model_name_label} H-L': [hl_res.get('ew_mean_ret', np.nan) * 100, hl_res.get('std_ew_ret', np.nan) * 100, hl_res.get('sharpe_ew', np.nan), abs(maxDD_ew) * 100 if pd.notna(maxDD_ew) else np.nan, turnover_ew * 100 if pd.notna(turnover_ew) else np.nan, alpha_ew, t_alpha_ew, r2_reg_ew]}
        vw_chart_hl_data = {f'{model_name_label} H-L': [hl_res.get('vw_mean_ret', np.nan) * 100, hl_res.get('std_vw_ret', np.nan) * 100, hl_res.get('sharpe_vw', np.nan), abs(maxDD_vw) * 100 if pd.notna(maxDD_vw) else np.nan, turnover_vw * 100 if pd.notna(turnover_vw) else np.nan, alpha_vw, t_alpha_vw, r2_reg_vw]}
        if bench_data_loaded:
            bench_col_data = [mean_ret_bench_pct, std_bench_raw_pct, sr_bench, mdd_bench_pct, 0, np.nan, np.nan, np.nan]; ew_chart_hl_data[benchmark_name] = bench_col_data; vw_chart_hl_data[benchmark_name] = bench_col_data
            ew_chart_hl = pd.DataFrame(ew_chart_hl_data, index=index_perf_hl)[[benchmark_name, f'{model_name_label} H-L']]; vw_chart_hl = pd.DataFrame(vw_chart_hl_data, index=index_perf_hl)[[benchmark_name, f'{model_name_label} H-L']]
        else: ew_chart_hl = pd.DataFrame(ew_chart_hl_data, index=index_perf_hl); vw_chart_hl = pd.DataFrame(vw_chart_hl_data, index=index_perf_hl)
        print(f"\n--- Risk-Adjusted Performance (H-L vs {benchmark_name}, EW) ---"); print(ew_chart_hl.round(3)); print(f"\n--- Risk-Adjusted Performance (H-L vs {benchmark_name}, VW) ---"); print(vw_chart_hl.round(3))

    print("\n--- Analyse av Long-Only (Topp Desil) Portefølje ---")
    rank_9_df = decile_portfolios.get(9, pd.DataFrame()); turnover_ew_long, turnover_vw_long, maxDD_ew_long, maxDD_vw_long, max_loss_ew_long, max_loss_vw_long = (np.nan,) * 6; long_metrics_calculated = False; alpha_long_ew, t_alpha_long_ew, r2_reg_long_ew = np.nan, np.nan, np.nan; alpha_long_vw, t_alpha_long_vw, r2_reg_long_vw = np.nan, np.nan, np.nan
    if not rank_9_df.empty:
        long_weights = rank_9_df[['MonthYear','Instrument','eq_weights','me_weights']].copy().sort_values(['Instrument','MonthYear']); long_weights['eq_weights_lead1'] = long_weights.groupby('Instrument')['eq_weights'].shift(-1).fillna(0); long_weights['me_weights_lead1'] = long_weights.groupby('Instrument')['me_weights'].shift(-1).fillna(0); long_weights['trade_ew'] = abs(long_weights['eq_weights_lead1']-long_weights['eq_weights']); long_weights['trade_vw'] = abs(long_weights['me_weights_lead1']-long_weights['me_weights']); last_month_long = long_weights['MonthYear'].max(); monthly_turnover_long=long_weights[long_weights['MonthYear']!=last_month_long].groupby('MonthYear').agg(sum_trade_ew=('trade_ew','sum'),sum_trade_vw=('trade_vw','sum'))
        if not monthly_turnover_long.empty: turnover_ew_long=monthly_turnover_long['sum_trade_ew'].mean()/2; turnover_vw_long=monthly_turnover_long['sum_trade_vw'].mean()/2; print(f"  Avg Turnover (Long Only): EW={turnover_ew_long*100:.2f}%, VW={turnover_vw_long*100:.2f}%")
        long_monthly_agg = monthly_agg_data.get(9, pd.DataFrame())
        if not long_monthly_agg.empty:
            maxDD_ew_long=MDD(long_monthly_agg["excess_return_portfolio_ew"]); maxDD_vw_long=MDD(long_monthly_agg["excess_return_portfolio_vw"]); print(f"  Max Drawdown (Long Only, Excess Ret): EW={abs(maxDD_ew_long)*100:.2f}%, VW={abs(maxDD_vw_long)*100:.2f}%")
            max_loss_ew_long=long_monthly_agg["excess_return_portfolio_ew"].min(); max_loss_vw_long=long_monthly_agg["excess_return_portfolio_vw"].min(); print(f"  Max 1M Loss (Long Only, Excess Ret): EW={max_loss_ew_long*100:.2f}%, VW={max_loss_vw_long*100:.2f}%")
            long_metrics_calculated = True
            # --- Factor Regression for Long-Only Portfolio (COMMENTED OUT) ---
            # ... (factor regression code for long-only - unchanged) ...
            if long_metrics_calculated:
                long_res = results_summary_df.loc[9] if 9 in results_summary_df.index else pd.Series(dtype=float)
                index_perf_long = ["Mean Excess Return [%]", 'Std Dev (Raw) [%]', "Ann. Sharpe Ratio", "Max Drawdown [%]", "Avg Monthly Turnover [%]", "FF5+Mom Alpha [%]", "t(Alpha)", "FF5+Mom Adj R2"]
                ew_chart_long_data = {f'{model_name_label} Long': [long_res.get('ew_mean_ret', np.nan) * 100, long_res.get('std_ew_ret', np.nan) * 100, long_res.get('sharpe_ew', np.nan), abs(maxDD_ew_long) * 100 if pd.notna(maxDD_ew_long) else np.nan, turnover_ew_long * 100 if pd.notna(turnover_ew_long) else np.nan, alpha_long_ew, t_alpha_long_ew, r2_reg_long_ew]}
                vw_chart_long_data = {f'{model_name_label} Long': [long_res.get('vw_mean_ret', np.nan) * 100, long_res.get('std_vw_ret', np.nan) * 100, long_res.get('sharpe_vw', np.nan), abs(maxDD_vw_long) * 100 if pd.notna(maxDD_vw_long) else np.nan, turnover_vw_long * 100 if pd.notna(turnover_vw_long) else np.nan, alpha_long_vw, t_alpha_long_vw, r2_reg_long_vw]}
                if bench_data_loaded:
                    bench_col_data_long = [mean_ret_bench_pct, std_bench_raw_pct, sr_bench, mdd_bench_pct, 0, np.nan, np.nan, np.nan]; ew_chart_long_data[benchmark_name] = bench_col_data_long; vw_chart_long_data[benchmark_name] = bench_col_data_long
                    ew_chart_long = pd.DataFrame(ew_chart_long_data, index=index_perf_long)[[benchmark_name, f'{model_name_label} Long']]; vw_chart_long = pd.DataFrame(vw_chart_long_data, index=index_perf_long)[[benchmark_name, f'{model_name_label} Long']]
                else: ew_chart_long = pd.DataFrame(ew_chart_long_data, index=index_perf_long); vw_chart_long = pd.DataFrame(vw_chart_long_data, index=index_perf_long)
                print(f"\n--- Risk-Adjusted Performance (Long Only vs {benchmark_name}, EW) ---"); print(ew_chart_long.round(3)); print(f"\n--- Risk-Adjusted Performance (Long Only vs {benchmark_name}, VW) ---"); print(vw_chart_long.round(3))
    else: print("  Ingen data for Long-Only porteføljeanalyse.")

    print("\n--- Genererer kumulative avkastningsplott (Excess Returns) ---")
    plot_data = {};
    for j in [0, 9]:
        if j in monthly_agg_data and not monthly_agg_data[j].empty:
            df_agg=monthly_agg_data[j].set_index('MonthYear').sort_index();
            if not df_agg.empty: df_agg[f'cum_ret_ew_{j}']=(1+df_agg['excess_return_portfolio_ew']).cumprod()-1; df_agg[f'cum_ret_vw_{j}']=(1+df_agg['excess_return_portfolio_vw']).cumprod()-1; plot_data[j]=df_agg
    # if bench_data_loaded and not bench_zeronet.empty: bench_plot=bench_zeronet.set_index('MonthYear').sort_index(); bench_plot['cum_ret_bench']=(1+bench_plot['monthly_excess_return_bench']).cumprod()-1; plot_data['benchmark']=bench_plot
    def plot_cumulative_returns(plot_data_dict, weight_scheme, model_label, benchmark_label):
        fig, ax = plt.subplots(figsize=(15, 7)); legend_items=[]
        if 9 in plot_data_dict: col_name=f'cum_ret_{weight_scheme.lower()}_9'; plot_data_dict[9][col_name].plot(ax=ax, label=f'{model_label} Long {weight_scheme.upper()}'); legend_items.append(f'{model_label} Long {weight_scheme.upper()}')
        if 0 in plot_data_dict: col_name=f'cum_ret_{weight_scheme.lower()}_0'; plot_data_dict[0][col_name].plot(ax=ax, label=f'{model_label} Short {weight_scheme.upper()}', linestyle=':'); legend_items.append(f'{model_label} Short {weight_scheme.upper()}')
        # if 'benchmark' in plot_data_dict: plot_data_dict['benchmark']['cum_ret_bench'].plot(ax=ax, label=benchmark_label, linestyle='--', color='grey'); legend_items.append(benchmark_label)
        if legend_items: ax.set_title(f'Kumulativ Excess Avkastning ({weight_scheme.upper()} Vektet)'); ax.set_ylabel('Kumulativ Excess Avkastning'); ax.set_xlabel('Måned'); ax.legend(legend_items); ax.grid(True); fig.tight_layout(); plt.show()
        else: plt.close(fig); print(f"Ingen {weight_scheme.upper()} plottdata tilgjengelig.")
    plot_cumulative_returns(plot_data, 'EW', model_name_label, benchmark_name); plot_cumulative_returns(plot_data, 'VW', model_name_label, benchmark_name)

    print("--- Detaljert Porteføljeanalyse Fullført ---")
    return ew_table, vw_table, ew_chart_hl, vw_chart_hl, ew_chart_long, vw_chart_long


# Step 6.7: Variable Importance Function (Adapted for GBRT)
def calculate_variable_importance_single_window(model_params, X_eval, y_eval, features, base_r2):
    """ Calculates permutation importance for GBRT-H for ONE window by retraining. """
    importance_results = {}
    ss_tot_eval = np.sum((y_eval - y_eval.mean())**2)
    if ss_tot_eval < 1e-15: return pd.DataFrame({'Feature': features, 'Importance': 0.0})

    if not model_params: print("  ADVARSEL (VI): Modellparametre mangler."); return pd.DataFrame({'Feature': features, 'Importance': 0.0})

    # Ensure correct fixed settings for GBRT VI runs
    current_model_params = model_params.copy()
    current_model_params['loss'] = 'huber'
    current_model_params['alpha'] = 0.999 # Fix alpha as per main model
    current_model_params['random_state'] = 42 # Consistent state

    for feature_idx, feature_name in enumerate(features):
        X_eval_permuted = X_eval.copy(); X_eval_permuted[:, feature_idx] = 0
        try:
            # Re-train GBRT with zeroed feature using same optimal params
            permuted_model = GradientBoostingRegressor(**current_model_params)
            permuted_model.fit(X_eval_permuted, y_eval)
            permuted_preds = permuted_model.predict(X_eval_permuted)
            if not np.all(np.isfinite(permuted_preds)): permuted_r2 = np.nan
            else: ss_res_permuted = np.sum((y_eval - permuted_preds)**2); permuted_r2 = 1 - (ss_res_permuted / ss_tot_eval)
            r2_reduction = base_r2 - permuted_r2
            importance_results[feature_name] = max(0, r2_reduction) if pd.notna(r2_reduction) else 0.0
        except Exception as e: importance_results[feature_name] = 0.0

    if not importance_results: return pd.DataFrame({'Feature': features, 'Importance': 0.0})
    imp_df = pd.DataFrame(importance_results.items(), columns=['Feature', 'R2_reduction'])
    total_reduction = imp_df['R2_reduction'].sum()
    imp_df['Importance'] = imp_df['R2_reduction'] / total_reduction if total_reduction > 1e-9 else 0.0
    return imp_df[['Feature', 'Importance']]


# Step 6.8 Plot Time-Varying Complexity (Adapted for Max Depth)
def plot_time_varying_complexity(model_metrics, model_name='GBRT-H'):
     """ Plots the optimal max_depth over time for GBRT-H """
     print(f"\n--- Plotter Tidsvarierende Modellkompleksitet (Optimal Max Depth for {model_name}) ---")
     complexity_param = 'optim_max_depth' # Parameter to plot

     if model_name in model_metrics and complexity_param in model_metrics[model_name]:
         values = model_metrics[model_name][complexity_param]
         if values and not all(np.isnan(d) for d in values):
             valid_values = [(i + 1, int(v)) for i, v in enumerate(values) if pd.notna(v)]
             if valid_values:
                 windows, plot_values = zip(*valid_values); data = pd.DataFrame({complexity_param: plot_values}, index=pd.Index(windows, name='Window'))
                 print(f"\n--- Optimal {complexity_param} per Vindu Tabell ({model_name}) ---"); print(data)
                 plt.figure(figsize=(10, 5)); plt.plot(windows, plot_values, marker='o', linestyle='-')
                 plot_title = f"{model_name} Optimal Max Depth per Rullerende Vindu"; y_label = f"Optimal Max Depth"
                 plt.xlabel("Rullerende Vindu Nummer"); plt.ylabel(y_label); plt.title(plot_title); plt.grid(True); plt.tight_layout(); plt.show()
             else: print(f"  Ingen gyldige verdier funnet for '{complexity_param}' for {model_name}.")
         else: print(f"  Ingen data funnet for '{complexity_param}' for {model_name}.")
     else: print(f"  Metrikk '{complexity_param}' ikke funnet for {model_name}.")


# Step 7: Analyze Prespecified Portfolios (Placeholder)
def analyze_prespecified_portfolios(results_df, original_df_subset, portfolio_definitions_file=None, model_name_label="GBRT-H"): # Default Label Changed
    """ Placeholder function for analyzing prespecified portfolios. Needs implementation. """
    print("\n--- Starter Analyse av Prespesifiserte Porteføljer (Placeholder) ---")
    # ... (rest of placeholder logic - unchanged) ...
    pred_col = f'yhat_{model_name_label.lower().replace("-","_").replace("+","h")}'
    portfolio_r2_results = pd.DataFrame(); market_timing_results = pd.DataFrame()
    if portfolio_definitions_file is None or not os.path.exists(portfolio_definitions_file):
        print("  ADVARSEL: Fil med porteføljedefinisjoner mangler. Hopper over."); return portfolio_r2_results, market_timing_results
    # --- IMPLEMENTATION NEEDED ---
    print("--- Analyse av Prespesifiserte Porteføljer Fullført (Placeholder) ---")
    return portfolio_r2_results, market_timing_results


# Step 8: Main Orchestration Function (Adapted for GBRT-H)
INITIAL_TRAIN_YEARS_DEFAULT = 9
VALIDATION_YEARS_DEFAULT = 6
TEST_YEARS_PER_WINDOW_DEFAULT = 1

def run_analysis_for_subset(file_path,
                            data_subset='all',
                            benchmark_file=None,
                            ff_factor_file=None,
                            portfolio_defs_file=None,
                            filter_portfolio_construction=False,
                            top_n=1000,
                            bottom_n=1000,
                            initial_train_years=INITIAL_TRAIN_YEARS_DEFAULT,
                            val_years=VALIDATION_YEARS_DEFAULT,
                            test_years=TEST_YEARS_PER_WINDOW_DEFAULT
                           ):
    """ Runs the full pipeline for one data subset using GBRT-H with YEARLY refitting. """
    run_label = data_subset.capitalize()
    model_name = 'GBRT-H' # *** MODEL NAME SET ***
    start_time = datetime.datetime.now(); print(f"\n{'='*20} Starter Kjøring: {model_name} for '{run_label}' Firms (ÅRLIG Refitting) {'='*20}")
    if data_subset == 'big': print(f"(Definert som Topp {top_n} basert på MarketCap per måned)")
    if data_subset == 'small': print(f"(Definert som Bunn {bottom_n} basert på MarketCap per måned)")
    print(f"Starttid: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # --- Load & Subset ---
    print("\n--- Steg 1: Laster og Forbereder Rådata ---"); df_raw = load_prepare_data(file_path)
    if df_raw is None: return np.nan, None, None, (None,)*6, (None, None)
    if 'MarketCap_orig' not in df_raw.columns: print("FEIL: MarketCap_orig mangler."); return np.nan, None, None, (None,)*6, (None, None)
    if 'Date' not in df_raw.columns: print("FEIL: Date mangler."); return np.nan, None, None, (None,)*6, (None, None)

    print(f"\n--- Steg 1.1: Lager Subset: {run_label} ---"); df = pd.DataFrame(); df_raw_mc = df_raw.dropna(subset=['MarketCap_orig', 'Date'])
    if data_subset == 'all': df = df_raw.copy()
    elif data_subset == 'big': df_raw_mc['MonthYear'] = df_raw_mc['Date'].dt.to_period('M'); df = df_raw_mc.groupby('MonthYear', group_keys=False).apply(lambda x: x.nlargest(top_n, "MarketCap_orig")); df = df.drop(columns=['MonthYear'])
    elif data_subset == 'small': df_raw_mc['MonthYear'] = df_raw_mc['Date'].dt.to_period('M'); df = df_raw_mc.groupby('MonthYear', group_keys=False).apply(lambda x: x.nsmallest(bottom_n, "MarketCap_orig")); df = df.drop(columns=['MonthYear'])
    else: print(f"FEIL: Ukjent subset '{data_subset}'."); return np.nan, None, None, (None,)*6, (None, None)
    if df.empty: print(f"FEIL: Subset '{run_label}' er tomt."); return np.nan, None, None, (None,)*6, (None, None)
    print(f"Subset '{run_label}' initiell form: {df.shape}")

    # --- Define Features, Standardize & Clean ---
    print("\n--- Steg 2 & 1.5: Definerer Features og Rank Standardiserer ---"); gbrt_features = define_features(df)
    if not gbrt_features: print(f"FEIL: Ingen features funnet i subset '{run_label}'."); return np.nan, None, None, (None,)*6, (None, None)
    df = rank_standardize_features(df, gbrt_features)
    print("\n--- Steg 3: Renser Data (Missing/Inf/Filters) ---"); essential_cols = ['TargetReturn_t', 'NextMonthlyReturn_t+1', 'MarketCap_orig', 'Date', 'Instrument']
    df = clean_data(df, gbrt_features, essential_cols, target="TargetReturn_t")
    if df.empty: print(f"FEIL: DataFrame er tom etter rensing for subset '{run_label}'."); return np.nan, None, None, (None,)*6, (None, None)
    gbrt_features = [f for f in gbrt_features if f in df.columns and df[f].nunique() > 1 and df[f].std() > 1e-9]; # Refresh features
    if not gbrt_features: print("FEIL: Ingen features igjen etter datarensing."); return np.nan, None, None, (None,)*6, (None, None)
    df = df.sort_values(["Date", "Instrument"]).reset_index(drop=True)

    # --- Yearly Rolling Window Setup ---
    print(f"\n--- Steg 4: Setter opp ÅRLIG Rullerende Vindu (InitTrain={initial_train_years}, Val={val_years}, Test={test_years}) ---"); results_list = []; model_metrics = defaultdict(lambda: defaultdict(list)); variable_importance_scores_all_windows = []; yhat_col_name = f'yhat_{model_name.lower().replace("-", "_").replace("+","h")}'
    try: splits_generator = get_yearly_rolling_splits(df, initial_train_years, val_years, test_years); splits = list(splits_generator); num_windows = len(splits)
    except ValueError as e: print(f"FEIL ved generering av årlige vinduer: {e}"); if 'Year' in df.columns: df.drop(columns=['Year'], inplace=True, errors='ignore'); return np.nan, None, model_metrics, (None,)*6, (None, None)
    print(f"Antall årlige rullerende vinduer som skal kjøres: {num_windows}\n");
    if num_windows == 0: print("Ingen årlige vinduer å kjøre. Avslutter."); return np.nan, None, model_metrics, (None,)*6, (None, None)

    # --- Rolling Window Loop ---
    print(f"--- Starter {model_name} ÅRLIG Rullerende Vindu Trening & Prediksjon ---"); total_vi_time = 0
    for window, (train_idx, val_idx, test_idx, train_dates, val_dates, test_dates) in enumerate(splits):
        window_start_time = datetime.datetime.now(); window_num = window + 1; print("-" * 60)
        if test_idx.empty or val_idx.empty or train_idx.empty: print(f"Vindu {window_num}: Tomt train/val/test. Hopper over."); model_metrics[model_name]['oos_r2'].append(np.nan); model_metrics[model_name]['optim_max_depth'].append(np.nan); model_metrics[model_name]['is_r2_train_val'].append(np.nan); continue
        X_train=df.loc[train_idx, gbrt_features].values; y_train=df.loc[train_idx, "TargetReturn_t"].values; X_val=df.loc[val_idx, gbrt_features].values; y_val=df.loc[val_idx, "TargetReturn_t"].values; X_test=df.loc[test_idx, gbrt_features].values; y_test=df.loc[test_idx, "TargetReturn_t"].values; X_train_val=np.vstack((X_train, X_val)); y_train_val=np.concatenate((y_train, y_val))
        if np.isnan(y_train).all() or np.isnan(y_val).all() or X_train.shape[0]<2 or X_val.shape[0]<2 or np.nanstd(y_train) < 1e-9: print(f"Vindu {window_num}: Utilstrekkelig data/varians. Hopper over."); model_metrics[model_name]['oos_r2'].append(np.nan); model_metrics[model_name]['optim_max_depth'].append(np.nan); model_metrics[model_name]['is_r2_train_val'].append(np.nan); continue

        # *** Define GBRT parameter grid for this window ***
        gbrt_param_grid = { 'n_estimators': [100], 'learning_rate': [0.1, 0.01], 'max_depth': [1, 2], 'min_samples_split': [1000, 5000], 'min_samples_leaf': [500, 1000], 'max_features': ['sqrt'] }

        # *** Call the GBRT-H function ***
        trained_model, preds_oos, r2_oos, mse_oos, sharpe_oos, preds_is, r2_is, optim_depth, optim_params = run_gbrt_h_on_window( X_train, y_train, X_val, y_val, X_test, y_test, param_grid=gbrt_param_grid )

        # --- Store Results ---
        if test_idx.shape[0] > 0 and preds_oos is not None: window_predictions = {'Date': df.loc[test_idx, 'Date'].values, 'Instrument': df.loc[test_idx, 'Instrument'].values, 'TargetReturn_t': y_test, yhat_col_name: preds_oos}; results_list.append(pd.DataFrame(window_predictions))
        model_metrics[model_name]['oos_r2'].append(r2_oos); model_metrics[model_name]['optim_max_depth'].append(optim_depth); model_metrics[model_name]['is_r2_train_val'].append(r2_is)

        # --- Variable Importance ---
        if trained_model is not None and optim_params is not None:
            print(f"  Beregner VI for Vindu {window_num} (GBRT)...") # Keep less verbose
            vi_start_time = time.time(); window_vi_df = calculate_variable_importance_single_window(optim_params, X_train_val, y_train_val, gbrt_features, r2_is);
            if window_vi_df is not None and not window_vi_df.empty: variable_importance_scores_all_windows.append(window_vi_df); total_vi_time += time.time() - vi_start_time; print(f"    VI beregnet på {time.time() - vi_start_time:.2f}s.")
        # elif trained_model is None: print("    ADVARSEL: Hovedmodell feilet, hopper over VI.")
        # elif optim_params is None: print("    ADVARSEL: Optimal parametre ikke funnet, hopper over VI.") # Less verbose

        print(f"  Vindu {window_num} fullført på {(datetime.datetime.now() - window_start_time).total_seconds():.1f}s. Opt Depth: {optim_depth}, Win OOS R2: {r2_oos:.4f}")

    # --- Aggregate Results & Overall Analysis ---
    if not results_list: print(f"\nFEIL: Ingen OOS resultater for {run_label}."); return np.nan, None, model_metrics, (None,)*6, (None, None)
    results_df = pd.concat(results_list).reset_index(drop=True); print(f"\n--- Samlet Resultatanalyse ({run_label}, {model_name}, Årlig Refit) ---")
    y_true_all = results_df['TargetReturn_t']; y_pred_all = results_df[yhat_col_name]; valid_idx_all = y_true_all.notna() & y_pred_all.notna() & np.isfinite(y_true_all) & np.isfinite(y_pred_all); y_t_valid_all = y_true_all[valid_idx_all]; y_p_valid_all = y_pred_all[valid_idx_all]; R2OOS_overall = np.nan
    if len(y_t_valid_all) > 1: ss_res_all = np.sum((y_t_valid_all - y_p_valid_all)**2); ss_true_sq_all = np.sum(y_t_valid_all**2);
    if ss_true_sq_all > 1e-15: R2OOS_overall = 1 - (ss_res_all / ss_true_sq_all)
    avg_yearly_window_r2 = np.nanmean(model_metrics[model_name]['oos_r2']); print(f"Overall OOS R² ({run_label}, Gu et al. Def): {R2OOS_overall:.6f}"); print(f"Average Yearly Window OOS R² ({run_label}):  {avg_yearly_window_r2:.6f}"); model_metrics[model_name]['oos_r2_overall_gu'] = R2OOS_overall

    # --- Detailed Portfolio Analysis ---
    portfolio_tables = perform_detailed_portfolio_analysis(results_df, df, benchmark_file=benchmark_file, ff_factor_file=ff_factor_file, filter_small_caps=filter_portfolio_construction, model_name_label=model_name)

    # --- Analyze Prespecified Portfolios ---
    prespec_r2_table, prespec_timing_table = analyze_prespecified_portfolios(results_df, df, portfolio_definitions_file=portfolio_defs_file, model_name_label=model_name)

    # --- Aggregate and Plot Variable Importance ---
    averaged_vi_df = None
    if variable_importance_scores_all_windows:
         print(f"\n--- Aggregerer Variabel Viktighet over {len(variable_importance_scores_all_windows)} ÅRLIGE vinduer --- (Total VI tid: {total_vi_time:.1f}s)")
         all_vi_df = pd.concat(variable_importance_scores_all_windows); averaged_vi_df = all_vi_df.groupby('Feature')['Importance'].mean().reset_index(); total_avg_importance = averaged_vi_df['Importance'].sum(); averaged_vi_df['Importance'] = averaged_vi_df['Importance'] / total_avg_importance if total_avg_importance > 1e-9 else 0.0; averaged_vi_df = averaged_vi_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
         print(f"\n--- Gjennomsnittlig Variabel Viktighet ({model_name}, Top 10) ---"); print(averaged_vi_df[['Feature', 'Importance']].head(10).round(4))
         plt.figure(figsize=(12, 8)); top_n_plot = 20; plot_df = averaged_vi_df.head(top_n_plot).sort_values(by='Importance', ascending=True); plt.barh(plot_df['Feature'], plot_df['Importance']); plt.xlabel("Gj.snitt Relativ Viktighet"); plt.ylabel("Feature"); plt.title(f'{model_name} Gj.snitt Variabel Viktighet (Top {top_n_plot})'); plt.tight_layout(); plt.show()
    else: print("\nIngen VI data samlet.")

    # --- Plot Time-Varying Complexity (Max Depth) ---
    plot_time_varying_complexity(model_metrics, model_name)

    # --- Final Summary ---
    end_time = datetime.datetime.now(); print(f"\n--- Kjøring ({run_label}, {model_name}, Årlig Refit) fullført ---"); print(f"Sluttid: {end_time.strftime('%Y-%m-%d %H:%M:%S')} (Total tid: {(end_time - start_time)})"); print(f"{'='*70}")
    return R2OOS_overall, averaged_vi_df, model_metrics, portfolio_tables, (prespec_r2_table, prespec_timing_table)


# --------------------------------------------------------------------------
# Main Execution Block (MODIFIED for GBRT-H)
# --------------------------------------------------------------------------
if __name__ == "__main__":
    # --- CONFIGURATION ---
    data_file = "Cleaned_OSEFX_Market_Macro_Data.csv" # <--- SET YOUR INPUT DATA FILE

    # --- Yearly Split Parameters ---
    INITIAL_TRAIN_YEARS = 9
    VALIDATION_YEARS = 6
    TEST_YEARS_PER_WINDOW = 1

    # --- Optional External Data Files ---
    benchmark_csv_file = None
    ff_factor_csv_file = None
    portfolio_defs_csv_file = None

    # --- Analysis Settings ---
    filter_small_caps_portfolio = False
    TOP_N_FIRMS = 1000
    BOTTOM_N_FIRMS = 1000

    # --- Output Directory ---
    output_dir = "GBRT_H_Results_YearlyRefit" # *** CHANGED Directory Name ***
    if not os.path.exists(output_dir):
        os.makedirs(output_dir); print(f"Opprettet mappe: {output_dir}")

    # --- Check File Existence ---
    if not os.path.exists(data_file): print(f"FEIL: Input datafil ikke funnet på '{data_file}'. Avslutter."); exit()
    if benchmark_csv_file and not os.path.exists(benchmark_csv_file): print(f"ADVARSEL: Benchmark fil ikke funnet. Benchmark-sammenligning deaktivert."); benchmark_csv_file = None
    if ff_factor_csv_file and not os.path.exists(ff_factor_csv_file): print(f"ADVARSEL: FF+Mom faktorfil ikke funnet. Faktorregresjoner deaktivert."); ff_factor_csv_file = None
    if portfolio_defs_csv_file and not os.path.exists(portfolio_defs_csv_file): print(f"ADVARSEL: Porteføljedefinisjonsfil ikke funnet. Prespesifisert analyse deaktivert."); portfolio_defs_csv_file = None

    # --- RUN ANALYSIS FOR EACH SUBSET ---
    results_r2_summary = {}; overall_variable_importance_df = None; all_subset_metrics = {}; all_portfolio_results_tables = {}; all_prespecified_results = {}
    subsets_to_run = ['all', 'big', 'small']
    model_run_name = 'GBRT-H' # *** Set model name for output ***

    for subset in subsets_to_run:
        r2, vi_df_avg_run, metrics_run, port_tables_run, prespec_tables_run = run_analysis_for_subset(
            file_path=data_file, data_subset=subset, benchmark_file=benchmark_csv_file, ff_factor_file=ff_factor_csv_file, portfolio_defs_file=portfolio_defs_csv_file,
            filter_portfolio_construction=filter_small_caps_portfolio, top_n=TOP_N_FIRMS, bottom_n=BOTTOM_N_FIRMS,
            initial_train_years=INITIAL_TRAIN_YEARS, val_years=VALIDATION_YEARS, test_years=TEST_YEARS_PER_WINDOW
        )
        results_r2_summary[subset] = r2; all_subset_metrics[subset] = metrics_run; all_portfolio_results_tables[subset] = port_tables_run; all_prespecified_results[subset] = prespec_tables_run
        if subset == 'all' and vi_df_avg_run is not None: overall_variable_importance_df = vi_df_avg_run

    # --- Print Final R2 Summary Table ---
    print("\n\n" + "="*30 + f" Final OOS R2 Summary ({model_run_name} - Yearly Refit) " + "="*30)
    summary_r2_data = { "Full Sample (all)": [results_r2_summary.get('all', np.nan)], f"Large Firms (Top {TOP_N_FIRMS})": [results_r2_summary.get('big', np.nan)], f"Small Firms (Bottom {BOTTOM_N_FIRMS})": [results_r2_summary.get('small', np.nan)] }
    r2_summary_df = pd.DataFrame.from_dict(summary_r2_data, orient='index', columns=[f'{model_run_name} R2oos (%)']); r2_summary_df[f'{model_run_name} R2oos (%)'] *= 100
    print(r2_summary_df.round(4))
    try: r2_summary_filename = os.path.join(output_dir, f"{model_run_name.lower().replace('+','h')}_R2oos_summary_subsets_yearly.csv"); r2_summary_df.to_csv(r2_summary_filename); print(f" -> R2 Sammendrag lagret til {r2_summary_filename}")
    except Exception as e: print(f"  FEIL ved lagring av R2 Sammendrag: {e}")
    print("="*78)

    # --- Save Averaged VI results ---
    if overall_variable_importance_df is not None:
         print("\nLagrer Gjennomsnittlig Variabel Viktighet resultater...")
         try: vi_filename = os.path.join(output_dir, f"{model_run_name.lower().replace('+','h')}_variable_importance_averaged_yearly.csv"); overall_variable_importance_df.to_csv(vi_filename, index=False); print(f" -> Gjennomsnittlig Variabel Viktighet lagret til {vi_filename}")
         except Exception as e: print(f"  FEIL ved lagring av VI: {e}")

    # --- Save Portfolio Decile Tables and Risk Tables ---
    print("\nLagrer Portefølje Desil Tabeller...")
    for subset, tables in all_portfolio_results_tables.items():
         if tables and len(tables) == 6:
              names = ['decile_ew', 'decile_vw', 'hl_risk_ew', 'hl_risk_vw', 'long_risk_ew', 'long_risk_vw']
              for i, table_df in enumerate(tables):
                   if table_df is not None and not table_df.empty:
                        filename = os.path.join(output_dir, f"{model_run_name.lower().replace('+','h')}_portfolio_{subset}_{names[i]}_yearly.csv")
                        try: table_df.to_csv(filename, float_format='%.4f'); print(f" -> Porteføljetabell lagret til {filename}")
                        except Exception as e: print(f"  FEIL ved lagring av porteføljetabell {filename}: {e}")

    # --- Save Prespecified Portfolio Results (Optional) ---
    # print("\nLagrer Prespesifiserte Portefølje Resultater...")
    # ...

    print(f"\nFull {model_run_name} analyse (Årlig Refitting) fullført.")

SyntaxError: invalid syntax (3237511281.py, line 587)