<a href="https://colab.research.google.com/github/sabire113/Master/blob/main/Master_Lin%C3%A6r.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

25. Februar:
Feilsøking og revidert vesjon av Sabires Ols.ipynb


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Step 1: Load and Clean Data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)

    # Konverter til datetime og sorter for å bevare tidsrekkefølgen
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Fjern ikke-numeriske kolonner (f.eks. Ticker)
    if 'Ticker' in df.columns:
        df.drop(columns=['Ticker'], inplace=True)

    # Eventuelt filtrer for minimum markedsverdi (kan avkommenteres)
    # df = df[df['MarketCap'] >= 2e9]

    # Beregn nye variabler
    df['Excess_Return'] = df['OSEBXReturns'] - df['NorgesBank10Y']
    df['Size'] = np.log(df['MarketCap'])
    df['BM'] = (df['BookValuePerShare'] * df['CommonSharesOutstanding']) / df['MarketCap']
    df = df[df['BM'] > 0]
    df['BM'] = np.log(df['BM'])

    # Opprett momentumvariabel og fjern den opprinnelige for å unngå duplisering
    df['Mom12m'] = df['Momentum_12M']
    df.drop(columns=['Momentum_12M'], inplace=True)

    # Håndter manglende verdier (her droppes alle rader, men alternativ imputering kan vurderes)
    df.dropna(inplace=True)
    return df

# Step 2: Drop Highly Correlated Features
def drop_highly_correlated_features(X, threshold=0.99, keep_features=['Mom12m']):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold) and column not in keep_features]
    if to_drop:
        print(f"Dropping highly correlated features: {to_drop}")
        X = X.drop(columns=to_drop)
    return X

# Step 3: Remove High VIF Features
def remove_high_vif_features(X, threshold=10, keep_features=['Mom12m']):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    high_vif_features = vif_data[vif_data["VIF"] > threshold]["Feature"].tolist()
    high_vif_features = [feature for feature in high_vif_features if feature not in keep_features]
    if high_vif_features:
        print(f"Removing high VIF features: {high_vif_features}")
        X = X.drop(columns=high_vif_features)
    return X

# Step 4: Split Data (Stock-Level OLS)
def split_stock_level_data(df):
    X = df.drop(columns=['Excess_Return', 'Date'])
    y = df['Excess_Return']

    # Bruk kun numeriske kolonner
    X = X.select_dtypes(include=[np.number])

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=df.index, columns=X.columns)

    # Kjør full OLS før filtrering av variabler
    full_ols_model_before_vif, train_r2_before_vif, test_r2_before_vif = run_ols(X_scaled, y)
    print("\nFull OLS Results (Before Removing Any Variables):")
    print(full_ols_model_before_vif.summary())

    # Fjern sterkt korrelerte og høye VIF-variabler (men behold Mom12m)
    X_scaled = drop_highly_correlated_features(X_scaled, keep_features=['Mom12m'])
    X_scaled = remove_high_vif_features(X_scaled, keep_features=['Mom12m'])

    # 80-20 tidsbasert splitt (dataene er sortert etter dato)
    split_index = int(len(df) * 0.8)
    X_train, X_test = X_scaled.iloc[:split_index], X_scaled.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    return X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif

# Step 5: Run OLS with Newey-West Standard Errors
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train = sm.add_constant(X_train)
    if X_test is not None:
        X_test = sm.add_constant(X_test)
    model = sm.OLS(y_train, X_train).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_r2 = sm.OLS(y_test, X_test).fit().rsquared

    return model, model.rsquared, test_r2

# Step 6: Feature Selection Using Lasso
def lasso_feature_selection(X_train, y_train):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Selected Features using Lasso: {selected_features.tolist()}")
    return X_train[selected_features], selected_features

# Ny hjelpefunksjon for vektet gjennomsnitt
def weighted_mean(group, col, weight):
    return np.average(group[col], weights=group[weight])

# Beregn porteføljeavkastning med vektede gjennomsnitt
def construct_value_weighted_portfolio(df):
    df['Weight'] = df['MarketCap'] / df.groupby('Date')['MarketCap'].transform('sum')
    df['Weighted_Return'] = df['Weight'] * df['Excess_Return']
    portfolio_return = df.groupby('Date')['Weighted_Return'].sum()
    return portfolio_return

def split_portfolio_data(df):
    y_portfolio = construct_value_weighted_portfolio(df)

    # Bruk vektede gjennomsnitt for prediktorene med MarketCap som vekt
    def weighted_features(group):
        return pd.Series({
            'Size': np.average(group['Size'], weights=group['MarketCap']),
            'BM': np.average(group['BM'], weights=group['MarketCap']),
            'Mom12m': np.average(group['Mom12m'], weights=group['MarketCap'])
        })

    X_weighted = df.groupby('Date').apply(weighted_features)

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_weighted), index=X_weighted.index, columns=X_weighted.columns)

    # 80-20 splitt basert på dato
    split_date = X_scaled.index[int(len(X_scaled) * 0.8)]
    X_train = X_scaled.loc[X_scaled.index <= split_date]
    X_test = X_scaled.loc[X_scaled.index > split_date]
    y_train = y_portfolio.loc[y_portfolio.index <= split_date]
    y_test = y_portfolio.loc[y_portfolio.index > split_date]

    return X_train, X_test, y_train, y_test

# Step 7: Main Execution
def main():
    file_path = "/content/data25.csv"
    df = load_and_clean_data(file_path)

    X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif = split_stock_level_data(df)

    full_ols_model, train_r2_full, test_r2_full = run_ols(X_train, y_train, X_test, y_test)
    print("\nFull OLS Results (After Removing High VIF Features):")
    print(full_ols_model.summary())

    selected_features = ['Size', 'BM', 'Mom12m']
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    selected_ols_model, train_r2_selected, test_r2_selected = run_ols(X_train_selected, y_train, X_test_selected, y_test)
    print("\nOLS Results (BM, Size, Momentum 12M):")
    print(selected_ols_model.summary())

    X_train_lasso, selected_features_lasso = lasso_feature_selection(X_train, y_train)
    X_test_lasso = X_test[selected_features_lasso]
    lasso_ols_model, train_r2_lasso, test_r2_lasso = run_ols(X_train_lasso, y_train, X_test_lasso, y_test)
    print("\nOLS Results (Lasso-Selected Features):")
    print(lasso_ols_model.summary())

    X_train_port, X_test_port, y_train_port, y_test_port = split_portfolio_data(df)
    portfolio_model, train_r2_port, test_r2_port = run_ols(X_train_port, y_train_port, X_test_port, y_test_port)
    print("\nPortfolio-Level OLS Results:")
    print(portfolio_model.summary())

    r2_results = pd.DataFrame({
        "Model": ["Before VIF", "After VIF", "BM, Size, Mom12m", "Lasso-Selected", "Portfolio-Level"],
        "Train R²": [train_r2_before_vif, train_r2_full, train_r2_selected, train_r2_lasso, train_r2_port],
        "Test R²": [test_r2_before_vif, test_r2_full, test_r2_selected, test_r2_lasso, test_r2_port]
    })

    display(r2_results)

if __name__ == "__main__":
    main()



Full OLS Results (Before Removing Any Variables):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 8.910e+30
Date:                Tue, 25 Feb 2025   Prob (F-statistic):               0.00
Time:                        10:47:47   Log-Likelihood:             2.1531e+05
No. Observations:                7024   AIC:                        -4.306e+05
Df Residuals:                    6992   BIC:                        -4.303e+05
Df Model:                          31                                         
Covariance Type:                  HAC                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

  X_weighted = df.groupby('Date').apply(weighted_features)


Unnamed: 0,Model,Train R²,Test R²
0,Before VIF,1.0,
1,After VIF,0.61861,0.232078
2,"BM, Size, Mom12m",0.00684,0.000268
3,Lasso-Selected,0.61432,0.217425
4,Portfolio-Level,0.667193,0.100284



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Ny kode basert på datasett fra 95-25.

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Step 1: Load, rename og clean data – fjerner alle referanser til MarketCap, Size og BM
def load_and_clean_data(file_path, momentum_window=12):
    df = pd.read_csv(file_path)

    # Standardiser kolonnenavn (tilpass etter dine data)
    rename_dict = {
        "Company Market Cap": "MarketCap",  # denne ignoreres nå
        "Book Value Per Share": "BookValuePerShare",  # ignoreres
        "Common Shares - Outstanding - Total": "CommonSharesOutstanding",  # ignoreres
        "1 Month Total Return": "StockReturn",
        "Dividend yield": "DividendYield",
        "Price Close": "ClosePrice",
        "Price Open": "OpenPrice",
        "Bid Price": "BidPrice",
        "Ask Price": "AskPrice",
        "Earnings Per Share - Mean": "EarningsPerShare"
    }
    df.rename(columns=rename_dict, inplace=True)

    # Konverter 'Date' til datetime og sorter etter dato
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Forutsetter at aksjeidentifikatoren er i kolonnen "Instrument"
    if "Instrument" not in df.columns:
        raise KeyError("Kolonnen 'Instrument' må finnes for å identifisere aksjer. Tilgjengelige kolonner: " + str(df.columns))

    # Fjern eventuelle rader med manglende verdier for de essensielle variablene for avkastning og andre signaler
    essential_cols = ['StockReturn']  # vi bruker StockReturn til momentum og Excess_Return
    df.dropna(subset=essential_cols, inplace=True)

    # Beregn 12-måneders momentum for hver aksje.
    # Bruker rullende vindu med min_periods=1 slik at vi får verdi selv for aksjer med kortere historikk.
    df.sort_values(['Instrument', 'Date'], inplace=True)
    df['Momentum_12M'] = df.groupby('Instrument')['StockReturn']\
                             .transform(lambda x: x.rolling(window=momentum_window, min_periods=1)\
                             .apply(lambda r: np.prod(1 + r) - 1, raw=True))
    df.dropna(subset=['Momentum_12M'], inplace=True)

    # Beregn Excess_Return (her antas risikofri rente = 0)
    df['Excess_Return'] = df['StockReturn']

    # Kopier momentum til en enklere kolonnenavn og fjern originalen
    df['Mom12m'] = df['Momentum_12M']
    df.drop(columns=['Momentum_12M'], inplace=True)

    # Fjern rader med manglende verdi i de sentrale variablene
    df.dropna(subset=['Excess_Return', 'Mom12m'], inplace=True)

    # Fjern unødvendige identifikatorer før regresjonsanalyse (for eksempel 'Instrument')
    if 'Instrument' in df.columns:
        df.drop(columns=['Instrument'], inplace=True)

    return df

# Step 2: Drop Highly Correlated Features
def drop_highly_correlated_features(X, threshold=0.99, keep_features=['Mom12m']):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > threshold) and col not in keep_features]
    if to_drop:
        print(f"Dropping highly correlated features: {to_drop}")
        X = X.drop(columns=to_drop)
    return X

# Step 3: Remove High VIF Features
def remove_high_vif_features(X, threshold=10, keep_features=['Mom12m']):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    high_vif_features = vif_data[vif_data["VIF"] > threshold]["Feature"].tolist()
    high_vif_features = [feat for feat in high_vif_features if feat not in keep_features]
    if high_vif_features:
        print(f"Removing high VIF features: {high_vif_features}")
        X = X.drop(columns=high_vif_features)
    return X

# Step 4: Split Data (Stock-Level OLS)
def split_stock_level_data(df):
    # Fjern 'Excess_Return' og 'Date' fra X
    X = df.drop(columns=['Excess_Return', 'Date'])
    y = df['Excess_Return']

    # Bruk kun numeriske kolonner
    X = X.select_dtypes(include=[np.number])

    # Erstatt inf/-inf med NaN og dropp rader med NaN
    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    combined = pd.concat([X, y], axis=1).dropna()
    y = combined['Excess_Return']
    X = combined.drop(columns=['Excess_Return'])

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

    # Skriv ut full OLS-resultat før filtrering
    full_ols_model_before_vif, train_r2_before_vif, test_r2_before_vif = run_ols(X_scaled, y)
    print("\nFull OLS Results (Before Removing Any Variables):")
    print(full_ols_model_before_vif.summary())

    # Fjern høyt korrelerte og høye VIF-variabler (behold Mom12m)
    X_scaled = drop_highly_correlated_features(X_scaled, keep_features=['Mom12m'])
    X_scaled = remove_high_vif_features(X_scaled, keep_features=['Mom12m'])

    # Tidsbasert 80-20 splitting (dataene er sortert etter dato)
    split_index = int(len(df) * 0.8)
    X_train, X_test = X_scaled.iloc[:split_index], X_scaled.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    return X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif

# Step 5: Run OLS med Newey-West standardfeil
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train = sm.add_constant(X_train)
    if X_test is not None:
        X_test = sm.add_constant(X_test)
    model = sm.OLS(y_train, X_train).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_r2 = sm.OLS(y_test, X_test).fit().rsquared

    return model, model.rsquared, test_r2

# Step 6: Feature Selection med Lasso
def lasso_feature_selection(X_train, y_train):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Selected Features using Lasso: {selected_features.tolist()}")
    return X_train[selected_features], selected_features

# Step 7: Portfolio-Level Analysis (likevektet)
# Her bruker vi en likevektet portefølje (gjennomsnitt av avkastning per dato)
def construct_equal_weighted_portfolio(df):
    portfolio_return = df.groupby('Date')['Excess_Return'].mean()
    return portfolio_return

def split_portfolio_data(df):
    y_portfolio = construct_equal_weighted_portfolio(df)

    # For porteføljenivå bruker vi nå kun de variable vi ønsker – her beholder vi kun 'Mom12m'
    X_equal = df.groupby('Date').agg({'Mom12m': 'mean'})

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_equal), index=X_equal.index, columns=X_equal.columns)

    # Tidsbasert 80-20 splitting
    split_date = X_scaled.index[int(len(X_scaled) * 0.8)]
    X_train = X_scaled.loc[X_scaled.index <= split_date]
    X_test = X_scaled.loc[X_scaled.index > split_date]
    y_train = y_portfolio.loc[y_portfolio.index <= split_date]
    y_test = y_portfolio.loc[y_portfolio.index > split_date]

    return X_train, X_test, y_train, y_test

# Step 8: Main Execution
def main():
    file_path = "/content/OSEFX_Monthly_Data_1995_2024.csv"
    df = load_and_clean_data(file_path)

    X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif = split_stock_level_data(df)

    full_ols_model, train_r2_full, test_r2_full = run_ols(X_train, y_train, X_test, y_test)
    print("\nFull OLS Results (After Removing High VIF Features):")
    print(full_ols_model.summary())

    # Her bruker vi kun momentum som utvalgt variabel
    selected_features = ['Mom12m']
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    selected_ols_model, train_r2_selected, test_r2_selected = run_ols(X_train_selected, y_train, X_test_selected, y_test)
    print("\nOLS Results (Momentum 12M):")
    print(selected_ols_model.summary())

    X_train_lasso, selected_features_lasso = lasso_feature_selection(X_train, y_train)
    X_test_lasso = X_test[selected_features_lasso]
    lasso_ols_model, train_r2_lasso, test_r2_lasso = run_ols(X_train_lasso, y_train, X_test_lasso, y_test)
    print("\nOLS Results (Lasso-Selected Features):")
    print(lasso_ols_model.summary())

    X_train_port, X_test_port, y_train_port, y_test_port = split_portfolio_data(df)
    portfolio_model, train_r2_port, test_r2_port = run_ols(X_train_port, y_train_port, X_test_port, y_test_port)
    print("\nPortfolio-Level OLS Results (Equal-Weighted):")
    print(portfolio_model.summary())

    r2_results = pd.DataFrame({
        "Model": ["Before VIF", "After VIF", "Momentum 12M", "Lasso-Selected", "Portfolio-Level"],
        "Train R²": [train_r2_before_vif, train_r2_full, train_r2_selected, train_r2_lasso, train_r2_port],
        "Test R²": [test_r2_before_vif, test_r2_full, test_r2_selected, test_r2_lasso, test_r2_port]
    })

    display(r2_results)

if __name__ == "__main__":
    main()



Full OLS Results (Before Removing Any Variables):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.711e+31
Date:                Tue, 25 Feb 2025   Prob (F-statistic):               0.00
Time:                        12:59:10   Log-Likelihood:                 97653.
No. Observations:                3215   AIC:                        -1.953e+05
Df Residuals:                    3201   BIC:                        -1.952e+05
Df Model:                          13                                         
Covariance Type:                  HAC                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

ValueError: zero-size array to reduction operation maximum which has no identity

Revidert kode etter tilbakemelding:

In [7]:
df = pd.read_csv("/content/data25.csv")
print(df.head())


         Date Instrument First Trade Date  ClosePrice   OpenPrice   Volume  \
0  2015-01-31    AFGA.OL       1997-09-08   79.420230   79.890172      0.0   
1  2015-02-28    AFGA.OL       1997-09-08   85.059536   85.529479      0.0   
2  2015-03-31    AFGA.OL       1997-09-08   93.048554   93.988438  29730.0   
3  2015-04-30    AFGA.OL       1997-09-08   97.747976   93.988438  31574.0   
4  2015-05-31    AFGA.OL       1997-09-08  105.267051  100.567629      0.0   

     BidPrice    AskPrice  DividendYield  BookValuePerShare  ...  \
0   79.420230   80.595086            NaN          15.058302  ...   
1   84.354623   85.059536       5.524862          15.723256  ...   
2   92.578611   93.518496       5.050505          15.723256  ...   
3   96.808091   97.747976       4.807692          15.723256  ...   
4  104.327166  105.267051       4.464286          15.723256  ...   

   TurnoverRatio   BrentOil    USDNOK    EURNOK US10Y    USCPI USGDPGrowth  \
0       0.000000  52.990002  7.725400  8.725

In [9]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import r2_score

# Hjelpefunksjon for winsorizing
def winsorize_series(s, lower_quantile=0.01, upper_quantile=0.99):
    lower = s.quantile(lower_quantile)
    upper = s.quantile(upper_quantile)
    return s.clip(lower, upper)

# Step 1: Last inn, winsorize og rens data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Fjern kolonner som ikke er nødvendige (f.eks. Ticker)
    if 'Ticker' in df.columns:
        df.drop(columns=['Ticker'], inplace=True)

    # Beregn nye variabler
    df['Excess_Return'] = df['OSEBXReturns'] - df['NorgesBank10Y']
    df['Size'] = np.log(df['MarketCap'])
    df['BM'] = (df['BookValuePerShare'] * df['CommonSharesOutstanding']) / df['MarketCap']
    df = df[df['BM'] > 0]  # beholder kun rader med BM > 0
    df['BM'] = np.log(df['BM'])
    df['Mom12m'] = df['Momentum_12M']
    df.drop(columns=['Momentum_12M'], inplace=True)

    # Winsorize alle numeriske prediktorer (unntatt målvariabelen)
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Excess_Return' in num_cols:
        num_cols.remove('Excess_Return')
    for col in num_cols:
        df[col] = winsorize_series(df[col])

    # Fjern rader med manglende verdier
    df.dropna(inplace=True)

    print("Etter rensing, dataframe-shape:", df.shape)
    if df.shape[0] == 0:
        raise ValueError("Dataframe er tom etter rensing. Sjekk dine filterbetingelser og inputdata!")

    return df

# Step 2: Del data på aksjenivå uten å fjerne variabler med VIF
def split_stock_level_data(df):
    X = df.drop(columns=['Excess_Return', 'Date'])
    y = df['Excess_Return']

    # Bruk kun numeriske kolonner
    X = X.select_dtypes(include=[np.number])

    print("Størrelse på X før skalering:", X.shape)
    if X.shape[0] == 0:
        raise ValueError("Ingen data igjen i X for skalering!")

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
    print("Størrelse på X_scaled:", X_scaled.shape)

    # Tidsbasert 80/20-splitt
    split_index = int(len(df) * 0.8)
    X_train = X_scaled.iloc[:split_index]
    X_test = X_scaled.iloc[split_index:]
    y_train = y.iloc[:split_index]
    y_test = y.iloc[split_index:]

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    return X_train, X_test, y_train, y_test, scaler

# Step 3: Kjør OLS med Newey-West standardfeil
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train_const = sm.add_constant(X_train)
    if X_test is not None:
        X_test_const = sm.add_constant(X_test)
    model = sm.OLS(y_train, X_train_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_model = sm.OLS(y_test, X_test_const).fit()
        test_r2 = test_model.rsquared
    return model, model.rsquared, test_r2

# Step 4: Penaliserte regresjonsmodeller
def lasso_regression(X_train, y_train, X_test, y_test):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    y_pred_train = lasso.predict(X_train)
    y_pred_test = lasso.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Lasso - Best lambda: {lasso.alpha_}")
    print(f"Valgte funksjoner med Lasso: {selected_features.tolist()}")
    return lasso, r2_train, r2_test, selected_features

def ridge_regression(X_train, y_train, X_test, y_test):
    alphas = np.logspace(-4, 4, 50)
    ridge = RidgeCV(alphas=alphas, scoring='r2', cv=5)
    ridge.fit(X_train, y_train)
    y_pred_train = ridge.predict(X_train)
    y_pred_test = ridge.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"Ridge - Best alpha: {ridge.alpha_}")
    return ridge, r2_train, r2_test

def elasticnet_regression(X_train, y_train, X_test, y_test):
    enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1], alphas=np.logspace(-4, 4, 50),
                        cv=5, random_state=42, max_iter=10000)
    enet.fit(X_train, y_train)
    y_pred_train = enet.predict(X_train)
    y_pred_test = enet.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"ElasticNet - Best alpha: {enet.alpha_}, Best l1_ratio: {enet.l1_ratio_}")
    return enet, r2_train, r2_test

# Step 5: Porteføljeprediksjon med bottom-up tilnærming
def bottom_up_portfolio_prediction(df, model, scaler, features):
    split_index = int(len(df) * 0.8)
    df_train = df.iloc[:split_index]
    df_test = df.iloc[split_index:]

    # Bruk de tre utvalgte funksjonene og den nye scaler
    X_test = df_test[features]
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    df_test = df_test.copy()
    df_test['Predicted_Return'] = model.predict(sm.add_constant(X_test_scaled))

    # Beregn vekt basert på MarketCap for hver dato
    df_test['Weight'] = df_test['MarketCap'] / df_test.groupby('Date')['MarketCap'].transform('sum')
    portfolio_pred = df_test.groupby('Date').apply(lambda g: np.sum(g['Predicted_Return'] * g['Weight']))
    portfolio_actual = df_test.groupby('Date').apply(lambda g: np.sum(g['Excess_Return'] * g['Weight']))

    sharpe_ratio = portfolio_pred.mean() / portfolio_pred.std() if portfolio_pred.std() != 0 else np.nan
    return portfolio_pred, portfolio_actual, sharpe_ratio

# Main Execution
def main():
    file_path = "/content/data25.csv"  # Juster filstien etter behov
    df = load_and_clean_data(file_path)

    # Splitt data på aksjenivå
    X_train, X_test, y_train, y_test, scaler = split_stock_level_data(df)

    # Bruk tre utvalgte funksjoner: Size, BM, Mom12m
    selected_features = ['Size', 'BM', 'Mom12m']
    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]

    # Kjør OLS med tre faktorer
    ols_3f_model, train_r2_3f, test_r2_3f = run_ols(X_train_sel, y_train, X_test_sel, y_test)
    print("\nOLS med tre faktorer (Size, BM, Mom12m):")
    print(ols_3f_model.summary())

    # Opprett en separat scaler for de tre funksjonene
    scaler_sel = StandardScaler()
    scaler_sel.fit(X_train_sel)

    # Penaliserte modeller (bruker hele settet med funksjoner her)
    lasso_model, lasso_r2_train, lasso_r2_test, lasso_selected_features = lasso_regression(X_train, y_train, X_test, y_test)
    ridge_model, ridge_r2_train, ridge_r2_test = ridge_regression(X_train, y_train, X_test, y_test)
    enet_model, enet_r2_train, enet_r2_test = elasticnet_regression(X_train, y_train, X_test, y_test)

    r2_results = pd.DataFrame({
        "Model": ["OLS 3 faktorer", "Lasso", "Ridge", "Elastic Net"],
        "Train R²": [train_r2_3f, lasso_r2_train, ridge_r2_train, enet_r2_train],
        "Test R²": [test_r2_3f, lasso_r2_test, ridge_r2_test, enet_r2_test]
    })
    print("\nSammenligning av R²:")
    print(r2_results)

    # Porteføljeprediksjon med bottom-up tilnærming (bruker OLS-modellen med tre faktorer)
    portfolio_pred, portfolio_actual, sharpe_ratio = bottom_up_portfolio_prediction(df, ols_3f_model, scaler_sel, selected_features)
    print("\nPorteføljeprediksjon (bottom-up tilnærming):")
    print("Sharpe Ratio for predikert portefølje:", sharpe_ratio)

if __name__ == "__main__":
    main()


Etter rensing, dataframe-shape: (7024, 37)
Størrelse på X før skalering: (7024, 31)
Størrelse på X_scaled: (7024, 31)
X_train shape: (5619, 31)
X_test shape: (1405, 31)

OLS med tre faktorer (Size, BM, Mom12m):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     12.25
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           5.50e-08
Time:                        17:15:56   Log-Likelihood:                -4943.8
No. Observations:                5619   AIC:                             9896.
Df Residuals:                    5615   BIC:                             9922.
Df Model:                           3                                         
Covariance Type:                  HAC                                         

  portfolio_pred = df_test.groupby('Date').apply(lambda g: np.sum(g['Predicted_Return'] * g['Weight']))
  portfolio_actual = df_test.groupby('Date').apply(lambda g: np.sum(g['Excess_Return'] * g['Weight']))


Revidert versjon til, endringer:

In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import r2_score

# Hjelpefunksjon for winsorizing
def winsorize_series(s, lower_quantile=0.01, upper_quantile=0.99):
    lower = s.quantile(lower_quantile)
    upper = s.quantile(upper_quantile)
    return s.clip(lower, upper)

# Step 1: Last inn, winsorize og rens data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Fjern kolonner som ikke er nødvendige (f.eks. Ticker)
    if 'Ticker' in df.columns:
        df.drop(columns=['Ticker'], inplace=True)

    # Beregn nye variabler
    # Merk: Excess_Return beregnes som OSEBXReturns minus NorgesBank10Y
    df['Excess_Return'] = df['OSEBXReturns'] - df['NorgesBank10Y']
    df['Size'] = np.log(df['MarketCap'])
    df['BM'] = (df['BookValuePerShare'] * df['CommonSharesOutstanding']) / df['MarketCap']
    # Filtrer ut rader med BM <= 0 før log-transformasjon
    df = df[df['BM'] > 0]
    df['BM'] = np.log(df['BM'])
    df['Mom12m'] = df['Momentum_12M']
    df.drop(columns=['Momentum_12M'], inplace=True)

    # Winsorize alle numeriske prediktorer (unntatt målvariabelen)
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Excess_Return' in num_cols:
        num_cols.remove('Excess_Return')
    for col in num_cols:
        df[col] = winsorize_series(df[col])

    # Fjern rader med manglende verdier
    df.dropna(inplace=True)

    print("Etter rensing, dataframe-shape:", df.shape)
    if df.shape[0] == 0:
        raise ValueError("Dataframe er tom etter rensing. Sjekk dine filterbetingelser og inputdata!")

    return df

# Step 2: Del data på aksjenivå
def split_stock_level_data(df):
    # Fjern kolonner som direkte lekker målvariabelinformasjon:
    # Vi fjerner 'OSEBXReturns' og 'NorgesBank10Y'
    X = df.drop(columns=['Excess_Return', 'Date', 'OSEBXReturns', 'NorgesBank10Y'])
    y = df['Excess_Return']

    # Bruk kun numeriske kolonner
    X = X.select_dtypes(include=[np.number])

    print("Størrelse på X før skalering:", X.shape)
    if X.shape[0] == 0:
        raise ValueError("Ingen data igjen i X for skalering!")

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
    print("Størrelse på X_scaled:", X_scaled.shape)

    # Tidsbasert 80/20-splitt
    split_index = int(len(df) * 0.8)
    X_train = X_scaled.iloc[:split_index]
    X_test = X_scaled.iloc[split_index:]
    y_train = y.iloc[:split_index]
    y_test = y.iloc[split_index:]

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    return X_train, X_test, y_train, y_test, scaler

# Step 3: Kjør OLS med Newey-West standardfeil
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train_const = sm.add_constant(X_train)
    if X_test is not None:
        X_test_const = sm.add_constant(X_test)
    model = sm.OLS(y_train, X_train_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_model = sm.OLS(y_test, X_test_const).fit()
        test_r2 = test_model.rsquared
    return model, model.rsquared, test_r2

# Step 4: Penaliserte regresjonsmodeller
def lasso_regression(X_train, y_train, X_test, y_test):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    y_pred_train = lasso.predict(X_train)
    y_pred_test = lasso.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Lasso - Best lambda: {lasso.alpha_}")
    print(f"Valgte funksjoner med Lasso: {selected_features.tolist()}")
    return lasso, r2_train, r2_test, selected_features

def ridge_regression(X_train, y_train, X_test, y_test):
    alphas = np.logspace(-4, 4, 50)
    ridge = RidgeCV(alphas=alphas, scoring='r2', cv=5)
    ridge.fit(X_train, y_train)
    y_pred_train = ridge.predict(X_train)
    y_pred_test = ridge.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"Ridge - Best alpha: {ridge.alpha_}")
    return ridge, r2_train, r2_test

def elasticnet_regression(X_train, y_train, X_test, y_test):
    enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1], alphas=np.logspace(-4, 4, 50),
                        cv=5, random_state=42, max_iter=10000)
    enet.fit(X_train, y_train)
    y_pred_train = enet.predict(X_train)
    y_pred_test = enet.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"ElasticNet - Best alpha: {enet.alpha_}, Best l1_ratio: {enet.l1_ratio_}")
    return enet, r2_train, r2_test

# Step 5: Porteføljeprediksjon med bottom-up tilnærming
def bottom_up_portfolio_prediction(df, model, scaler, features):
    split_index = int(len(df) * 0.8)
    df_train = df.iloc[:split_index]
    df_test = df.iloc[split_index:]

    # Bruk de tre utvalgte funksjonene med en egen scaler (skaleren er trent på X_train for disse funksjonene)
    X_test = df_test[features]
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    df_test = df_test.copy()
    df_test['Predicted_Return'] = model.predict(sm.add_constant(X_test_scaled))

    # Beregn vekt basert på MarketCap for hver dato.
    # Bruk groupby med group_keys=False for å unngå DeprecationWarning.
    df_test['Weight'] = df_test['MarketCap'] / df_test.groupby('Date', group_keys=False)['MarketCap'].transform('sum')
    portfolio_pred = df_test.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Predicted_Return'] * g['Weight']))
    portfolio_actual = df_test.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Excess_Return'] * g['Weight']))

    sharpe_ratio = portfolio_pred.mean() / portfolio_pred.std() if portfolio_pred.std() != 0 else np.nan
    return portfolio_pred, portfolio_actual, sharpe_ratio

# Main Execution
def main():
    file_path = "/content/data25.csv"  # Juster filstien etter behov
    df = load_and_clean_data(file_path)

    # Splitt data på aksjenivå
    X_train, X_test, y_train, y_test, scaler_full = split_stock_level_data(df)

    # Bruk tre utvalgte funksjoner for OLS med tre faktorer: Size, BM, Mom12m
    selected_features = ['Size', 'BM', 'Mom12m']
    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]

    # Kjør OLS med tre faktorer
    ols_3f_model, train_r2_3f, test_r2_3f = run_ols(X_train_sel, y_train, X_test_sel, y_test)
    print("\nOLS med tre faktorer (Size, BM, Mom12m):")
    print(ols_3f_model.summary())

    # Opprett en separat scaler for de tre funksjonene
    scaler_sel = StandardScaler()
    scaler_sel.fit(X_train_sel)

    # Penaliserte modeller – bruker hele settet (uten OSEBXReturns og NorgesBank10Y)
    lasso_model, lasso_r2_train, lasso_r2_test, lasso_selected_features = lasso_regression(X_train, y_train, X_test, y_test)
    ridge_model, ridge_r2_train, ridge_r2_test = ridge_regression(X_train, y_train, X_test, y_test)
    enet_model, enet_r2_train, enet_r2_test = elasticnet_regression(X_train, y_train, X_test, y_test)

    r2_results = pd.DataFrame({
        "Model": ["OLS 3 faktorer", "Lasso", "Ridge", "Elastic Net"],
        "Train R²": [train_r2_3f, lasso_r2_train, ridge_r2_train, enet_r2_train],
        "Test R²": [test_r2_3f, lasso_r2_test, ridge_r2_test, enet_r2_test]
    })
    print("\nSammenligning av R²:")
    print(r2_results)

    # Porteføljeprediksjon med bottom-up tilnærming (bruker OLS-modellen med tre faktorer)
    portfolio_pred, portfolio_actual, sharpe_ratio = bottom_up_portfolio_prediction(df, ols_3f_model, scaler_sel, selected_features)
    print("\nPorteføljeprediksjon (bottom-up tilnærming):")
    print("Sharpe Ratio for predikert portefølje:", sharpe_ratio)

if __name__ == "__main__":
    main()


Etter rensing, dataframe-shape: (7024, 37)
Størrelse på X før skalering: (7024, 29)
Størrelse på X_scaled: (7024, 29)
X_train shape: (5619, 29)
X_test shape: (1405, 29)

OLS med tre faktorer (Size, BM, Mom12m):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     12.25
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           5.50e-08
Time:                        17:17:57   Log-Likelihood:                -4943.8
No. Observations:                5619   AIC:                             9896.
Df Residuals:                    5615   BIC:                             9922.
Df Model:                           3                                         
Covariance Type:                  HAC                                         

  portfolio_pred = df_test.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Predicted_Return'] * g['Weight']))
  portfolio_actual = df_test.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Excess_Return'] * g['Weight']))


In [12]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import r2_score

# Undertrykk DeprecationWarnings for groupby.apply (kan fjernes når warningen er løst)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Hjelpefunksjon for winsorizing
def winsorize_series(s, lower_quantile=0.01, upper_quantile=0.99):
    lower = s.quantile(lower_quantile)
    upper = s.quantile(upper_quantile)
    return s.clip(lower, upper)

# Step 1: Last inn, winsorize og rens data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Fjern kolonner som ikke er nødvendige (f.eks. Ticker)
    if 'Ticker' in df.columns:
        df.drop(columns=['Ticker'], inplace=True)

    # Beregn nye variabler
    df['Excess_Return'] = df['OSEBXReturns'] - df['NorgesBank10Y']
    df['Size'] = np.log(df['MarketCap'])
    df['BM'] = (df['BookValuePerShare'] * df['CommonSharesOutstanding']) / df['MarketCap']
    df = df[df['BM'] > 0]  # Behold kun rader med BM > 0
    df['BM'] = np.log(df['BM'])
    df['Mom12m'] = df['Momentum_12M']
    df.drop(columns=['Momentum_12M'], inplace=True)

    # Winsorize alle numeriske prediktorer (unntatt målvariabelen)
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Excess_Return' in num_cols:
        num_cols.remove('Excess_Return')
    for col in num_cols:
        df[col] = winsorize_series(df[col])

    # Fjern rader med manglende verdier
    df.dropna(inplace=True)

    print("Etter rensing, dataframe-shape:", df.shape)
    if df.shape[0] == 0:
        raise ValueError("Dataframe er tom etter rensing. Sjekk dine filterbetingelser og inputdata!")
    return df

# Step 2: Del data på aksjenivå (fjern kolonner som lekker målvariabelinformasjon)
def split_stock_level_data(df):
    # Fjern kolonner som direkte lekker (OSEBXReturns og NorgesBank10Y)
    X = df.drop(columns=['Excess_Return', 'Date', 'OSEBXReturns', 'NorgesBank10Y'])
    y = df['Excess_Return']

    # Bruk kun numeriske kolonner
    X = X.select_dtypes(include=[np.number])

    print("Størrelse på X før skalering:", X.shape)
    if X.shape[0] == 0:
        raise ValueError("Ingen data igjen i X for skalering!")

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)
    print("Størrelse på X_scaled:", X_scaled.shape)

    # Tidsbasert 80/20-splitt
    split_index = int(len(df) * 0.8)
    X_train = X_scaled.iloc[:split_index]
    X_test = X_scaled.iloc[split_index:]
    y_train = y.iloc[:split_index]
    y_test = y.iloc[split_index:]

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    return X_train, X_test, y_train, y_test, scaler

# Step 3: Kjør OLS med Newey-West standardfeil
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train_const = sm.add_constant(X_train)
    if X_test is not None:
        X_test_const = sm.add_constant(X_test)
    model = sm.OLS(y_train, X_train_const).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_model = sm.OLS(y_test, X_test_const).fit()
        test_r2 = test_model.rsquared
    return model, model.rsquared, test_r2

# Step 4: Penaliserte regresjonsmodeller
def lasso_regression(X_train, y_train, X_test, y_test):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    y_pred_train = lasso.predict(X_train)
    y_pred_test = lasso.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Lasso - Best lambda: {lasso.alpha_}")
    print(f"Valgte funksjoner med Lasso: {selected_features.tolist()}")
    return lasso, r2_train, r2_test, selected_features

def ridge_regression(X_train, y_train, X_test, y_test):
    alphas = np.logspace(-4, 4, 50)
    ridge = RidgeCV(alphas=alphas, scoring='r2', cv=5)
    ridge.fit(X_train, y_train)
    y_pred_train = ridge.predict(X_train)
    y_pred_test = ridge.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"Ridge - Best alpha: {ridge.alpha_}")
    return ridge, r2_train, r2_test

def elasticnet_regression(X_train, y_train, X_test, y_test):
    enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1], alphas=np.logspace(-4, 4, 50),
                        cv=5, random_state=42, max_iter=10000)
    enet.fit(X_train, y_train)
    y_pred_train = enet.predict(X_train)
    y_pred_test = enet.predict(X_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"ElasticNet - Best alpha: {enet.alpha_}, Best l1_ratio: {enet.l1_ratio_}")
    return enet, r2_train, r2_test

# Step 5: Porteføljeprediksjon med bottom-up tilnærming
def bottom_up_portfolio_prediction(df, model, scaler, features):
    split_index = int(len(df) * 0.8)
    df_test = df.iloc[split_index:].copy()

    # Bruk kun de tre utvalgte funksjonene med egen scaler (skaleren er trent på X_train for disse funksjonene)
    X_test = df_test[features]
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    df_test['Predicted_Return'] = model.predict(sm.add_constant(X_test_scaled))

    # Beregn vekt basert på MarketCap. Her velger vi kun de nødvendige kolonnene for grouping
    df_group = df_test[['Date', 'Predicted_Return', 'Excess_Return', 'MarketCap']].copy()
    df_group['Weight'] = df_group['MarketCap'] / df_group.groupby('Date')['MarketCap'].transform('sum')

    portfolio_pred = df_group.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Predicted_Return'] * g['Weight']))
    portfolio_actual = df_group.groupby('Date', group_keys=False).apply(lambda g: np.sum(g['Excess_Return'] * g['Weight']))

    sharpe_ratio = portfolio_pred.mean() / portfolio_pred.std() if portfolio_pred.std() != 0 else np.nan
    return portfolio_pred, portfolio_actual, sharpe_ratio

# Main Execution
def main():
    file_path = "/content/data25.csv"  # Juster filstien etter behov
    df = load_and_clean_data(file_path)

    # Splitt data på aksjenivå
    X_train, X_test, y_train, y_test, scaler_full = split_stock_level_data(df)

    # Bruk tre utvalgte funksjoner for OLS med tre faktorer: Size, BM, Mom12m
    selected_features = ['Size', 'BM', 'Mom12m']
    X_train_sel = X_train[selected_features]
    X_test_sel = X_test[selected_features]

    # Kjør OLS med tre faktorer
    ols_3f_model, train_r2_3f, test_r2_3f = run_ols(X_train_sel, y_train, X_test_sel, y_test)
    print("\nOLS med tre faktorer (Size, BM, Mom12m):")
    print(ols_3f_model.summary())

    # Opprett en separat scaler for de tre funksjonene
    scaler_sel = StandardScaler()
    scaler_sel.fit(X_train_sel)

    # Penaliserte modeller – bruker hele settet (uten OSEBXReturns og NorgesBank10Y)
    lasso_model, lasso_r2_train, lasso_r2_test, lasso_selected_features = lasso_regression(X_train, y_train, X_test, y_test)
    ridge_model, ridge_r2_train, ridge_r2_test = ridge_regression(X_train, y_train, X_test, y_test)
    enet_model, enet_r2_train, enet_r2_test = elasticnet_regression(X_train, y_train, X_test, y_test)

    r2_results = pd.DataFrame({
        "Model": ["OLS 3 faktorer", "Lasso", "Ridge", "Elastic Net"],
        "Train R²": [train_r2_3f, lasso_r2_train, ridge_r2_train, enet_r2_train],
        "Test R²": [test_r2_3f, lasso_r2_test, ridge_r2_test, enet_r2_test]
    })
    print("\nSammenligning av R²:")
    print(r2_results)

    # Porteføljeprediksjon med bottom-up tilnærming (bruker OLS-modellen med tre faktorer)
    portfolio_pred, portfolio_actual, sharpe_ratio = bottom_up_portfolio_prediction(df, ols_3f_model, scaler_sel, selected_features)
    print("\nPorteføljeprediksjon (bottom-up tilnærming):")
    print("Sharpe Ratio for predikert portefølje:", sharpe_ratio)

if __name__ == "__main__":
    main()


Etter rensing, dataframe-shape: (7024, 37)
Størrelse på X før skalering: (7024, 29)
Størrelse på X_scaled: (7024, 29)
X_train shape: (5619, 29)
X_test shape: (1405, 29)

OLS med tre faktorer (Size, BM, Mom12m):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     12.25
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           5.50e-08
Time:                        17:21:55   Log-Likelihood:                -4943.8
No. Observations:                5619   AIC:                             9896.
Df Residuals:                    5615   BIC:                             9922.
Df Model:                           3                                         
Covariance Type:                  HAC                                         