In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV


# Step 1: Load and Clean Data
def load_and_clean_data(file_path):
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])

    # Remove non-numeric columns like Ticker before calculations
    if 'Ticker' in df.columns:
        df.drop(columns=['Ticker'], inplace=True)

    #df = df[df['MarketCap'] >= 2e9]  # Remove microcaps (NOK 2 billion threshold)
    df['Excess_Return'] = df['OSEBXReturns'] - df['NorgesBank10Y']
    df['Size'] = np.log(df['MarketCap'])
    df['BM'] = (df['BookValuePerShare'] * df['CommonSharesOutstanding']) / df['MarketCap']
    df = df[df['BM'] > 0]
    df['BM'] = np.log(df['BM'])
    
    # Ensure Momentum_12M is included and never removed
    df['Mom12m'] = df['Momentum_12M']
    
    df.dropna(inplace=True)
    return df

# Step 2: Drop Highly Correlated Features
def drop_highly_correlated_features(X, threshold=0.99, keep_features=['Mom12m']):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > threshold) and column not in keep_features]
    if to_drop:
        print(f"Dropping highly correlated features: {to_drop}")
        X = X.drop(columns=to_drop)

    return X

# Step 3: Remove High VIF Features
def remove_high_vif_features(X, threshold=10, keep_features=['Mom12m']):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    high_vif_features = vif_data[vif_data["VIF"] > threshold]["Feature"].tolist()
    high_vif_features = [feature for feature in high_vif_features if feature not in keep_features]  # Always keep Mom12m

    if high_vif_features:
        print(f"Removing high VIF features: {high_vif_features}")
        X = X.drop(columns=high_vif_features)

    return X

# Step 4: Split Data (Stock-Level OLS)
def split_stock_level_data(df):
    X = df.drop(columns=['Excess_Return', 'Date'])  
    y = df['Excess_Return']

    X = X.select_dtypes(include=[np.number])

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=df.index, columns=X.columns)

    # Run Full OLS Before Removing Any Variables
    full_ols_model_before_vif, train_r2_before_vif, test_r2_before_vif = run_ols(X_scaled, y)
    print("\n Full OLS Results (Before Removing Any Variables):")
    print(full_ols_model_before_vif.summary())

    # Drop highly correlated and high VIF features but KEEP Mom12m
    X_scaled = drop_highly_correlated_features(X_scaled, keep_features=['Mom12m'])
    X_scaled = remove_high_vif_features(X_scaled, keep_features=['Mom12m'])

    # 80-20 Train-Test Split
    split_index = int(len(df) * 0.8)
    X_train, X_test = X_scaled.iloc[:split_index], X_scaled.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    return X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif

# Step 5: Run OLS with Newey-West Standard Errors
def run_ols(X_train, y_train, X_test=None, y_test=None):
    X_train = sm.add_constant(X_train)
    if X_test is not None:
        X_test = sm.add_constant(X_test)

    model = sm.OLS(y_train, X_train).fit(cov_type='HAC', cov_kwds={'maxlags': 1})

    test_r2 = None
    if X_test is not None and y_test is not None:
        test_r2 = sm.OLS(y_test, X_test).fit().rsquared  

    return model, model.rsquared, test_r2

# Step 6: Feature Selection Using Lasso
def lasso_feature_selection(X_train, y_train):
    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_train, y_train)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Selected Features using Lasso: {selected_features.tolist()}")
    return X_train[selected_features], selected_features

# Step 7: Portfolio-Level OLS (Keeping Only BM, Size, and Momentum 12M)
def construct_value_weighted_portfolio(df):
    df['Weight'] = df['MarketCap'] / df.groupby('Date')['MarketCap'].transform('sum')
    df['Weighted_Return'] = df['Weight'] * df['Excess_Return']
    return df.groupby('Date')['Weighted_Return'].sum()

def split_portfolio_data(df):
    y_portfolio = construct_value_weighted_portfolio(df)

    # Select only BM, Size, and Mom12m for portfolio-level regression
    X = df[['Size', 'BM', 'Mom12m', 'Date']].copy()
    X = X.groupby('Date').mean()

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

    split_date = X.index[int(len(X) * 0.8)]
    X_train, X_test = X_scaled.loc[X_scaled.index <= split_date], X_scaled.loc[X_scaled.index > split_date]
    y_train, y_test = y_portfolio.loc[y_portfolio.index <= split_date], y_portfolio.loc[y_portfolio.index > split_date]

    return X_train, X_test, y_train, y_test


# Step 8: Main Execution
def main():
    file_path = "OSEBX_Market_Macro_Data_2015_2024.csv"
    df = load_and_clean_data(file_path)

    X_train, X_test, y_train, y_test, train_r2_before_vif, test_r2_before_vif = split_stock_level_data(df)

    full_ols_model, train_r2_full, test_r2_full = run_ols(X_train, y_train, X_test, y_test)
    print("\n Full OLS Results (After Removing High VIF Features):")
    print(full_ols_model.summary())

    selected_features = ['Size', 'BM', 'Mom12m']
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    selected_ols_model, train_r2_selected, test_r2_selected = run_ols(X_train_selected, y_train, X_test_selected, y_test)
    print("\n OLS Results (BM, Size, Momentum_12M):")
    print(selected_ols_model.summary())

    X_train_lasso, selected_features_lasso = lasso_feature_selection(X_train, y_train)
    X_test_lasso = X_test[selected_features_lasso]
    lasso_ols_model, train_r2_lasso, test_r2_lasso = run_ols(X_train_lasso, y_train, X_test_lasso, y_test)
    print("\n OLS Results (Lasso-Selected Features):")
    print(lasso_ols_model.summary())

    X_train_port, X_test_port, y_train_port, y_test_port = split_portfolio_data(df)
    portfolio_model, train_r2_port, test_r2_port = run_ols(X_train_port, y_train_port, X_test_port, y_test_port)
    print("\n Portfolio-Level OLS Results:")
    print(portfolio_model.summary())

    r2_results = pd.DataFrame({
        "Model": ["Before VIF", "After VIF", "BM, Size, Mom12m", "Lasso-Selected", "Portfolio-Level"],
        "Train R²": [train_r2_before_vif, train_r2_full, train_r2_selected, train_r2_lasso, train_r2_port],
        "Test R²": [test_r2_before_vif, test_r2_full, test_r2_selected, test_r2_lasso, test_r2_port]
    })

    display(r2_results)

if __name__ == "__main__":
    main()



 Full OLS Results (Before Removing Any Variables):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.699e+30
Date:                Fri, 21 Feb 2025   Prob (F-statistic):               0.00
Time:                        10:27:39   Log-Likelihood:             2.0930e+05
No. Observations:                7024   AIC:                        -4.185e+05
Df Residuals:                    6992   BIC:                        -4.183e+05
Df Model:                          31                                         
Covariance Type:                  HAC                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------

  vif = 1. / (1. - r_squared_i)


Removing high VIF features: ['Momentum_12M', 'USDNOK', 'EURNOK', 'US10Y', 'USCPI', 'NorgesBank10Y']

 Full OLS Results (After Removing High VIF Features):
                            OLS Regression Results                            
Dep. Variable:          Excess_Return   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.557
Method:                 Least Squares   F-statistic:                     173.3
Date:                Fri, 21 Feb 2025   Prob (F-statistic):               0.00
Time:                        10:27:39   Log-Likelihood:                -5162.9
No. Observations:                5619   AIC:                         1.037e+04
Df Residuals:                    5596   BIC:                         1.052e+04
Df Model:                          22                                         
Covariance Type:                  HAC                                         
                              coef    std err          

Unnamed: 0,Model,Train R²,Test R²
0,Before VIF,1.0,
1,After VIF,0.559088,0.630404
2,"BM, Size, Mom12m",0.028216,0.041482
3,Lasso-Selected,0.557324,0.623073
4,Portfolio-Level,0.428968,0.235423
