In [131]:
import pandas as pd
import numpy as np
import sklearn.linear_model as sklm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt


In [118]:
data_ml = pd.read_csv("CountryFactorData (Good Data).csv", sep=",")

In [119]:

# Sort the DataFrame by the 'Date' column
data_ml = data_ml.sort_values(by='Date')
factors = ["Size", "Value", "Momentum", "Market Risk Premium"]

# Finding rows that have no returns recorded and factor data, we can predict these returns
data_to_predict = data_ml[['1M_Log_Ret', '1M_Simple_Ret']].isna().all(axis=1)
data_to_predict = data_ml[data_to_predict]
data_to_predict = data_to_predict[~data_to_predict[factors].isna().any(axis=1)]

data_to_predict



Unnamed: 0,Ticker,Date,Price,Size,Value,1M_Log_Ret,1M_Simple_Ret,Momentum,1Y Beta,Market Risk Premium
0,MXAT,3/31/2024,162.7497,51693.87,0.9667,,,0.97,0.74,0.1615
24,MXIT,3/31/2024,97.6498,669639.0,1.4402,,,1.1811,0.74,0.1615
23,MXIN,3/31/2024,31.6731,3124550.0,4.269,,,1.2899,0.27,0.1615
25,MXJP,3/31/2024,11.3179,5108855.0,1.6728,,,1.1998,0.29,0.1615
26,MXKR,3/31/2024,0.6519,1419998.0,1.105,,,1.0794,0.4,0.1615
27,MXMX,3/31/2024,7076.72,398119.9,2.2294,,,1.056,0.78,0.1615
28,MXMY,3/31/2024,101.7989,230020.9,1.392,,,1.0116,0.14,0.1615
29,MXNL,3/31/2024,294.0434,1054428.0,3.4434,,,1.2044,1.08,0.1615
30,MXNO,3/31/2024,317.5905,217878.9,1.7419,,,0.9703,0.3,0.1615
31,MXNZ,3/31/2024,95.2216,40372.53,2.829,,,0.9245,0.2,0.1615


In [86]:
# Ignore

def build_date_df(date_string):
    df = pd.DataFrame()
    df["Date"] = [date_string for i in range(len(index_cols))]
    df["Index"] = index_cols
    df["PX_LAST"] = list(data_ml[(data_ml['Date'] == date_string) & (data_ml['Field'] == "PX_LAST")].values[0][2:])
    df["CUR_MKT_CAP"] = list(data_ml[(data_ml['Date'] == date_string) & (data_ml['Field'] == "CUR_MKT_CAP")].values[0][2:])
    df["PX_TO_BOOK_RATIO"] = list(data_ml[(data_ml['Date'] == date_string) & (data_ml['Field'] == "PX_TO_BOOK_RATIO")].values[0][2:])

    df['Date'] = pd.to_datetime(df['Date'])
    df = df.dropna()
    return df
    

In [124]:
# Building inputs for our lasso and ridge models

filtered_df_X = data_ml.dropna()
factors = ["Size", "Value", "Momentum", "Market Risk Premium"]
X = filtered_df_X[factors].values
Y = filtered_df_X["1M_Simple_Ret"].values
logY = filtered_df_X["1M_Log_Ret"].values

In [125]:
# Lasso
def generate_lasso_model(alpha, Y):
    lasso = sklm.Lasso(alpha=alpha)
    lasso.fit(X, Y)

    return lasso

lasso_model = generate_lasso_model(0.1, Y)

In [None]:
results = lasso_model.predict(X)

for i in range(len(results)):
    print(abs(Y[i] - results[i]))


In [None]:
lasso_model.predict(data_to_predict[factors].values)

In [137]:
# Generate Lasso models with different alpha values
alpha_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
lasso_models = [generate_lasso_model(alpha, logY) for alpha in alpha_values]

# Evaluate model performance
for alpha, lasso_model in zip(alpha_values, lasso_models):
    Y_pred = lasso_model.predict(X)
    mse = mean_squared_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    print(f"Alpha={alpha}, MSE={mse:.9f}, R-squared={r2:.4f}")

Alpha=0.1, MSE=0.005521374, R-squared=-0.0026
Alpha=0.2, MSE=0.005521374, R-squared=-0.0026
Alpha=0.3, MSE=0.005521374, R-squared=-0.0026
Alpha=0.4, MSE=0.005521374, R-squared=-0.0026
Alpha=0.5, MSE=0.005521374, R-squared=-0.0026
Alpha=0.6, MSE=0.005521374, R-squared=-0.0026
Alpha=0.7, MSE=0.005521374, R-squared=-0.0026
Alpha=0.8, MSE=0.005521374, R-squared=-0.0026
Alpha=0.9, MSE=0.005521374, R-squared=-0.0026
Alpha=1.0, MSE=0.005521374, R-squared=-0.0026


In [127]:
def generate_ridge_model(alpha, Y):
    ridge = sklm.Ridge(alpha)
    ridge.fit(X,Y)

    return ridge

ridge_model = generate_ridge_model(0.1, logY)

In [128]:
ridge_model.predict(data_to_predict[factors].values)

array([-3.24936756e-04,  1.07103357e-03,  9.74856555e-04,  1.64403038e-03,
        5.69300200e-04, -1.88993612e-04, -1.80866048e-04,  4.53257581e-04,
       -6.24605304e-04, -1.42671816e-03,  1.23699023e-03, -2.98232870e-05,
        2.15075203e-03, -1.57147192e-03, -4.74670686e-05, -8.22554147e-04,
       -1.42434463e-03,  1.13898012e-03,  8.12161827e-04,  1.12952793e-03,
        5.91922507e-05, -1.08367884e-03,  2.25103230e-03, -2.74662889e-03,
       -7.12210969e-04,  1.10726239e-03,  5.99529641e-05, -1.19547148e-03,
       -8.19707352e-04, -2.65142071e-04,  1.21777826e-03,  5.73119367e-03,
       -1.98555091e-03, -1.01648111e-03,  2.36963572e-03,  1.71897503e-04,
       -1.15266655e-03, -7.47201137e-05, -3.58686878e-04,  2.14804279e-03,
       -1.47771706e-03,  1.79084748e-04, -1.13374309e-03, -5.80396106e-04,
       -6.95432558e-03])

In [138]:
def generate_elastic_net(alpha, Y):
    elastic_net = sklm.ElasticNet(alpha)
    elastic_net.fit(X,Y)

    return elastic_net

In [139]:
# Generate Lasso models with different alpha values
alpha_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
elastic_models = [generate_elastic_net(alpha, Y) for alpha in alpha_values]

# Evaluate model performance
for alpha, elastic_model in zip(alpha_values, elastic_models):
    Y_pred = elastic_model.predict(X)
    mse = mean_squared_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    print(f"Alpha={alpha}, MSE={mse:.9f}, R-squared={r2:.4f}")

Alpha=0.1, MSE=0.005507232, R-squared=0.0000
Alpha=0.2, MSE=0.005507232, R-squared=0.0000
Alpha=0.3, MSE=0.005507232, R-squared=0.0000
Alpha=0.4, MSE=0.005507232, R-squared=0.0000
Alpha=0.5, MSE=0.005507232, R-squared=0.0000
Alpha=0.6, MSE=0.005507232, R-squared=0.0000
Alpha=0.7, MSE=0.005507232, R-squared=0.0000
Alpha=0.8, MSE=0.005507232, R-squared=0.0000
Alpha=0.9, MSE=0.005507232, R-squared=0.0000
Alpha=1.0, MSE=0.005507232, R-squared=0.0000
