In [153]:
import pandas as pd
import numpy as np
import sklearn.linear_model as sklm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


In [118]:
# Load the factor/return data; the file is comma-separated, which is read_csv's default
data_ml = pd.read_csv("CountryFactorData (Good Data).csv")

In [160]:

# Sort the DataFrame chronologically by the 'Date' column.
# 'Date' is loaded from the CSV as text (e.g. "3/31/2024"), so sorting the raw
# column orders rows lexicographically ("10/..." before "2/...").  Sort on a
# parsed temporary key instead; values that cannot be parsed become NaT and are
# placed last rather than raising.  The 'Date' column itself is left unchanged.
data_ml = (
    data_ml
    .assign(_date_key=pd.to_datetime(data_ml['Date'], errors='coerce'))
    .sort_values(by='_date_key')
    .drop(columns='_date_key')
)
factors = ["Size", "Value", "Momentum", "Market Risk Premium"]

# Rows with no recorded returns (both return columns missing) but with every
# factor present: these are the observations whose returns we can predict.
returns_missing = data_ml[['1M_Log_Ret', '1M_Simple_Ret']].isna().all(axis=1)
factors_present = ~data_ml[factors].isna().any(axis=1)
data_to_predict = data_ml[returns_missing & factors_present]


In [124]:
# Assemble the design matrix and the two targets for the Elastic Net models
factors = ["Size", "Value", "Momentum", "Market Risk Premium"]

# NOTE(review): dropna() removes a row if *any* column is missing, not only the
# factor/return columns used below - confirm that this is intended.
filtered_df_X = data_ml.dropna()

X = filtered_df_X[factors].values
Y, logY = (
    filtered_df_X[target].values
    for target in ("1M_Simple_Ret", "1M_Log_Ret")
)

In [140]:
'''
Here is the definition of the l1_ratio parameter, taken from the scikit-learn documentation:
l1_ratio : float, default=0.5
The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an L2 penalty.
For l1_ratio = 1 it is an L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
'''

def generate_elastic_net(alpha, Y, features=None, penalty_strength=1.0):
    """Fit an Elastic Net regression and return the fitted estimator.

    Parameters
    ----------
    alpha : float
        NOTE: despite its name, this value is passed to scikit-learn as
        ``l1_ratio`` (the L1/L2 mixing ratio, between 0 and 1).  It is NOT
        scikit-learn's ``alpha`` (the overall penalty strength).  The name is
        kept so existing calls keep working.
    Y : array-like
        Regression target, one value per row of the feature matrix.
    features : array-like, optional
        Feature matrix to fit on.  When omitted, the notebook-level ``X`` is
        used, which is what this function always did before.
    penalty_strength : float, default 1.0
        Forwarded to scikit-learn's ``alpha``.  The default equals
        scikit-learn's own default, so existing calls behave as before.

    Returns
    -------
    sklearn.linear_model.ElasticNet
        The fitted model.
    """
    if features is None:
        # Backward-compatible fallback to the global design matrix.
        features = X

    elastic_net = sklm.ElasticNet(alpha=penalty_strength, l1_ratio=alpha)
    elastic_net.fit(features, Y)

    return elastic_net

In [149]:
# Fit one Elastic Net per candidate value.  The values are called "alpha" here,
# but generate_elastic_net hands each one to ElasticNet as its l1_ratio.
alpha_values = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
elastic_models = []
for alpha in alpha_values:
    elastic_models.append(generate_elastic_net(alpha, Y))

# Report the fit of each model, scored on the same X, Y it was trained on
for position, alpha in enumerate(alpha_values):
    elastic_model = elastic_models[position]
    Y_pred = elastic_model.predict(X)
    mse, r2 = mean_squared_error(Y, Y_pred), r2_score(Y, Y_pred)
    print(f"Alpha={alpha}, MSE={mse:.9f}, R-squared={r2:.4f}")

Alpha=0.01, MSE=0.005507232, R-squared=0.0000
Alpha=0.1, MSE=0.005507232, R-squared=0.0000
Alpha=0.2, MSE=0.005507232, R-squared=0.0000
Alpha=0.3, MSE=0.005507232, R-squared=0.0000
Alpha=0.4, MSE=0.005507232, R-squared=0.0000
Alpha=0.5, MSE=0.005507232, R-squared=0.0000
Alpha=0.6, MSE=0.005507232, R-squared=0.0000
Alpha=0.7, MSE=0.005507232, R-squared=0.0000
Alpha=0.8, MSE=0.005507232, R-squared=0.0000
Alpha=0.9, MSE=0.005507232, R-squared=0.0000
Alpha=1.0, MSE=0.005507232, R-squared=0.0000


In [150]:
# Contrast a mostly-L2 penalty (l1_ratio=0.1) with a mostly-L1 penalty
# (l1_ratio=0.9); both models are fitted and scored on the same X, Y.
e1 = sklm.ElasticNet(l1_ratio=0.1).fit(X, Y)
e2 = sklm.ElasticNet(l1_ratio=0.9).fit(X, Y)

Y_pred1, Y_pred2 = e1.predict(X), e2.predict(X)
mse1 = mean_squared_error(Y, Y_pred1)
mse2 = mean_squared_error(Y, Y_pred2)

print(mse1, mse2)
print("There is no meaningful difference in the l1_ratio chosen between 0 and 1")



0.005507232455332606 0.005507232455407054
There is no meaningful difference in the l1_ratio chosen between 0 and 1


In [156]:
# We can attempt a cross-validation grid search to find the optimal alpha and l1_ratio parameters
'''
Here is the definition of alpha from scikit-learn:
Constant that multiplies the penalty terms (both the L1 and L2 parts), controlling regularization strength.
alpha must be a non-negative float, i.e. in [0, inf).
When alpha = 0, the objective is equivalent to ordinary least squares, solved by the LinearRegression object.
'''

# Candidate hyperparameters; every (alpha, l1_ratio) combination is evaluated
param_grid = {
    'alpha': [0.1, 0.5, 1.0],
    'l1_ratio': [0.2, 0.5, 0.8],
}
e = sklm.ElasticNet()

# Exhaustive search over the grid with 5-fold cross-validation on X, Y
grid_search = GridSearchCV(e, param_grid, cv=5)
grid_search.fit(X, Y)
best_model = grid_search.best_estimator_

# NOTE(review): if the selected values sit on the edge of the grid, the search
# range may be too narrow - consider widening it.
print(f"Best Model: {best_model}")
print(f"The optimal alpha is {best_model.alpha} and the optimal l1_ratio is {best_model.l1_ratio}")


Best Model: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.8,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
The optimal alpha is 1.0 and the optimal l1_ratio is 0.8


In [169]:
# Using the chosen parameters we can predict the returns and log returns for new data.
# data_to_predict holds rows that have factor data but no recorded returns.
country_indices = data_to_predict["Ticker"]
dates = data_to_predict["Date"]

# Keep data_to_predict as a DataFrame and store the feature array under its own
# name.  Overwriting data_to_predict with a bare array would make this cell fail
# when re-run, because the column lookups above need a labelled frame.
X_to_predict = data_to_predict[factors].values

# One model per target: simple returns and log returns, same hyperparameters
final_model = sklm.ElasticNet(alpha=1.0, l1_ratio=0.8)
final_model.fit(X, Y)

final_model_log = sklm.ElasticNet(alpha=1.0, l1_ratio=0.8)
final_model_log.fit(X, logY)

Y_pred_final = final_model.predict(X_to_predict)
Y_pred_final_log = final_model_log.predict(X_to_predict)


In [177]:
# NOTE(review): this headline is a fixed string; it is not derived from final_df.
print("MXEG or the Egypt Index is predicted to have the best return")

# Put both sets of predictions next to the date and ticker they belong to
prediction_columns = {
    "Date": dates,
    "Country Ticker": country_indices,
    "1M_Predicted_Returns": Y_pred_final,
    "1M_Predicted_LogReturns": Y_pred_final_log,
}
final_df = pd.DataFrame(prediction_columns)

final_df

MXEG or the Egypt Index is predicted to have the best return


Unnamed: 0,Date,Country Ticker,1M_Predicted_Returns,1M_Predicted_LogReturns
9,3/31/2024,MXCZ,0.005069,0.001189
2,3/31/2024,MXBE,0.00505,0.001211
3,3/31/2024,MXBR,0.005017,0.001246
4,3/31/2024,MXCA,0.004887,0.001391
5,3/31/2024,MXCH,0.004926,0.001347
6,3/31/2024,MXCL,0.005064,0.001194
7,3/31/2024,MXCN,0.00437,0.001964
8,3/31/2024,MXCO,0.00507,0.001188
42,3/31/2024,MXUS,0.001123,0.005566
11,3/31/2024,MXDK,0.005014,0.00125


In [176]:
# Locate the row with the largest predicted simple return
predicted_returns = final_df['1M_Predicted_Returns']
max_row_index = predicted_returns.idxmax()
max_row = final_df.loc[max_row_index]

max_row


Date                        3/31/2024
Country Ticker                   MXEG
1M_Predicted_Returns       0.00507064
1M_Predicted_LogReturns    0.00118718
Name: 12, dtype: object