In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
# NOTE(review): load_boston is not used in the visible cells and was removed in
# scikit-learn 1.2, where this line raises ImportError - confirm it can go.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [2]:
# Read the imputed dataset from disk and discard its 'Unnamed: 0' column in one step.
df = pd.read_csv('data/imputed_data_with_outliers.csv').drop(columns=['Unnamed: 0'])
# Bare last expression: the notebook renders the resulting frame.
df

Unnamed: 0,Current Ratio,Debt-to-Equity Ratio,ESG Score,P/E (Daily Time Series Ratio),"Property Plant And Equipment, Total - Gross",ROA,ROE,RPE,Revenue Per Share,Total CO2 Equivalent Emissions To Revenues USD in million,returns_yearly
0,-0.667320,0.787649,-0.659333,-0.045664,-0.335405,0.059379,0.738860,-0.015340,-0.077165,-0.257039,1.440343
1,-0.427523,0.372604,-0.242713,-0.078255,-0.334950,0.185157,0.396091,-0.014311,-0.077015,-0.259776,-0.591381
2,-0.240483,0.124141,0.656275,-0.081795,-0.335582,0.187233,0.146383,-0.016007,-0.078394,-0.257494,-0.247656
3,-0.659813,0.127412,-0.338854,-0.099151,-0.328373,0.033950,0.106932,-0.012221,-0.079496,-0.251915,-1.333431
4,-0.660993,0.078395,0.473424,-0.075796,-0.325406,0.162415,0.094132,-0.016535,-0.079223,-0.254051,0.762454
...,...,...,...,...,...,...,...,...,...,...,...
10886,-0.220308,0.000018,1.651741,0.035506,-0.291608,-0.709930,-0.051087,-0.020367,-0.029422,-0.277715,-0.301922
10887,-0.884935,0.050266,-1.042968,-0.127408,-0.326207,-0.070402,0.018275,-0.015050,-0.088309,3.798012,-1.812004
10888,-0.723130,0.087478,-1.416022,-0.101698,-0.332562,-0.871319,-0.063123,-0.017572,-0.088386,2.940274,1.015303
10889,-0.674440,0.012810,-1.155118,-0.115011,-0.324982,-0.457219,-0.033449,-0.019854,-0.088323,3.955348,2.118901


In [3]:
# 'ESG Score' is the regression target; every remaining column is a feature.
y = df['ESG Score']
X = df.drop(columns='ESG Score')

In [None]:
# Baseline and tree-ensemble regressors for predicting y from X.
# Seeds are fixed so repeated runs give the same fits.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)


# Hyperparameter grids searched exhaustively for each tunable model.
rf_param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]}
gb_param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15], 'learning_rate': [0.1, 0.5, 1.0]}

# 5-fold cross-validated grid search scored by R^2.
# n_jobs=-1 runs the candidate fits in parallel; it does not change the scores.
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='r2', n_jobs=-1)
rf_grid.fit(X, y)

gb_grid = GridSearchCV(gb, gb_param_grid, cv=5, scoring='r2', n_jobs=-1)
gb_grid.fit(X, y)

# Linear regression has nothing to tune, but it must be scored the same way as the
# grid searches (mean 5-fold CV R^2). Scoring it on the data it was fitted on gives
# an in-sample figure that is not comparable with best_score_.
lr_cv_r2 = cross_val_score(lr, X, y, cv=5, scoring='r2').mean()
# cross_val_score only fits clones; fit lr itself so it is left fitted on the full data.
lr.fit(X, y)

# Report the cross-validated R^2 of every model and the winning hyperparameters.
print("Simple Linear Regression r2_score:", lr_cv_r2)
print("Random Forest Regression best parameters:", rf_grid.best_params_)
print("Random Forest Regression r2_score:", rf_grid.best_score_)
print("Gradient Boosting Regression best parameters:", gb_grid.best_params_)
print("Gradient Boosting Regression r2_score:", gb_grid.best_score_)


In [None]:
# Additional base learners; seeds are fixed so repeated runs give the same fits.
ab = AdaBoostRegressor(random_state=42)
bg = BaggingRegressor(random_state=42)

# Stacked ensemble: the four base learners feed the linear regression meta-regressor.
# NOTE(review): rf/gb/ab/bg are passed with their default hyperparameters, not the
# tuned settings found by the grid searches - confirm this is intended.
stack = StackingRegressor(estimators=[('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)], final_estimator=lr)

# Voting ensemble: weighted average over all five regressors.
vote = VotingRegressor(estimators=[('lr', lr), ('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)])


ab_param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}
bg_param_grid = {'n_estimators': [50, 100, 200], 'max_samples': [0.5, 0.8, 1.0]}
# Only fit_intercept is tuned on the meta-regressor. LinearRegression's `normalize`
# option was deprecated in scikit-learn 1.0 and removed in 1.2 (where it makes the
# search fail), and for ordinary least squares it did not change the predictions,
# so searching over it only doubled the number of fits.
stack_param_grid = {
    'final_estimator__fit_intercept': [True, False],
}

# Weights follow the estimator order lr, rf, gb, ab, bg: equal weights first,
# then four candidates that double the weight of rf, gb, ab and bg in turn.
vote_param_grid = {
    'weights': [[1, 1, 1, 1, 1], [1, 2, 1, 1, 1], [1, 1, 2, 1, 1], [1, 1, 1, 2, 1], [1, 1, 1, 1, 2]],
}


# 5-fold cross-validated grid search scored by R^2 for each model.
# n_jobs=-1 runs the candidate fits in parallel; it does not change the scores.
ab_grid = GridSearchCV(ab, ab_param_grid, cv=5, scoring='r2', n_jobs=-1)
ab_grid.fit(X, y)

bg_grid = GridSearchCV(bg, bg_param_grid, cv=5, scoring='r2', n_jobs=-1)
bg_grid.fit(X, y)

stack_grid = GridSearchCV(stack, stack_param_grid, cv=5, scoring='r2', n_jobs=-1)
stack_grid.fit(X, y)

vote_grid = GridSearchCV(vote, vote_param_grid, cv=5, scoring='r2', n_jobs=-1)
vote_grid.fit(X, y)

# Report the best mean cross-validated R^2 and the settings that achieved it.
print("AdaBoost Regression best parameters:", ab_grid.best_params_)
print("AdaBoost Regression r2_score:", ab_grid.best_score_)
print("Bagging Regression best parameters:", bg_grid.best_params_)
print("Bagging Regression r2_score:", bg_grid.best_score_)
print("Stacking Regression best parameters:", stack_grid.best_params_)
print("Stacking Regression r2_score:", stack_grid.best_score_)
print("Voting Regression best parameters:", vote_grid.best_params_)
print("Voting Regression r2_score:", vote_grid.best_score_)