In [31]:
# import required libraries and load dataset
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


In [12]:
# Read the prepared CSV and discard the 'Unnamed: 0' column it carries,
# then show the resulting frame as the cell output.
df = pd.read_csv('data/imputed_data_with_outliers.csv').drop(columns='Unnamed: 0')
df

Unnamed: 0,Current Ratio,Debt-to-Equity Ratio,ESG Score,P/E (Daily Time Series Ratio),"Property Plant And Equipment, Total - Gross",ROA,ROE,RPE,Revenue Per Share,Total CO2 Equivalent Emissions To Revenues USD in million,returns_yearly
0,-0.667320,0.787649,-0.659333,-0.045664,-0.335405,0.059379,0.738860,-0.015340,-0.077165,-0.257039,1.440343
1,-0.427523,0.372604,-0.242713,-0.078255,-0.334950,0.185157,0.396091,-0.014311,-0.077015,-0.259776,-0.591381
2,-0.240483,0.124141,0.656275,-0.081795,-0.335582,0.187233,0.146383,-0.016007,-0.078394,-0.257494,-0.247656
3,-0.659813,0.127412,-0.338854,-0.099151,-0.328373,0.033950,0.106932,-0.012221,-0.079496,-0.251915,-1.333431
4,-0.660993,0.078395,0.473424,-0.075796,-0.325406,0.162415,0.094132,-0.016535,-0.079223,-0.254051,0.762454
...,...,...,...,...,...,...,...,...,...,...,...
10886,-0.220308,0.000018,1.651741,0.035506,-0.291608,-0.709930,-0.051087,-0.020367,-0.029422,-0.277715,-0.301922
10887,-0.884935,0.050266,-1.042968,-0.127408,-0.326207,-0.070402,0.018275,-0.015050,-0.088309,3.798012,-1.812004
10888,-0.723130,0.087478,-1.416022,-0.101698,-0.332562,-0.871319,-0.063123,-0.017572,-0.088386,2.940274,1.015303
10889,-0.674440,0.012810,-1.155118,-0.115011,-0.324982,-0.457219,-0.033449,-0.019854,-0.088323,3.955348,2.118901


In [16]:
# Summary statistics, reporting the 90th and 99th percentiles.
df.describe(percentiles=[0.9, 0.99])

Unnamed: 0,Current Ratio,Debt-to-Equity Ratio,ESG Score,P/E (Daily Time Series Ratio),"Property Plant And Equipment, Total - Gross",ROA,ROE,RPE,Revenue Per Share,Total CO2 Equivalent Emissions To Revenues USD in million,returns_yearly
count,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0,10891.0
mean,-1.657128e-16,-1.304826e-18,9.264261000000001e-17,6.785093e-17,7.828953e-18,3.392546e-17,0.0,4.0775800000000005e-17,3.783994e-17,3.5230290000000005e-17,1.158033e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.509083,-86.21671,-3.253113,-0.1471012,-0.3613957,-5.85247,-59.881914,-0.02465539,-0.08939611,-0.2813964,-6.571832
50%,-0.2390459,-0.003513982,0.09170597,-0.07459471,-0.2885142,-0.1657153,-0.008096,-0.01641564,-0.06044032,-0.2500261,-0.009564503
90%,1.052619,0.08573281,1.254923,0.03394676,0.479086,0.8359056,0.082825,0.005837578,0.04211599,0.2804717,1.136862
99%,3.731461,0.5151099,1.81937,0.9328079,5.146209,2.566115,0.745379,0.06810633,0.5352868,4.082149,2.662085
max,12.92817,25.78137,2.103325,53.90407,13.76552,32.7444,28.563646,104.23,77.58222,22.00476,7.823418


In [20]:
# Target is the 'ESG Score' column; every other column is used as a feature.
y = df['ESG Score']
X = df.drop(columns=['ESG Score'])

In [None]:
# Hold out the test rows BEFORE rescaling. Fitting the scaler on the full
# feature matrix would let the test rows' per-feature min/max influence the
# training inputs (train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Learn each feature's min/max from the training rows only and reuse that
# mapping for the test rows. `X` is intentionally not overwritten, so
# re-running this cell always starts from the same input.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Linear Regression (evaluated on its own and passed as the stacking meta-regressor)
lr = LinearRegression()

# initialize the ensemble regression models
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
ab = AdaBoostRegressor(n_estimators=100, random_state=42)
bg = BaggingRegressor(n_estimators=100, random_state=42)

# stack the models and define the meta-regressor
stack = StackingRegressor(estimators=[('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)], final_estimator=lr)

# define the voting regressor
vote = VotingRegressor(estimators=[('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)])

# fit the models on the training data
for model in (rf, gb, ab, bg, stack, vote, lr):
    model.fit(X_train, y_train)

# make predictions on the testing data
rf_pred = rf.predict(X_test)
gb_pred = gb.predict(X_test)
ab_pred = ab.predict(X_test)
bg_pred = bg.predict(X_test)
stack_pred = stack.predict(X_test)
vote_pred = vote.predict(X_test)
lr_pred = lr.predict(X_test)

# calculate the root mean squared error of each model
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
ab_rmse = np.sqrt(mean_squared_error(y_test, ab_pred))
bg_rmse = np.sqrt(mean_squared_error(y_test, bg_pred))
stack_rmse = np.sqrt(mean_squared_error(y_test, stack_pred))
vote_rmse = np.sqrt(mean_squared_error(y_test, vote_pred))

# calculate the mean absolute error (MAE) of each model.
# No square root is taken here, so these are plain MAE values; the `*_rmae`
# variable names are kept only so other cells that refer to them still work.
lr_rmae = mean_absolute_error(y_test, lr_pred)
rf_rmae = mean_absolute_error(y_test, rf_pred)
gb_rmae = mean_absolute_error(y_test, gb_pred)
ab_rmae = mean_absolute_error(y_test, ab_pred)
bg_rmae = mean_absolute_error(y_test, bg_pred)
stack_rmae = mean_absolute_error(y_test, stack_pred)
vote_rmae = mean_absolute_error(y_test, vote_pred)

# print the RMSE of each model
for label, value in (
    ("Linear Regression RMSE:", lr_rmse),
    ("Random Forest RMSE:", rf_rmse),
    ("Gradient Boosting RMSE:", gb_rmse),
    ("AdaBoost RMSE:", ab_rmse),
    ("Bagging RMSE:", bg_rmse),
    ("Stacking RMSE:", stack_rmse),
    ("Voting RMSE:", vote_rmse),
):
    print(label, value)

# print the MAE of each model (previously mislabelled as "RMAE")
for label, value in (
    ("Linear Regression MAE:", lr_rmae),
    ("Random Forest MAE:", rf_rmae),
    ("Gradient Boosting MAE:", gb_rmae),
    ("AdaBoost MAE:", ab_rmae),
    ("Bagging MAE:", bg_rmae),
    ("Stacking MAE:", stack_rmae),
    ("Voting MAE:", vote_rmae),
):
    print(label, value)



In [41]:
# 5-fold splitter, shuffled with a fixed seed, shared by every model below
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# Cross-validate each model in turn (R^2 scoring) on the same folds.
# The generator is consumed left to right, so the models are evaluated in
# the order they are listed.
rf_scores, gb_scores, ab_scores, bg_scores, stack_scores, vote_scores, lr_scores = (
    cross_val_score(estimator, X, y, cv=folds, scoring='r2')
    for estimator in (rf, gb, ab, bg, stack, vote, lr)
)

# report the mean R^2 across the folds for each model
for label, fold_scores in (
    ("Random Forest r2_score:", rf_scores),
    ("Gradient Boosting r2_score:", gb_scores),
    ("AdaBoost r2_score:", ab_scores),
    ("Bagging r2_score:", bg_scores),
    ("Stacking r2_score:", stack_scores),
    ("Voting r2_score:", vote_scores),
    ("Linear r2_score:", lr_scores),
):
    print(label, np.mean(fold_scores))

Random Forest r2_score: 0.45684120101975323
Gradient Boosting r2_score: 0.3015886749225739
AdaBoost r2_score: 0.18451487470856243
Bagging r2_score: 0.4567097502570161
Stacking r2_score: 0.36419590811909186
Voting r2_score: 0.38901181383159067
Linear r2_score: 0.06725405680828223


In [24]:
# split the dataset into training and testing sets BEFORE rescaling, so the
# scaler is never fitted on test rows (avoids train/test leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# rescale the features: learn each feature's min/max from the training rows
# only, then apply that same mapping to the test rows. `X` itself is left
# untouched so this cell gives the same result every time it is re-run.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Random Forest without and with cross-validation

In [40]:
# Random Forest: train on the training split, predict the held-out split
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# hold-out metrics: root mean squared error and R^2
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest RMSE:", rf_rmse)
print("Random Forest R2:", rf_r2)

Random Forest RMSE: 0.7499096097076687
Random Forest R2: 0.44206904972644634


In [28]:
# R^2 of the random forest on each fold of `folds`, then the mean over folds
rf_scores = cross_val_score(estimator=rf, X=X, y=y, scoring='r2', cv=folds)
print("Random Forest r2_score:", np.mean(rf_scores))

Random Forest r2_score: 0.45684120101975323


Gradient Boosting without and with cross-validation

In [42]:
# Gradient Boosting: train on the training split, predict the held-out split
gb = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# hold-out metrics: root mean squared error and R^2
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
gb_r2 = r2_score(y_test, gb_pred)

print("Gradient Boost RMSE:", gb_rmse)
print("Gradient Boost R2:", gb_r2)

Gradient Boost RMSE: 0.8425812870189978
Gradient Boost R2: 0.29565370374747857


In [30]:
# R^2 of the gradient boosting model on each fold of `folds`, then the mean over folds
gb_scores = cross_val_score(estimator=gb, X=X, y=y, scoring='r2', cv=folds)
print("Gradient Boost r2_score:", np.mean(gb_scores))

Gradient Boost r2_score: 0.3015886749225739


In [35]:
# Markers: gradient-boosting predictions plotted against the true test targets
scatter = go.Scatter(
    x=y_test,
    y=gb_pred,
    mode='markers',
    name='y_predicted',
)

# Reference line y = x spanning the range of the true test targets
line = go.Scatter(
    x=[y_test.min(), y_test.max()],
    y=[y_test.min(), y_test.max()],
    mode='lines',
    name='Ideal',
)

# Assemble the figure trace by trace, then label the axes and add a title
fig = go.Figure()
fig.add_trace(scatter)
fig.add_trace(line)
fig.update_layout(
    title='Gradient Boost: Scatter Plot of y_test vs y_pred with Ideal Scenario',
    xaxis_title='y_test',
    yaxis_title='y_pred',
)
fig.show()


In [36]:
# AdaBoost: train on the training split, predict the held-out split
ab = AdaBoostRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
ab_pred = ab.predict(X_test)

# hold-out metrics: root mean squared error and R^2
ab_rmse = np.sqrt(mean_squared_error(y_test, ab_pred))
ab_r2 = r2_score(y_test, ab_pred)

print("AdaBoost RMSE:", ab_rmse)
print("AdaBoost R2:", ab_r2)




AdaBoost RMSE: 0.9011152107245075
AdaBoost R2: 0.19439295171030113


In [37]:
# Bagging: train on the training split, predict the held-out split
bg = BaggingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
bg_pred = bg.predict(X_test)

# hold-out metrics: root mean squared error and R^2
bg_rmse = np.sqrt(mean_squared_error(y_test, bg_pred))
bg_r2 = r2_score(y_test, bg_pred)

print("Bagging RMSE:", bg_rmse)
print("Bagging R2:", bg_r2)



Bagging RMSE: 0.7501263410116482
Bagging R2: 0.4417465079840396


In [38]:
# Stacking: rf, gb, ab and bg are the base estimators and `lr` is the
# final (meta) estimator; train on the training split, predict the test split
stack = StackingRegressor(
    estimators=[('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)],
    final_estimator=lr,
).fit(X_train, y_train)
stack_pred = stack.predict(X_test)

# hold-out metrics: root mean squared error and R^2
stack_rmse = np.sqrt(mean_squared_error(y_test, stack_pred))
stack_r2 = r2_score(y_test, stack_pred)

print("Stacking RMSE:", stack_rmse)
print("stacking R2:", stack_r2)






Stacking RMSE: 0.7383972206356333
stacking R2: 0.45906793917492206


In [39]:
# Voting: combine rf, gb, ab and bg in a VotingRegressor; train on the
# training split, predict the held-out split
vote = VotingRegressor(
    estimators=[('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)],
).fit(X_train, y_train)
vote_pred = vote.predict(X_test)

# hold-out metrics: root mean squared error and R^2
vote_rmse = np.sqrt(mean_squared_error(y_test, vote_pred))
vote_r2 = r2_score(y_test, vote_pred)

print("Voting RMSE:", vote_rmse)
print("Voting R2:", vote_r2)

Voting RMSE: 0.7922647699824452
Voting R2: 0.3772649578033088
