In [None]:
# import required libraries and load dataset
import sys

import numpy as np
import pandas as pd
# NOTE(review): load_boston was removed in scikit-learn 1.2; this import is unused
# in the notebook and raises ImportError on newer scikit-learn versions.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Compare a linear baseline against several ensemble regressors on the ESG data:
# hold-out RMSE / MAE first, then 5-fold cross-validated R^2.

# load the dataset and pick the feature matrix X and the target y
file = pd.read_csv("imputed_with_outliers.csv")
columns_selected = [
    'Total Current Assets', 'Total Current Liabilities',
    'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
    'Revenue Per Share', 'Total Revenue', 'Total Equity',
    'Total CO2 Equivalent Emissions To Revenues USD in million',
    'Company Market Capitalization',
    'Property Plant And Equipment, Total - Gross',
    'P/E (Daily Time Series Ratio)', 'returns_yearly'
]
X = file[columns_selected]
y = file['ESG Score']

# split BEFORE scaling so the scaler never sees the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# fit the min-max scaler on the training rows only, then apply it to both parts
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# linear regression (baseline, and meta-regressor of the stack)
lr = LinearRegression()

# initialize the ensemble regression models
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
ab = AdaBoostRegressor(n_estimators=100, random_state=42)
bg = BaggingRegressor(n_estimators=100, random_state=42)

# stacking and voting ensembles built on the same four base models
base_estimators = [('rf', rf), ('gb', gb), ('ab', ab), ('bg', bg)]
stack = StackingRegressor(estimators=base_estimators, final_estimator=lr)
vote = VotingRegressor(estimators=base_estimators)

# fit the models on the training data
for model in (rf, gb, ab, bg, stack, vote, lr):
    model.fit(X_train, y_train)

# make predictions on the testing data
rf_pred = rf.predict(X_test)
gb_pred = gb.predict(X_test)
ab_pred = ab.predict(X_test)
bg_pred = bg.predict(X_test)
stack_pred = stack.predict(X_test)
vote_pred = vote.predict(X_test)
lr_pred = lr.predict(X_test)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# calculate the root mean squared error of each model
lr_rmse = _rmse(y_test, lr_pred)
rf_rmse = _rmse(y_test, rf_pred)
gb_rmse = _rmse(y_test, gb_pred)
ab_rmse = _rmse(y_test, ab_pred)
bg_rmse = _rmse(y_test, bg_pred)
stack_rmse = _rmse(y_test, stack_pred)
vote_rmse = _rmse(y_test, vote_pred)

# calculate the mean absolute error (MAE) of each model
# (no root is taken; the `_rmae` variable names are kept only for compatibility)
lr_rmae = mean_absolute_error(y_test, lr_pred)
rf_rmae = mean_absolute_error(y_test, rf_pred)
gb_rmae = mean_absolute_error(y_test, gb_pred)
ab_rmae = mean_absolute_error(y_test, ab_pred)
bg_rmae = mean_absolute_error(y_test, bg_pred)
stack_rmae = mean_absolute_error(y_test, stack_pred)
vote_rmae = mean_absolute_error(y_test, vote_pred)

# print the RMSE of each model
for label, value in [
    ("Linear Regression", lr_rmse),
    ("Random Forest", rf_rmse),
    ("Gradient Boosting", gb_rmse),
    ("AdaBoost", ab_rmse),
    ("Bagging", bg_rmse),
    ("Stacking", stack_rmse),
    ("Voting", vote_rmse),
]:
    print(f"{label} RMSE:", value)

# print the MAE of each model
for label, value in [
    ("Linear Regression", lr_rmae),
    ("Random Forest", rf_rmae),
    ("Gradient Boosting", gb_rmae),
    ("AdaBoost", ab_rmae),
    ("Bagging", bg_rmae),
    ("Stacking", stack_rmae),
    ("Voting", vote_rmae),
]:
    print(f"{label} MAE:", value)

# create a KFold object with 5 shuffled splits
folds = KFold(n_splits=5, shuffle=True, random_state=100)


def _cv_r2(model):
    """Return the per-fold R^2 scores of `model` on the unscaled X.

    The model is wrapped in a pipeline with its own MinMaxScaler so the scaler
    is re-fit on the training part of every fold instead of on all rows.
    """
    return cross_val_score(make_pipeline(MinMaxScaler(), model), X, y, cv=folds, scoring='r2')


# perform cross-validation and calculate the r2_score for each model
rf_scores = _cv_r2(rf)
gb_scores = _cv_r2(gb)
ab_scores = _cv_r2(ab)
bg_scores = _cv_r2(bg)
stack_scores = _cv_r2(stack)
vote_scores = _cv_r2(vote)
lr_scores = _cv_r2(lr)

# print the mean cross-validated r2_score of each model
for label, scores in [
    ("Random Forest", rf_scores),
    ("Gradient Boosting", gb_scores),
    ("AdaBoost", ab_scores),
    ("Bagging", bg_scores),
    ("Stacking", stack_scores),
    ("Voting", vote_scores),
    ("Linear", lr_scores),
]:
    print(f"{label} r2_score:", np.mean(scores))

In [None]:
# import required libraries and load dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
# Tune the random forest with a grid search and report its hold-out RMSE / MAE
# together with the best cross-validated R^2 found by the search.

# load the dataset and pick the feature matrix X and the target y
file = pd.read_csv("imputed_with_outliers.csv")
columns_selected = [
    'Total Current Assets', 'Total Current Liabilities',
    'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
    'Revenue Per Share', 'Total Revenue', 'Total Equity',
    'Total CO2 Equivalent Emissions To Revenues USD in million',
    'Company Market Capitalization',
    'Property Plant And Equipment, Total - Gross',
    'P/E (Daily Time Series Ratio)', 'returns_yearly'
]
X = file[columns_selected]
y = file['ESG Score']

# split BEFORE scaling so the scaler never sees the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# fit the min-max scaler on the training rows only, then apply it to both parts
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# base estimator; n_estimators and max_depth are varied by the grid below
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# hyperparameter grid searched for the random forest
rf_param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15]}

# grid search with 5-fold cross-validation on the training data, scored by R^2
rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='r2')
rf_grid.fit(X_train, y_train)

# make predictions on the testing data with the tuned model
rf_pred = rf_grid.predict(X_test)

# hold-out error metrics of the tuned random forest.
# Only the random forest is evaluated here: no other model is fitted in this
# cell, so no other model's metrics are computed.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
# mean absolute error (no root is taken; the `_rmae` name is kept for compatibility)
rf_rmae = mean_absolute_error(y_test, rf_pred)

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_rmae)

# shuffled 5-fold splitter; defined for parity with the previous cell but not
# used by the grid search above, which runs with cv=5
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# best hyperparameters and the corresponding mean cross-validated R^2
print("Random Forest Regression best parameters:", rf_grid.best_params_)
print("Random Forest Regression r2_score:", rf_grid.best_score_)