In [2]:
# import required libraries and load dataset
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


In [3]:

# Read the first training table from disk, then show its column names as the
# cell output.
data_without_outliers = pd.read_csv(
    "data/model_training_files/imputed_without_outliers.csv"
)
data_without_outliers.columns


Index(['Unnamed: 0', 'Total Current Assets', 'Total Current Liabilities',
       'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
       'Revenue Per Share', 'Total Revenue', 'Total Equity',
       'Total CO2 Equivalent Emissions To Revenues USD in million',
       'Company Market Capitalization',
       'Property Plant And Equipment, Total - Gross',
       'P/E (Daily Time Series Ratio)', 'returns_yearly', 'Current Ratio',
       'Debt-to-Equity Ratio', 'Return on Assets', 'Revenue Per Employee',
       'Return on Equity', 'Asset Turnover Ratio', 'Net Income Margin', 'RIC',
       'Year', 'ESG Score'],
      dtype='object')

In [4]:
# (number of rows, number of columns) of the frame loaded above
data_without_outliers.shape

(9781, 24)

In [5]:

# Read the second training table from disk, then show its column names as the
# cell output.
data_with_outliers = pd.read_csv(
    "data/model_training_files/imputed_with_outliers.csv"
)
data_with_outliers.columns

Index(['Unnamed: 0', 'Total Current Assets', 'Total Current Liabilities',
       'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
       'Revenue Per Share', 'Total Revenue', 'Total Equity',
       'Total CO2 Equivalent Emissions To Revenues USD in million',
       'Company Market Capitalization',
       'Property Plant And Equipment, Total - Gross',
       'P/E (Daily Time Series Ratio)', 'returns_yearly', 'Current Ratio',
       'Debt-to-Equity Ratio', 'Return on Assets', 'Revenue Per Employee',
       'Return on Equity', 'Asset Turnover Ratio', 'Net Income Margin', 'RIC',
       'Year', 'ESG Score'],
      dtype='object')

In [6]:
# (number of rows, number of columns) of the frame loaded above
data_with_outliers.shape

(17748, 24)

## Without Outliers

### 1. With Ratio Features (Absolute Totals Dropped)

In [9]:
# Candidate feature matrix: every column except 'Unnamed: 0', 'RIC', 'Year'
# and 'ESG Score' (the latter is used as the target y in the next cell).
X = data_without_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [10]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_without_outliers['ESG Score']

In [11]:
# Feature matrix for this experiment, built directly from the loaded frame
# instead of from X itself. `X = X.drop(...)` is not re-runnable: a second run
# raises KeyError because the columns are already gone (or AttributeError once
# X has been replaced by a scaled ndarray). Dropping everything in one step
# from data_without_outliers gives the same columns in the same order and is
# safe to re-execute.
X = data_without_outliers.drop(columns=[
    # non-feature columns and the target
    'Unnamed: 0', 'RIC', 'Year', 'ESG Score',
    # absolute totals excluded from this experiment
    'Total Current Assets', 'Total Current Liabilities',
    'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
    'Company Market Capitalization', 'Total Revenue', 'Total Equity',
])

In [12]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [13]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 14.760987448439323
Random Forest RMAE: 11.715122260715587
Random Forest r2_score: 0.3798076846847597


### 2. Without Ratio Features (Absolute Totals Kept)

In [19]:
# Candidate feature matrix: every column except 'Unnamed: 0', 'RIC', 'Year'
# and 'ESG Score' (the latter is used as the target y in the next cell).
X = data_without_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [20]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_without_outliers['ESG Score']

In [21]:
# Feature matrix for this experiment, built directly from the loaded frame
# instead of from X itself. `X = X.drop(...)` is not re-runnable: a second run
# raises KeyError because the columns are already gone (or AttributeError once
# X has been replaced by a scaled ndarray). Dropping everything in one step
# from data_without_outliers gives the same columns in the same order and is
# safe to re-execute.
X = data_without_outliers.drop(columns=[
    # non-feature columns and the target
    'Unnamed: 0', 'RIC', 'Year', 'ESG Score',
    # ratio columns excluded from this experiment
    'Current Ratio', 'Debt-to-Equity Ratio', 'Return on Assets',
    'Revenue Per Employee', 'Return on Equity', 'Asset Turnover Ratio',
    'Net Income Margin',
])

In [22]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [24]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 14.264062321926684
Random Forest RMAE: 11.236947882789654
Random Forest r2_score: 0.4175154093785916


### 3. All Features

In [25]:
# Feature matrix with all remaining columns: everything except 'Unnamed: 0',
# 'RIC', 'Year' and 'ESG Score' (the latter is used as the target y in the
# next cell).
X = data_without_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [26]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_without_outliers['ESG Score']

In [27]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [28]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 14.117138195264623
Random Forest RMAE: 11.125579905392717
Random Forest r2_score: 0.4257686155030672


## With Outliers

### 1. With Ratio Features (Absolute Totals Dropped)

In [20]:
# Candidate feature matrix: every column except 'Unnamed: 0', 'RIC', 'Year'
# and 'ESG Score' (the latter is used as the target y in the next cell).
X = data_with_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [21]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_with_outliers['ESG Score']

In [22]:
# Feature matrix for this experiment, built directly from the loaded frame
# instead of from X itself. `X = X.drop(...)` is not re-runnable: a second run
# raises KeyError because the columns are already gone (or AttributeError once
# X has been replaced by a scaled ndarray). Dropping everything in one step
# from data_with_outliers gives the same columns in the same order and is
# safe to re-execute.
X = data_with_outliers.drop(columns=[
    # non-feature columns and the target
    'Unnamed: 0', 'RIC', 'Year', 'ESG Score',
    # absolute totals excluded from this experiment
    'Total Current Assets', 'Total Current Liabilities',
    'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
    'Company Market Capitalization', 'Total Revenue', 'Total Equity',
])

In [23]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [24]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 13.99539115280832
Random Forest RMAE: 11.006824609431689
Random Forest r2_score: 0.46270015270986986


In [26]:
# Table of per-feature importances from the fitted forest. The feature names
# are recovered by removing, from the loaded frame's columns, the same labels
# that were dropped when this section's X was built, so the remaining names
# line up positionally with rf.feature_importances_.
feature_importance_score = pd.DataFrame({
    "feature": data_with_outliers.columns.drop([
        'Unnamed: 0', 'RIC', 'Year', 'ESG Score',
        'Total Current Assets', 'Total Current Liabilities',
        'Total Debt', 'Total Assets, Reported', 'Net Income - Actual',
        'Company Market Capitalization', 'Total Revenue', 'Total Equity',
    ]),
    "scores": rf.feature_importances_,
})

# Display the table with the highest-scoring features first.
feature_importance_score.sort_values("scores", ascending=False)

Unnamed: 0,feature,scores
2,"Property Plant And Equipment, Total - Gross",0.245598
1,Total CO2 Equivalent Emissions To Revenues USD...,0.124669
0,Revenue Per Share,0.094781
10,Asset Turnover Ratio,0.090381
5,Current Ratio,0.082642
6,Debt-to-Equity Ratio,0.06934
8,Revenue Per Employee,0.064707
3,P/E (Daily Time Series Ratio),0.05968
7,Return on Assets,0.059189
4,returns_yearly,0.056759


### 2. Without Ratio Features (Absolute Totals Kept)

In [50]:
# Candidate feature matrix: every column except 'Unnamed: 0', 'RIC', 'Year'
# and 'ESG Score' (the latter is used as the target y in the next cell).
X = data_with_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [51]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_with_outliers['ESG Score']

In [52]:
# Feature matrix for this experiment, built directly from the loaded frame
# instead of from X itself. `X = X.drop(...)` is not re-runnable: a second run
# raises KeyError because the columns are already gone (or AttributeError once
# X has been replaced by a scaled ndarray). Dropping everything in one step
# from data_with_outliers gives the same columns in the same order and is
# safe to re-execute.
X = data_with_outliers.drop(columns=[
    # non-feature columns and the target
    'Unnamed: 0', 'RIC', 'Year', 'ESG Score',
    # ratio columns excluded from this experiment
    'Current Ratio', 'Debt-to-Equity Ratio', 'Return on Assets',
    'Revenue Per Employee', 'Return on Equity', 'Asset Turnover Ratio',
    'Net Income Margin',
])

In [53]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [54]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 13.300192738408638
Random Forest RMAE: 10.474575940664035
Random Forest r2_score: 0.5161897330744114


### 3. All Features

In [7]:
# Feature matrix with all remaining columns: everything except 'Unnamed: 0',
# 'RIC', 'Year' and 'ESG Score' (the latter is used as the target y in the
# next cell).
X = data_with_outliers.drop(
    ['Unnamed: 0', 'RIC', 'Year', 'ESG Score'],
    axis='columns',
)

In [8]:
# Regression target. Selecting a single column already yields a pandas Series,
# so no extra pd.Series(...) wrapper is needed.
y = data_with_outliers['ESG Score']

In [9]:

# Split first, then fit the scaler on the training rows only. Fitting
# MinMaxScaler on all of X before splitting lets the min/max of the held-out
# rows shape the training features (test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn per-column min/max from the training rows
X_test = scaler.transform(X_test)        # reuse the training min/max; values may fall outside [0, 1]

# X is deliberately left unscaled (and unmodified), which also keeps this cell
# re-runnable. The cross-validation in the following cell reads X directly;
# tree-based models split on value ordering, so they do not need the rescaling.


In [10]:

# Train a random forest on the training split and score it on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
rf_pred = rf.predict(X_test)

# Hold-out error metrics. mean_absolute_error returns the plain mean absolute
# error (no square root is taken), so it is reported as "MAE", not "RMAE".
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmae = rf_mae  # previous (misleading) name, kept so any later cell reading it still works

print("Random Forest RMSE:", rf_rmse)
print("Random Forest MAE:", rf_mae)

# 5-fold shuffled cross-validation over the full X / y. cross_val_score fits
# clones of `rf`, so the model fitted above on X_train is left untouched.
folds = KFold(n_splits=5, shuffle=True, random_state=100)
rf_scores = cross_val_score(rf, X, y, cv=folds, scoring='r2')

# Mean R^2 across the five folds (this is not the hold-out R^2).
print("Random Forest r2_score:", np.mean(rf_scores))


Random Forest RMSE: 13.220924885136958
Random Forest RMAE: 10.379928844761869
Random Forest r2_score: 0.525985877764467


In [19]:
# Table of per-feature importances from the fitted forest. The feature names
# are recovered by removing, from the loaded frame's columns, the same labels
# that were dropped when this section's X was built, so the remaining names
# line up positionally with rf.feature_importances_.
feature_importance_score = pd.DataFrame({
    "feature": data_with_outliers.columns.drop(
        ['Unnamed: 0', 'RIC', 'Year', 'ESG Score']
    ),
    "scores": rf.feature_importances_,
})

# Display the table with the highest-scoring features first.
feature_importance_score.sort_values("scores", ascending=False)

Unnamed: 0,feature,scores
1,Total Current Liabilities,0.2686
8,Total CO2 Equivalent Emissions To Revenues USD...,0.069345
10,"Property Plant And Equipment, Total - Gross",0.059614
5,Revenue Per Share,0.057508
18,Asset Turnover Ratio,0.044282
13,Current Ratio,0.041532
2,Total Debt,0.041064
9,Company Market Capitalization,0.040635
3,"Total Assets, Reported",0.039951
6,Total Revenue,0.036383
