In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.ensemble import StackingClassifier
import warnings
warnings.filterwarnings('ignore')

# Load and prepare data
df = pd.read_csv('final.csv')

# Select features
features = ['Total ESG Risk score', 'Environment Risk Score', 'Governance Risk Score', 
           'Social Risk Score', 'Controversy Level', 'Controversy Score', 'ESG Risk Percentile']

X = df[features]
y = df['ESG Risk Level']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10)),
    ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1)),
    ('xgb', xgb.XGBClassifier(n_estimators=200, learning_rate=0.1)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000))
]

# Define meta-model
meta_model = SVC(kernel='rbf', probability=True)

# Create stacking classifier
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

# Train the stacked model
stacked_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = stacked_model.predict(X_test_scaled)

# Print results
print("Model Performance:")
print("=====================================")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance (from Random Forest component)
rf_model = stacked_model.named_estimators_['rf']
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))

# Cross-validation score
cv_scores = cross_val_score(stacked_model, X_train_scaled, y_train, cv=5)
print("\nCross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())

# Save the model
import joblib
joblib.dump(stacked_model, 'esg_stacked_model.pkl')
joblib.dump(scaler, 'esg_scaler.pkl')

Model Performance:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         1

    accuracy                           0.99       101
   macro avg       0.78      0.80      0.79       101
weighted avg       0.98      0.99      0.99       101


Feature Importance:
                  feature  importance
0    Total ESG Risk score    0.400075
6     ESG Risk Percentile    0.263226
3       Social Risk Score    0.144243
1  Environment Risk Score    0.130449
2   Governance Risk Score    0.045808
5       Controversy Score    0.010951
4       Controversy Level    0.005249

Cross-validation scores: [0.97530864 0.97530864 0.9875     0.975      0.9875    ]
Average CV score: 0.9801234567901235


['esg_scaler.pkl']

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming we have our stacked model and individual models from previous training
def get_model_metrics(model, X_test_scaled, y_test):
    y_pred = model.predict(X_test_scaled)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }

# Create dictionary to store metrics
metrics_dict = {}

# Get metrics for each base model
base_models = {
    'Random Forest': stacked_model.named_estimators_['rf'],
    'Gradient Boosting': stacked_model.named_estimators_['gb'],
    'XGBoost': stacked_model.named_estimators_['xgb'],
    'Neural Network': stacked_model.named_estimators_['mlp']
}

# Calculate metrics for each model
for model_name, model in base_models.items():
    metrics_dict[model_name] = get_model_metrics(model, X_test_scaled, y_test)

# Add stacked model metrics
metrics_dict['Stacked Ensemble'] = get_model_metrics(stacked_model, X_test_scaled, y_test)

# Create DataFrame
metrics_df = pd.DataFrame(metrics_dict).T

# Round the values to 4 decimal places
metrics_df = metrics_df.round(4)

# Sort by accuracy (descending)
metrics_df = metrics_df.sort_values('Accuracy', ascending=False)

print("Model Performance Comparison:")
print("============================")
print(metrics_df)

# Save the metrics to CSV
metrics_df.to_csv('model_metrics_comparison.csv')

Model Performance Comparison:
                   Accuracy  Precision  Recall  F1 Score
Gradient Boosting    1.0000     1.0000  1.0000    1.0000
Random Forest        0.9901     0.9804  0.9901    0.9852
XGBoost              0.9901     0.9903  0.9901    0.9901
Stacked Ensemble     0.9901     0.9812  0.9901    0.9854
Neural Network       0.9010     0.8823  0.9010    0.8915


In [4]:
joblib.dump(stacked_model.named_estimators_['xgb'], 'xgb_model.pkl')


['xgb_model.pkl']

In [3]:
import pandas as pd 
final = pd.read_csv('final.csv')
final.head()

Unnamed: 0.1,Unnamed: 0,Sector,Full Time Employees,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,0,9,3157.0,21.533721,5.739767,6.725116,9.070465,2,2.007444,6,1
1,1,0,14000.0,25.3,12.8,6.6,5.8,2,2.0,44,2
2,2,2,6500.0,29.2,10.6,6.3,12.2,2,2.0,60,2
3,3,9,9084.0,21.533721,5.739767,6.725116,9.070465,2,2.007444,6,1
4,4,6,70000.0,22.6,0.1,8.4,14.1,2,2.0,30,2
