In [4]:
import pandas as pd 
import numpy as np
import joblib
from xgboost import XGBRegressor
from catboost import CatBoostRegressor 
from lightgbm import LGBMRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor 
from sklearn.metrics import mean_squared_error as MSE
import optuna
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split 
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error as MAE, r2_score as R2
import warnings 
warnings.filterwarnings('ignore')

In [5]:
data = pd.read_csv('supplier_embeddings.csv')
data.tail(2)

Unnamed: 0,supplier_id,supplier_name,country,region,product_category,sub_category,total_eco_score,carbon_score,water_score,waste_score,...,partnership_status,annual_volume,cost_premium,risk_level,last_audit,audit_summary,image_url,recommendation,text_embedding,text_embedding_2
4998,SUP3720,"Wade, Black and York",Vietnam,Asia,Food,Sugar,0.96,92.3,18.5,58.05,...,Under Review,745347,13.74,Medium,2022-02-10,ISO14001 compliance confirmed; Rainforest Alli...,gs://ecochain-product-images/sugar.jpeg,Avoid,"[-0.0738530233502388, -0.04022129997611046, -0...","[-0.03160709887742996, -0.01454420667141676, -..."
4999,SUP1809,"Jones, Gonzalez and Garza",Vietnam,Asia,Food,Sugar,83.91,6.03,2.55,64.93,...,Under Review,160009,18.11,High,2023-08-21,ISO14001 compliance confirmed; GOTS standards ...,gs://ecochain-product-images/sugar.jpeg,Preferred,"[-0.3574414849281311, 0.017862210050225258, -0...","[-0.9049476981163025, 0.22246497869491577, -0...."


Dropping Columns already Embedded and those not needed for testing (View the other notebook to see when the embedding was done in BigQuery)

In [3]:
data.columns

Index(['supplier_id', 'supplier_name', 'country', 'region', 'product_category',
       'sub_category', 'total_eco_score', 'carbon_score', 'water_score',
       'waste_score', 'social_score', 'certification', 'partnership_status',
       'annual_volume', 'cost_premium', 'risk_level', 'last_audit',
       'audit_summary', 'image_url', 'recommendation', 'text_embedding',
       'text_embedding_2'],
      dtype='object')

In [6]:
data = data.drop(['supplier_id','supplier_name','audit_summary','image_url','certification','text_embedding_2','sub_category','product_category', 'recommendation'], axis = 1)

In [13]:
data.head(1)

Unnamed: 0,country,region,total_eco_score,carbon_score,water_score,waste_score,social_score,partnership_status,annual_volume,cost_premium,risk_level,last_audit,text_embedding
0,Argentina,Americas,97.79,91.94,65.86,34.97,83.13,Inactive,697830,17.2,Low,2025-06-17,"[-0.3144732713699341, -0.051802970468997955, 0..."


Splitting the Data for training and testing

In [18]:
X = data.drop(['carbon_score','water_score','waste_score','social_score','total_eco_score'], axis = 1)
y = data[['carbon_score','water_score','waste_score','social_score']]
y_ecoscore = data['total_eco_score']

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, shuffle = True, random_state = 42)

In [7]:
X_train.head(1)

Unnamed: 0,country,region,partnership_status,annual_volume,cost_premium,risk_level,last_audit,recommendation,text_embedding
4227,South Africa,Africa,Inactive,948098,4.09,High,2023-05-05,Neutral,"[-0.12168096005916595, 0.015789248049259186, 0..."


Encoding Categorical Values 

In [19]:
encoder = OrdinalEncoder()

X_train[['country','region','partnership_status','risk_level']] = encoder.fit_transform(X_train[['country','region','partnership_status','risk_level']])

X_test[['country','region','partnership_status','risk_level']] = encoder.transform(X_test[['country','region','partnership_status','risk_level']])

Removing the dash in the last audit column to make them pure numbers and turning the text embedding into a float

In [20]:
import ast

X_train['last_audit'] = X_train['last_audit'].str.replace("-","", regex = False)
X_test['last_audit'] = X_test['last_audit'].str.replace("-","", regex = False)

X_train['last_audit'] = pd.to_numeric(X_train['last_audit'], errors = 'coerce')
X_test['last_audit'] = pd.to_numeric(X_test['last_audit'], errors = 'coerce')

X_train['text_embedding'] = [ast.literal_eval(e) for e in X_train['text_embedding']]
X_test['text_embedding'] = [ast.literal_eval(e) for e in X_test['text_embedding']]

X_train['text_embedding'] = [[float(x) for x in row] for row in X_train['text_embedding']]
X_test['text_embedding'] = [[float(x) for x in row] for row in X_test['text_embedding']]

# Convert embeddings column (list of floats) into a 2D NumPy array
X_train_embeddings = np.vstack(X_train['text_embedding'].values).astype(np.float32)
X_test_embeddings = np.vstack(X_test['text_embedding'].values).astype(np.float32)

# Drop the original embedding column from DataFrame
X_train_num = X_train.drop(columns=['text_embedding']).reset_index(drop=True)
X_test_num = X_test.drop(columns=['text_embedding']).reset_index(drop=True)

# Concatenate numeric features + embeddings into final arrays
X_train_final = np.hstack([X_train_num.values, X_train_embeddings])
X_test_final = np.hstack([X_test_num.values, X_test_embeddings])

First Model Training

Training BaseLine Model

In [7]:
dummy_model = DummyRegressor(strategy = 'mean')
dummy_model.fit(X_train_final, y_train)


ypred = dummy_model.predict(X_test_final)

baseline_test = pd.DataFrame(ypred)
baseline_rmse = np.sqrt(MSE(y_test,baseline_test))
baseline_mae = MAE(y_test,baseline_test)
baseline_r2 = R2(y_test,baseline_test)
print(f'Baseline RMSE: {baseline_rmse:.2f}')
print(f'Baseline MAE: {baseline_mae:.2f}')
print(f'Baseline R2: {baseline_r2:.2f}')

Baseline RMSE: 28.87
Baseline MAE: 25.01
Baseline R2: -0.00


The Base Line RMSE of the ecochain dataset

In [21]:
models = {
    'xgboost': XGBRegressor(n_estimators = 1500, learning_rate = 0.01, max_leaves = 10 , n_jobs= -1),
    'lightgbm': LGBMRegressor(n_estimators = 1500, max_depth = 10, learning_rate = 0.01, random_state = 42)
}


for name, model in models.items():
    print ('#####################################')
    print(f'Model {name} Training In Progess')
    
    multi_model = MultiOutputRegressor(model)
    multi_model.fit(X_train_final, y_train)
    
    print (f'Training Model {name} Complete')
    print('Prediction For Sub Scores In Progress')

    y_pred_train_subscores = multi_model.predict(X_train_final)
    y_pred_test_subscores = multi_model.predict(X_test_final)

    print('SubScores:')
    for i, col in enumerate(y.columns):
        y_pred_train_subscores_df = pd.DataFrame(y_pred_train_subscores)
        y_pred_test_subscores_df = pd.DataFrame(y_pred_test_subscores)
        rmse_train = np.sqrt(MSE(y_train.iloc[ : ,i], y_pred_train_subscores_df.iloc[ : ,i]))
        rmse_test = np.sqrt(MSE(y_test.iloc[ : ,i], y_pred_test_subscores_df.iloc[ : ,i]))
        mae_train = MAE(y_train.iloc[ : ,i], y_pred_train_subscores_df.iloc[ : ,i])
        mae_test = MAE(y_test.iloc[ : ,i], y_pred_test_subscores_df.iloc[ : ,i])
        r2_train = R2(y_train.iloc[ : ,i], y_pred_train_subscores_df.iloc[ : ,i])
        r2_test = R2(y_test.iloc[ : ,i], y_pred_test_subscores_df.iloc[ : ,i])
        print(f" RMSE FOR {col}: Training Score: {rmse_train} Testing Score: {rmse_test}")
        print(f" MAE FOR {col}: Training Score: {mae_train} Testing Score: {mae_test}")
        print(f" r2 FOR {col}: Training Score: {r2_train} Testing Score: {r2_test}")

#####################################
Model xgboost Training In Progess
Training Model xgboost Complete
Prediction For Sub Scores In Progress
SubScores:
 RMSE FOR carbon_score: Training Score: 20.007210321519665 Testing Score: 26.46369938445744
 MAE FOR carbon_score: Training Score: 16.83710017732382 Testing Score: 22.506502940387726
 r2 FOR carbon_score: Training Score: 0.5178478642406732 Testing Score: 0.16143088403539396
 RMSE FOR water_score: Training Score: 19.86684474370558 Testing Score: 26.171654791700334
 MAE FOR water_score: Training Score: 16.59633055663824 Testing Score: 21.99694056781769
 r2 FOR water_score: Training Score: 0.5190204246930527 Testing Score: 0.16048671134003922
 RMSE FOR waste_score: Training Score: 20.263855060104135 Testing Score: 27.495017736691565
 MAE FOR waste_score: Training Score: 16.975413490993382 Testing Score: 23.386519780788422
 r2 FOR waste_score: Training Score: 0.49633907046193104 Testing Score: 0.09951739286029271
 RMSE FOR social_score: Tr

Using The Predicted SubScores as Features to Predict the EcoScore

In [22]:
models_2 = {
    'xgboost': XGBRegressor(n_estimators = 1500, learning_rate = 0.01, max_leaves = 10 , n_jobs= -1),
    'lightgbm': LGBMRegressor(n_estimators = 1500, max_depth = 10, learning_rate = 0.01, random_state = 42)
}

X[['country','region','partnership_status','risk_level']] = encoder.transform(X[['country','region','partnership_status','risk_level']])
X['last_audit'] = X['last_audit'].str.replace("-","", regex = False)
X['last_audit'] = pd.to_numeric(X['last_audit'], errors = 'coerce')
X['text_embedding'] = [ast.literal_eval(e) for e in X['text_embedding']]
X_embeddings = np.vstack(X['text_embedding'].values).astype(np.float32)
X_num = X.drop(columns=['text_embedding']).reset_index(drop=True)
X_final = np.hstack([X_num.values, X_embeddings])

predicted_subscore = multi_model.predict(X_final)
print(f'RMSE FOR THE ENTIRE DATASET: {np.sqrt(MSE(y,predicted_subscore)):.2f}')
print(f' MAE FOR THE ENTIRE DATASET: {MAE(y,predicted_subscore):.2f}')
print(f' R2 FOR THE ENTIRE DATASET: {R2(y,predicted_subscore):.2f}')
subscores = pd.DataFrame(predicted_subscore, columns = ['carbon_score','water_score','waste_score','social_score'])
print(subscores.head(2))


y_ecoscore = pd.DataFrame(y_ecoscore)
X_train_eco,X_test_eco,y_train_eco,y_test_eco = train_test_split(subscores, y_ecoscore, test_size = 0.2, shuffle = True, random_state = 42)

for name_2, model_2 in models_2.items():
    print('###############################################')
    print(f'Training Ecoscore With Subscores, Model: {name_2}')
    model_2.fit(X_train_eco,y_train_eco)
    ypred_train = model_2.predict(X_train_eco)
    ypred_test = model_2.predict(X_test_eco)
    RMSE_train_eco = np.sqrt(MSE(y_train_eco,ypred_train))
    RMSE_test_eco = np.sqrt(MSE(y_test_eco,ypred_test))
    mae_train_eco = MAE(y_train_eco,ypred_train)
    mae_test_eco = MAE(y_test_eco,ypred_test)
    r2_train_eco = R2(y_train_eco,ypred_train)
    r2_test_eco = R2(y_test_eco,ypred_test)
    print (f'Training Data RMSE: {RMSE_train_eco:.2f}. Testing Data RMSE: {RMSE_test_eco:.2f}')
    print (f'Training Data MAE: {mae_train_eco:.2f}. Testing Data RMSE: {mae_test_eco:.2f}')
    print (f'Training Data R2: {r2_train_eco:.2f}. Testing Data RMSE: {r2_test_eco:.2f}')

RMSE FOR THE ENTIRE DATASET: 14.70
 MAE FOR THE ENTIRE DATASET: 10.70
 R2 FOR THE ENTIRE DATASET: 0.74
   carbon_score  water_score  waste_score  social_score
0     81.862301    54.271436    37.029578     66.526458
1     28.503151    19.713255    74.696942     26.194180
###############################################
Training Ecoscore With Subscores, Model: xgboost
Training Data RMSE: 26.51. Testing Data RMSE: 29.37
Training Data MAE: 22.79. Testing Data RMSE: 25.29
Training Data R2: 0.16. Testing Data RMSE: -0.01
###############################################
Training Ecoscore With Subscores, Model: lightgbm
Training Data RMSE: 21.99. Testing Data RMSE: 29.95
Training Data MAE: 18.69. Testing Data RMSE: 25.63
Training Data R2: 0.42. Testing Data RMSE: -0.05


In [45]:
X_train_eco_1,X_val_eco_1,y_train_eco_1,y_val_eco_1 = train_test_split(X_train_final, y_train, random_state = 42, test_size = 0.2, shuffle = True)

def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000)
    }

    preds_list = []

    # Train one model per target column
    for i in range(y_train_eco_1.shape[1]):
        model = LGBMRegressor(**param)
        model.fit(
        X_train_eco_1, y_train_eco_1.iloc[:, i].to_numpy(),
        eval_set=[(X_val_eco_1, y_val_eco_1.iloc[:, i].to_numpy())],
        early_stopping_rounds=50,
        verbose=False
        )

        preds_list.append(model.predict(X_test_final))

    preds = np.vstack(preds_list).T  # shape (n_samples, n_targets)
    rmse_pred = np.sqrt(MSE(y_test, preds))

    return rmse_pred


study = optuna.create_study(direction = 'minimize') # Minimse RMSE
study.optimize(objective, n_trials = 50)
print('Best Trial:')
trial = study.best_trial
print(f'Value:', trial.value)
print(f'Params:',trial.params)

[I 2025-09-08 21:49:24,263] A new study created in memory with name: no-name-243c63d9-fd7c-4bdf-912d-0c138fd97770
[I 2025-09-08 21:51:37,295] Trial 0 finished with value: 26.507649094674296 and parameters: {'max_depth': 4, 'num_leaves': 153, 'learning_rate': 0.0187326888556976, 'n_estimators': 2322}. Best is trial 0 with value: 26.507649094674296.
[I 2025-09-08 21:53:30,662] Trial 1 finished with value: 27.399229177917974 and parameters: {'max_depth': 12, 'num_leaves': 161, 'learning_rate': 0.05737480688642148, 'n_estimators': 2342}. Best is trial 0 with value: 26.507649094674296.
[I 2025-09-08 22:01:56,925] Trial 2 finished with value: 27.194435979790505 and parameters: {'max_depth': 15, 'num_leaves': 119, 'learning_rate': 0.0064256680689245265, 'n_estimators': 2128}. Best is trial 0 with value: 26.507649094674296.
[I 2025-09-08 22:06:25,116] Trial 3 finished with value: 26.801636268230897 and parameters: {'max_depth': 6, 'num_leaves': 177, 'learning_rate': 0.007244976269847492, 'n_es

Best Trial:
Value: 26.38271038827916
Params: {'max_depth': 3, 'num_leaves': 134, 'learning_rate': 0.01996992499390305, 'n_estimators': 2638}


In [12]:
model_3 = LGBMRegressor(max_depth= 3, num_leaves = 134, learning_rate =  0.01996992499390305, n_estimators = 2638, objective = 'regression' ,
        metric = 'rmse', verbosity = -1, boosting_type = 'gbdt', random_state = 42)

multi_model_2 = MultiOutputRegressor(model_3)

multi_model_2.fit(X_train_final, y_train)

preds_2 = multi_model_2.predict(X_test_final)

print('SubScores:')
for i, col in enumerate(y.columns):
    pred_test_subscores_df = pd.DataFrame(preds_2)
    rmse_test = np.sqrt(MSE(y_test.iloc[ : ,i], pred_test_subscores_df.iloc[ : ,i]))
    mae_test = MAE(y_test.iloc[ : ,i], pred_test_subscores_df.iloc[ : ,i])
    r2_test = R2(y_test.iloc[ : ,i], pred_test_subscores_df.iloc[ : ,i])
    print(f" RMSE FOR {col}: Testing Score: {rmse_test:.2f}")
    print(f" MAE FOR {col}: Testing Score: {mae_test:.2f}")
    print(f" r2 FOR {col}: Testing Score: {r2_test:.2f}")

SubScores:
 RMSE FOR carbon_score: Testing Score: 25.84
 MAE FOR carbon_score: Testing Score: 21.67
 r2 FOR carbon_score: Testing Score: 0.20
 RMSE FOR water_score: Testing Score: 26.25
 MAE FOR water_score: Testing Score: 21.68
 r2 FOR water_score: Testing Score: 0.16
 RMSE FOR waste_score: Testing Score: 27.45
 MAE FOR waste_score: Testing Score: 23.02
 r2 FOR waste_score: Testing Score: 0.10
 RMSE FOR social_score: Testing Score: 24.26
 MAE FOR social_score: Testing Score: 20.12
 r2 FOR social_score: Testing Score: 0.30


In [54]:
def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 4000)
    }

    model = LGBMRegressor(**param)
    model.fit(X_train_eco,y_train_eco)

    preds = model.predict(X_test_eco)
    
    rmse_pred = np.sqrt(MSE(y_test_eco, preds))

    return rmse_pred


study = optuna.create_study(direction = 'minimize') # Minimse RMSE
study.optimize(objective, n_trials = 50)
print('Best Trial:')
trial = study.best_trial
print(f'Value:', trial.value)
print(f'Params:',trial.params)

[I 2025-09-09 02:14:04,429] A new study created in memory with name: no-name-78b8d895-f878-43d5-b758-416cca42a3f6
[I 2025-09-09 02:14:20,316] Trial 0 finished with value: 33.04280527109028 and parameters: {'max_depth': 13, 'num_leaves': 56, 'learning_rate': 0.035704246891821785, 'n_estimators': 3285}. Best is trial 0 with value: 33.04280527109028.
[I 2025-09-09 02:14:26,704] Trial 1 finished with value: 32.0327996354399 and parameters: {'max_depth': 7, 'num_leaves': 198, 'learning_rate': 0.04212510650325252, 'n_estimators': 2470}. Best is trial 1 with value: 32.0327996354399.
[I 2025-09-09 02:14:37,214] Trial 2 finished with value: 32.248628497186665 and parameters: {'max_depth': 13, 'num_leaves': 41, 'learning_rate': 0.02989118758888972, 'n_estimators': 2635}. Best is trial 1 with value: 32.0327996354399.
[I 2025-09-09 02:14:43,608] Trial 3 finished with value: 29.47440349254141 and parameters: {'max_depth': 8, 'num_leaves': 25, 'learning_rate': 0.00234599652421054, 'n_estimators': 21

Best Trial:
Value: 29.23684992403832
Params: {'max_depth': 3, 'num_leaves': 74, 'learning_rate': 0.0012870919267741528, 'n_estimators': 1192}


In [23]:
model_4 = LGBMRegressor(max_depth = 3, num_leaves = 74, learning_rate = 0.0012870919267741528, n_estimators =  1192, objective =  'regression',
        metric = 'rmse',verbosity = -1, boosting_type = 'gbdt')

model_4.fit(X_train_eco,y_train_eco)

preds_3 = model_4.predict(X_test_eco)

rmse = np.sqrt(MSE(y_test_eco,preds_3))
mae = MAE(y_test_eco,preds_3)
r2 = R2(y_test_eco,preds_3)

print(' Eco Score Metrics:')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'r2: {r2}')

 Eco Score Metrics:
RMSE: 29.270555127650308
MAE: 25.284544562264855
r2: -0.006087750161555938


Saving Models

In [24]:
joblib.dump(multi_model_2, '../models/SubScores_V1.pkl')
joblib.dump(model_4, '../models/Ecoscore_V1.pkl')
joblib.dump(encoder, '../models/Encoder_V1.pkl')

['../models/Encoder_V1.pkl']

THE DATASET IS AI GENERATED AND HAS LITTLE SIGNIFICANT RELATIONSHIP, SPENDING TIME AND COMPUTE ON THE DATASET IS NOT NECESSARY AS THE DATASET IS ONLY USED TO SHOW A WORKING PROTOTYPE OF THE ECOCHAIN AI APP.