# Loading the Dataset

In [14]:
import pandas as pd

# Load the dataset
file_path = '/Users/siddhesh/Downloads/generated_emissions_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure and contents
data.head()


Unnamed: 0,Supplier,Year,Scope 1 Emissions,Scope 2 Emissions,Scope 3 Emissions,Total Emissions,Performance,Target Set,Reduction Initiatives,First Year,Target Year
0,Fabrinet,2022,4379.0,88721.0,17593.0,110693.0,Increased,Intensity Target,Yes,No,2030.0
1,Fabrinet,2021,1705.0,76024.0,15060.0,92789.0,Increased,Intensity Target,Yes,No,2030.0
2,Fabrinet,2020,1911.0,66230.0,0.0,68141.0,,Intensity Target,No,Yes,2030.0
3,Lumentum,2022,6328.0,96379.0,44053.0,146760.0,Increased,Absolute Target,Yes,No,
4,Lumentum,2021,7857.0,87857.0,43117.0,138831.0,Increased,Absolute Target,Yes,No,


# Data Preprocessing

In [15]:
# Step 1: Data Preprocessing

# Filter out the latest year for each supplier
latest_data = data.sort_values(by=['Supplier', 'Year']).groupby('Supplier').last().reset_index()

# Feature Engineering: Calculate the average and growth rate of past emissions
# Calculate average emissions for each supplier
average_emissions = data.groupby('Supplier')['Total Emissions'].mean().reset_index()
average_emissions.rename(columns={'Total Emissions': 'Average Emissions'}, inplace=True)

# Calculate growth rate of emissions (year over year)
data_sorted = data.sort_values(by=['Supplier', 'Year'])
data_sorted['Emissions Growth'] = data_sorted.groupby('Supplier')['Total Emissions'].pct_change()
growth_rate = data_sorted.groupby('Supplier')['Emissions Growth'].mean().reset_index()

# Merge these features with the latest data
model_data = pd.merge(latest_data, average_emissions, on='Supplier')
model_data = pd.merge(model_data, growth_rate, on='Supplier')

# Encode categorical variables
model_data_encoded = pd.get_dummies(model_data, columns=['Performance', 'Target Set', 'Reduction Initiatives', 'First Year'])

# Drop columns not used in prediction
model_data_encoded.drop(columns=['Supplier', 'Year'], inplace=True)

# Feature Selection

In [16]:
# Step 2: Feature Selection
# Using all the columns except 'Total Emissions' as features and 'Total Emissions' as the target variable
X = model_data_encoded.drop('Total Emissions', axis=1)
y = model_data_encoded['Total Emissions']

# Model Selection

In [17]:
# Step 3: Model Selection
# Using RandomForestRegressor as the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Training the Model

In [18]:
# Step 4: Training the Model
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

# Model Evaluation

In [19]:
# Step 5: Model Evaluation
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

rmse, r2

(427439.7831264296, 0.7643056050062453)

# Testing on Different Models

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Initialize different regression models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting Regression': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Support Vector Regression': SVR()
}

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    return rmse, r2, mae, mape

# Evaluate each model
model_performance = {}
for name, model in models.items():
    rmse, r2, mae, mape = evaluate_model(model, X_train, X_test, y_train, y_test)
    model_performance[name] = {
        'RMSE': rmse,
        'R^2': r2,
        'MAE': mae,
        'MAPE': mape
    }

model_performance



{'Linear Regression': {'RMSE': 5.809112894290429e-10,
  'R^2': 1.0,
  'MAE': 5.238689482212067e-10,
  'MAPE': 2.4912612022208228e-14},
 'Random Forest Regression': {'RMSE': 427439.7831264296,
  'R^2': 0.7643056050062453,
  'MAE': 354818.051,
  'MAPE': 16.268675032928957},
 'Gradient Boosting Regression': {'RMSE': 269217.66836796515,
  'R^2': 0.9065011024019931,
  'MAE': 241607.76003493936,
  'MAPE': 9.936219720954417},
 'Support Vector Regression': {'RMSE': 887364.5258387093,
  'R^2': -0.015788984729105504,
  'MAE': 738083.5102835987,
  'MAPE': 40.4602245170201}}

# Feature Importance

In [21]:
from sklearn.inspection import permutation_importance

# Train and get feature importances for Linear Regression, Random Forest, and Gradient Boosting
lr_model = LinearRegression().fit(X_train, y_train)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Get feature importances
lr_importance = lr_model.coef_
rf_importance = rf_model.feature_importances_
gb_importance = gb_model.feature_importances_

# For Random Forest and Gradient Boosting, we can also use permutation importance for a more reliable measure
rf_perm_importance = permutation_importance(rf_model, X_test, y_test, n_repeats=30, random_state=42)
gb_perm_importance = permutation_importance(gb_model, X_test, y_test, n_repeats=30, random_state=42)

# Organizing the feature importance data
feature_names = X.columns
feature_importance_data = {
    'Linear Regression': lr_importance,
    'Random Forest': rf_importance,
    'Gradient Boosting': gb_importance,
    'Random Forest (Permutation)': rf_perm_importance.importances_mean,
    'Gradient Boosting (Permutation)': gb_perm_importance.importances_mean
}

# Creating a DataFrame for better visualization
feature_importance_df = pd.DataFrame(feature_importance_data, index=feature_names)

feature_importance_df



Unnamed: 0,Linear Regression,Random Forest,Gradient Boosting,Random Forest (Permutation),Gradient Boosting (Permutation)
Scope 1 Emissions,1.0,0.04066,0.02556,0.044652,0.050298
Scope 2 Emissions,1.0,0.109272,0.151035,0.231255,0.367282
Scope 3 Emissions,1.0,0.766966,0.800809,1.075591,1.048109
Target Year,1.941165e-11,0.015995,0.001074,-0.008151,0.000276
Average Emissions,1.665335e-16,0.012343,0.009468,-0.00574,-0.013117
Emissions Growth,-1.418521e-11,0.021657,0.010203,-0.012817,0.002038
Performance_Decreased,1.27218e-10,0.001539,0.000492,-0.000754,0.000842
Performance_Increased,-6.522149e-11,0.00109,9e-06,-0.001059,1.9e-05
Performance_na,-6.199651e-11,0.002582,0.000138,-0.005618,0.000385
Target Set_Absolute Target,8.147737e-11,0.001251,1.1e-05,0.000461,-0.000172


# Model Improvement

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Setting up hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages to be run
    'max_depth': [3, 4, 5],  # Maximum depth of the individual regression estimators
    'learning_rate': [0.01, 0.1, 0.2]  # Learning rate shrinks the contribution of each tree
}

# Initialize the Gradient Boosting Regressor
gb_reg = GradientBoostingRegressor(random_state=42)

# Setting up GridSearchCV for hyperparameter tuning with cross-validation
grid_search = GridSearchCV(estimator=gb_reg, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the grid search to the data (replace X_train and y_train with your training data)
grid_search.fit(X_train, y_train)

# Getting the best parameters and the corresponding RMSE
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

print("Best Parameters:", best_params)
print("Best RMSE:", best_rmse)


Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Best RMSE: 387492.49366649246


In [24]:
# Retrain the model with the best parameters
optimized_gb_reg = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=300,
    random_state=42
)

optimized_gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=300, random_state=42)

# Final Results

In [25]:
# Evaluate the model on the test set
y_pred = optimized_gb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE on Test Set:", rmse)
print("R-squared on Test Set:", r2)

RMSE on Test Set: 269073.380992264
R-squared on Test Set: 0.9066012971289573


# Prediction

In [26]:
# Select a supplier for demonstration
selected_supplier = 'Supplier_3'  # Replace with your chosen supplier's name

# Preparing the input data for the selected supplier
# Extracting the most recent year data for this supplier (excluding the target variable)
supplier_data = data[data['Supplier'] == selected_supplier].sort_values(by='Year', ascending=False)
latest_supplier_data = supplier_data.iloc[0].drop(['Supplier', 'Year', 'Total Emissions'])

# Encoding categorical variables and aligning with the model's features
latest_supplier_data_encoded = pd.get_dummies(pd.DataFrame([latest_supplier_data]), columns=['Performance', 'Target Set', 'Reduction Initiatives', 'First Year'])
latest_supplier_data_encoded = latest_supplier_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Make a prediction using the optimized model
predicted_emissions = optimized_gb_reg.predict(latest_supplier_data_encoded)

# Retrieve the actual emissions for the selected supplier for the latest year
actual_emissions = supplier_data.iloc[0]['Total Emissions']

# Displaying the predicted and actual emissions
print("Supplier:", selected_supplier)
print("Predicted Emissions:", predicted_emissions[0])
print("Actual Emissions:", actual_emissions)


Supplier: Supplier_3
Predicted Emissions: 2766609.358516167
Actual Emissions: 2622841.0
