In [None]:
import pandas as pd
import logging
import requests
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CarbonDataCollector:
    """Gather the raw inputs used for carbon accounting.

    Fetches carbon factors over HTTP and loads shipping, warehouse-energy and
    supplier CSV files into DataFrames. Every operation is best-effort:
    failures are logged and signalled through the stored/returned value
    instead of being raised.
    """

    def __init__(self, gcp_api_key):
        """Store the API key and start with no carbon factors loaded.

        Args:
            gcp_api_key: Key sent as the ``key`` query parameter when fetching
                carbon factors.
        """
        self.gcp_api_key = gcp_api_key
        self.carbon_factors = {}

    def _redact(self, error):
        """Return ``str(error)`` with the API key masked.

        ``requests`` error messages commonly echo the request URL including
        its query string, so logging them verbatim would leak the key.
        """
        from urllib.parse import quote_plus

        text = str(error)
        if not self.gcp_api_key:
            return text
        secret = str(self.gcp_api_key)
        # Mask the URL-encoded form first: it may contain the raw key as a prefix.
        for variant in (quote_plus(secret), secret):
            text = text.replace(variant, "***")
        return text

    def fetch_real_time_carbon_factors(self, timeout=10):
        """Download carbon factors and store them in ``self.carbon_factors``.

        On a request or JSON-decoding failure the error is logged (with the
        API key masked) and ``self.carbon_factors`` keeps its previous value.

        Args:
            timeout: Seconds to wait for the server. Without a timeout
                ``requests`` can block indefinitely on an unresponsive host.
        """
        url = "https://sustainability.googleapis.com/v1/carbonFactors"
        try:
            # Passing the key via ``params`` lets requests URL-encode it.
            response = requests.get(url, params={"key": self.gcp_api_key}, timeout=timeout)
            response.raise_for_status()
            self.carbon_factors = response.json()
            logging.info("Successfully fetched carbon factors from Google Cloud Sustainability API")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            logging.error(f"Failed to fetch carbon factors: {self._redact(e)}")

    def _load_csv(self, csv_path, loaded_label, error_label):
        """Read ``csv_path`` into a DataFrame, or return an empty one on failure.

        Args:
            csv_path: Path (or buffer) handed to ``pandas.read_csv``.
            loaded_label: Dataset description used in the success log line.
            error_label: Dataset description used in the error log line.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            # Deliberate best-effort: callers receive an empty frame, not an exception.
            logging.error(f"Error loading {error_label} data: {e}")
            return pd.DataFrame()
        logging.info(f"Loaded {loaded_label} data from {csv_path}")
        return df

    def import_shipping_data(self, csv_path):
        """Load shipping data from ``csv_path``; empty DataFrame on failure."""
        return self._load_csv(csv_path, "shipping", "shipping")

    def import_energy_data(self, csv_path):
        """Load warehouse energy data from ``csv_path``; empty DataFrame on failure."""
        return self._load_csv(csv_path, "warehouse energy", "energy")

    def import_supplier_data(self, csv_path):
        """Load supplier carbon intensity data from ``csv_path``; empty DataFrame on failure."""
        return self._load_csv(csv_path, "supplier carbon intensity", "supplier")

# Data Validation Functions
def check_missing_values(df, critical_columns):
    """Report nulls in the given columns and tell whether there are none.

    Logs one warning per column that contains at least one null value.

    Args:
        df: DataFrame to inspect.
        critical_columns: List of column names that must be fully populated.

    Returns:
        Truthy when none of the listed columns contains a null, falsy otherwise.
    """
    null_counts = df[critical_columns].isna().sum()
    for name, n_missing in null_counts.items():
        if n_missing <= 0:
            continue
        logging.warning(f"Missing values in column '{name}': {n_missing}")
    return null_counts.sum() == 0

def validate_emission_factors(df, factor_column, min_val=0, max_val=100):
    """Check that every emission factor lies within ``[min_val, max_val]``.

    Null values compare as neither below nor above the bounds, so they do not
    count as out of range.

    Args:
        df: DataFrame holding the factors.
        factor_column: Name of the column to validate.
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).

    Returns:
        True when no value falls outside the bounds; False (after logging a
        warning) otherwise.
    """
    factors = df[factor_column]
    too_low = factors < min_val
    too_high = factors > max_val
    if (too_low | too_high).any():
        logging.warning(f"Found out-of-range emission factors in column '{factor_column}'")
        return False
    return True

def ensure_data_consistency(df_list, key_column):
    """Check that every dataset carries exactly the same set of key values.

    Null keys are ignored. A dataset that lacks ``key_column`` altogether
    (for example the empty frame returned by a failed import) is reported as
    an inconsistency rather than raising ``KeyError``.

    Args:
        df_list: DataFrames to compare.
        key_column: Name of the column holding the shared key.

    Returns:
        True when all datasets share the same non-null keys (or the list is
        empty); False, after logging a warning, otherwise.
    """
    if any(key_column not in df.columns for df in df_list):
        logging.warning(f"Column '{key_column}' is missing from at least one dataset")
        return False

    key_sets = [set(df[key_column].dropna()) for df in df_list]
    if not key_sets:
        return True

    common_keys = set.intersection(*key_sets)
    # The intersection is a subset of every key set, so a dataset holds an
    # unmatched key exactly when its key set differs from the intersection.
    if any(keys != common_keys for keys in key_sets):
        logging.warning(f"Inconsistencies found in {key_column} across datasets")
        return False
    return True

# Preprocessing Pipeline
def standardize_units(df, column, factor):
    """Multiply one column by a conversion factor.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.

    Args:
        df: DataFrame to modify.
        column: Name of the column to rescale.
        factor: Multiplier applied to every value in the column.

    Returns:
        The input DataFrame, with ``column`` rescaled.
    """
    df[column] = df[column] * factor
    message = f"Standardized units in column '{column}' using factor {factor}"
    logging.info(message)
    return df

def handle_date_formats(df, date_column):
    """Convert a column to datetimes, turning unparseable entries into NaT.

    The column is replaced on the DataFrame that was passed in. A warning is
    logged when any value ends up null after conversion. Any exception during
    conversion is logged and the DataFrame is returned as it stands.

    Args:
        df: DataFrame to modify.
        date_column: Name of the column holding date values.

    Returns:
        The input DataFrame.
    """
    try:
        parsed = pd.to_datetime(df[date_column], errors='coerce')
        df[date_column] = parsed
        if parsed.isna().any():
            logging.warning(f"Some dates could not be parsed in column '{date_column}'")
    except Exception as exc:
        logging.error(f"Error converting date formats: {exc}")
    return df

def categorize_emission_scope(df, scope_column, mapping):
    """Add an ``'Emission Scope'`` column derived from ``scope_column``.

    Values are translated through ``mapping``; values without a mapping become
    null, and a warning is logged when that happens.

    Args:
        df: DataFrame to modify (the new column is written onto it).
        scope_column: Name of the column whose values are looked up.
        mapping: Lookup passed to ``Series.map``.

    Returns:
        The input DataFrame.
    """
    scopes = df[scope_column].map(mapping)
    df['Emission Scope'] = scopes
    if scopes.isna().any():
        logging.warning("Some emission scopes could not be mapped")
    return df

# Example usage (would be in main script, not module)
# collector = CarbonDataCollector(gcp_api_key="your-api-key")
# collector.fetch_real_time_carbon_factors()
# shipping_df = collector.import_shipping_data("shipping.csv")
# energy_df = collector.import_energy_data("energy.csv")
# supplier_df = collector.import_supplier_data("suppliers.csv")


In [None]:
import streamlit as st
import pandas as pd

# Page config: set the browser-tab title and wide layout, then render the page
# heading. This is the first Streamlit call in the script.
st.set_page_config(page_title="Carbon Data Uploader", layout="wide")
st.title("📦 E-Commerce Carbon Footprint Data Uploader")

# Columns each dataset is expected to contain, keyed by dataset label.
# The labels double as the keys of `uploaded_files` below, so the two
# dictionaries must stay in sync.
REQUIRED_COLUMNS = {
    "Transportation": ["distance_km", "weight_kg", "transport_mode"],
    "Warehouse": ["electricity_kwh", "gas_m3", "renewables_kwh"],
    "Supplier": ["supplier_name", "carbon_intensity_kgco2e"],
    "Packaging": ["material_type", "weight_kg", "emission_factor"]
}

# File uploader section: one CSV uploader per dataset, all in the sidebar.
st.sidebar.header("Upload CSV Data Files")

# Maps dataset label -> the value returned by its uploader widget
# (checked against None further down to detect "nothing uploaded").
uploaded_files = {
    "Transportation": st.sidebar.file_uploader("Transportation Data", type="csv", key="transportation"),
    "Warehouse": st.sidebar.file_uploader("Warehouse Energy Data", type="csv", key="warehouse"),
    "Supplier": st.sidebar.file_uploader("Supplier Emissions Data", type="csv", key="supplier"),
    "Packaging": st.sidebar.file_uploader("Packaging Materials Data", type="csv", key="packaging")
}

# Session state management for dataframes: create the label -> DataFrame store
# once; it is only initialised when the key is not already present.
if "dataframes" not in st.session_state:
    st.session_state["dataframes"] = {}

# Function: Data Preview
def preview_data(df, label):
    """Render a quick look at one uploaded dataset.

    Shows the first ten rows, followed by the column dtypes, the per-column
    null counts and the output of ``DataFrame.describe``.

    Args:
        df: Dataset to preview.
        label: Dataset name shown in the section heading.
    """
    st.subheader(f"🔍 Preview: {label} Data")
    st.dataframe(df.head(10))

    # Each summary is computed lazily, right before it is written, so the
    # heading/content pairs are emitted strictly in this order.
    sections = (
        ("**Data Types:**", lambda: df.dtypes),
        ("**Null Values:**", lambda: df.isnull().sum()),
        ("**Basic Statistics (numerical):**", df.describe),
    )
    for heading, compute in sections:
        st.write(heading)
        st.write(compute())

# Function: Data Validation
def validate_data(df, required_cols):
    """Render a required-column checklist and data-quality suggestions.

    Args:
        df: Uploaded dataset.
        required_cols: Column names the dataset is expected to contain.
    """
    # Presence report: one bullet per required column.
    statuses = [
        (col, "✅ Present" if col in df.columns else "❌ Missing")
        for col in required_cols
    ]
    st.write("### ✅ Data Validation")
    for col, status in statuses:
        st.write(f"- `{col}`: {status}")

    hints = []
    if df.isnull().to_numpy().any():
        hints.append("Check for and handle missing values (e.g., imputation or removal).")

    # Object-typed required columns that cannot be converted to numbers are
    # pointed out to the user.
    for col in required_cols:
        if col not in df.columns or df[col].dtype != object:
            continue
        try:
            pd.to_numeric(df[col])
        except ValueError:
            hints.append(f"Column `{col}` might contain non-numeric data.")

    if not hints:
        return
    st.write("### ⚠️ Suggestions")
    for hint in hints:
        st.write(f"- {hint}")

# Main logic: read every uploaded CSV, remember it in the session store, then
# preview and validate it. A failure in any of those steps is shown as an
# error for that dataset and the loop moves on to the next one.
for label, uploaded_file in uploaded_files.items():
    if uploaded_file is None:
        continue
    try:
        df = pd.read_csv(uploaded_file)
        st.session_state["dataframes"][label] = df
        preview_data(df, label)
        validate_data(df, REQUIRED_COLUMNS[label])
    except Exception as e:
        st.error(f"Error reading {label} data: {e}")

# Final message: shown whenever the session store holds at least one dataset.
if st.session_state["dataframes"]:
    st.success("✅ Data loaded and validated. Ready for analysis or processing.")


In [None]:
import logging
import requests

logging.basicConfig(level=logging.INFO)

class EmissionFactorManager:
    """Keep emission factors by category and item, and combine them.

    Factors live in ``self.emission_factors`` as ``{category: {item: factor}}``.
    Built-in defaults can be extended or overridden from a remote source.
    """

    def __init__(self):
        """Seed the manager with the built-in default factors."""
        self.emission_factors = {
            "transportation": {
                "truck": 0.9,
                "van": 0.2,
                "car": 0.15
            },
            "energy": {
                "electricity": 0.5,
                "natural_gas": 0.2
            },
            "packaging": {
                "cardboard": 0.2,
                "plastic": 0.5
            }
        }

    def get_factor(self, category, item):
        """Look up one emission factor.

        Args:
            category: Top-level group, e.g. ``"transportation"``.
            item: Entry inside the group, e.g. ``"van"``.

        Returns:
            The stored factor, or None (after logging an error) when the
            category or item is unknown.
        """
        try:
            factor = self.emission_factors[category][item]
        except KeyError:
            logging.error(f"Emission factor not found for {category}/{item}")
            return None
        logging.info(f"Retrieved emission factor: {category}/{item} = {factor} kg CO2e")
        return factor

    def update_from_api(self, source="DEFRA", timeout=10):
        """Fetch factors from a named source and merge them into the store.

        Best-effort: any failure (unsupported source, network error, bad
        payload) is logged and the stored factors are left as they were.

        Args:
            source: Name of the data source, matched case-insensitively
                against the supported names (``"DEFRA"``, ``"EPA"``).
            timeout: Seconds to wait for the server. Without a timeout
                ``requests`` can block indefinitely on an unresponsive host.
        """
        url_map = {
            "DEFRA": "https://api.defra.uk/emission-factors",
            "EPA": "https://api.epa.gov/emission-factors"
        }
        try:
            url = url_map.get(source.upper())
            if not url:
                raise ValueError("Unsupported data source")
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            self._update_factors(data)
            logging.info(f"Emission factors updated from {source}")
        except Exception as e:
            # Boundary of a best-effort operation: report, never propagate.
            logging.error(f"Failed to update from {source}: {e}")

    def _update_factors(self, data):
        """Merge ``{category: {item: factor}}`` data into the store.

        The whole payload is validated before anything is written, so a
        malformed payload can never leave the store half-updated.

        Raises:
            ValueError: If ``data`` is not a dict of dicts of numeric factors.
        """
        if not isinstance(data, dict):
            raise ValueError("Emission factor payload must be a mapping of categories")
        for category, items in data.items():
            if not isinstance(items, dict):
                raise ValueError(f"Category '{category}' must map items to factors")
            for item, value in items.items():
                # bool is a subclass of int but is never a meaningful factor.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    raise ValueError(f"Factor for {category}/{item} must be numeric")

        for category, items in data.items():
            self.emission_factors.setdefault(category, {}).update(items)

    def calculate_composite_factor(self, category, items_with_weights):
        """Compute the weight-averaged emission factor of several items.

        Items without a known factor are skipped (``get_factor`` logs them)
        and do not contribute to the total weight.

        Args:
            category: Category all items belong to.
            items_with_weights: Mapping of item name to its weight.

        Returns:
            The weighted average factor, or 0 (after logging a warning) when
            the usable weights sum to zero.
        """
        total_emission = 0
        total_weight = 0
        for item, weight in items_with_weights.items():
            factor = self.get_factor(category, item)
            if factor is not None:
                total_emission += factor * weight
                total_weight += weight
        if total_weight == 0:
            logging.warning("Total weight is zero, cannot calculate composite factor")
            return 0
        composite = total_emission / total_weight
        logging.info(f"Composite emission factor for {category}: {composite:.3f} kg CO2e")
        return composite

# Example usage:
# manager = EmissionFactorManager()
# factor = manager.get_factor("transportation", "van")
# manager.update_from_api("DEFRA")
# composite = manager.calculate_composite_factor("packaging", {"cardboard": 2, "plastic": 1})


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging

logging.basicConfig(level=logging.INFO)

class CarbonEDA:
    """Exploratory analysis over a per-record emissions table.

    Works on a private copy of the input (``self.df``) whose ``date`` column
    has been converted to datetimes; unparseable dates become NaT.
    """

    def __init__(self, emissions_df):
        """
        emissions_df must contain at least:
        - scope: 1, 2, or 3
        - order_id or shipment_id
        - emission_kg_co2e
        - date
        - origin, destination (optional)
        - transport_mode (optional)
        - distance_km, weight_kg (optional)
        - supplier_location, carbon_intensity (optional)
        - warehouse_id, energy_type, energy_consumed (optional)
        """
        self.df = emissions_df.copy()
        self.df['date'] = pd.to_datetime(self.df['date'], errors='coerce')

    def _order_column(self):
        """Return the column identifying an order: ``order_id``, else ``shipment_id``.

        Raises:
            KeyError: If the data contains neither column.
        """
        for candidate in ('order_id', 'shipment_id'):
            if candidate in self.df.columns:
                return candidate
        raise KeyError("emissions data must contain 'order_id' or 'shipment_id'")

    def summarize_emissions(self):
        """Aggregate emissions by scope, order, calendar month and route.

        Adds a ``month`` column (1-12) to ``self.df`` as a side effect. Months
        are pooled across years, so ``monthly`` shows the seasonal profile.

        Returns:
            Dict with Series under ``by_scope``, ``per_order``, ``monthly`` and
            ``route_emissions`` (top 10 origin/destination pairs; empty when
            the route columns are absent).

        Raises:
            KeyError: If neither ``order_id`` nor ``shipment_id`` is present.
        """
        logging.info("Summarizing total emissions by scope")
        by_scope = self.df.groupby('scope')['emission_kg_co2e'].sum()
        logging.info(f"Emissions by Scope:\n{by_scope}")

        logging.info("Calculating emissions per order")
        per_order = self.df.groupby(self._order_column())['emission_kg_co2e'].sum()

        logging.info("Detecting seasonal patterns")
        self.df['month'] = self.df['date'].dt.month
        monthly = self.df.groupby('month')['emission_kg_co2e'].sum()

        logging.info("Geographic high-emission route detection")
        if 'origin' in self.df.columns and 'destination' in self.df.columns:
            route_emissions = (
                self.df.groupby(['origin', 'destination'])['emission_kg_co2e']
                .sum()
                .sort_values(ascending=False)
                .head(10)
            )
        else:
            # Explicit dtype: an untyped empty Series has a version-dependent dtype.
            route_emissions = pd.Series(dtype=float)

        return {
            'by_scope': by_scope,
            'per_order': per_order,
            'monthly': monthly,
            'route_emissions': route_emissions
        }

    def correlation_analysis(self):
        """Compute whichever relationship metrics the available columns allow.

        Returns:
            Dict that may contain:
            - ``weight_vs_emissions``: correlation of weight and emissions.
            - ``transport_efficiency``: mean emissions per km by transport
              mode; records with zero distance are ignored.
            - ``supplier_intensity``: mean carbon intensity by supplier
              location, highest first.
        """
        corr_data = {}
        columns = self.df.columns
        if {'weight_kg', 'emission_kg_co2e'}.issubset(columns):
            corr_data['weight_vs_emissions'] = self.df[['weight_kg', 'emission_kg_co2e']].corr().iloc[0, 1]
        if {'distance_km', 'emission_kg_co2e', 'transport_mode'}.issubset(columns):
            distance = self.df['distance_km']
            # Mask zero distances so a single such record cannot turn the
            # whole mode's average into infinity.
            per_km = self.df['emission_kg_co2e'] / distance.where(distance != 0)
            corr_data['transport_efficiency'] = per_km.groupby(self.df['transport_mode']).mean()
        if {'supplier_location', 'carbon_intensity'}.issubset(columns):
            supplier_intensity = self.df.groupby('supplier_location')['carbon_intensity'].mean().sort_values(ascending=False)
            corr_data['supplier_intensity'] = supplier_intensity
        return corr_data

    def find_optimization_opportunities(self):
        """Produce plain-text reduction suggestions from the available columns.

        Covers the five highest-emitting route/mode combinations that do not
        already use rail, the three materials with the highest average
        emissions, and warehouses whose renewable share is below 30%.

        Returns:
            List of suggestion strings (possibly empty).
        """
        suggestions = []
        if {'origin', 'destination', 'transport_mode'}.issubset(self.df.columns):
            route_summary = self.df.groupby(['origin', 'destination', 'transport_mode'])['emission_kg_co2e'].sum().reset_index()
            top_routes = route_summary.sort_values(by='emission_kg_co2e', ascending=False).head(5)
            for _, row in top_routes.iterrows():
                if row['transport_mode'] != 'rail':
                    suggestions.append(f"Route {row['origin']} → {row['destination']} using {row['transport_mode']} emits {row['emission_kg_co2e']:.2f} kg CO2e. Consider switching to rail.")

        if {'material_type', 'weight_kg', 'emission_kg_co2e'}.issubset(self.df.columns):
            packaging_avg = self.df.groupby('material_type').agg({
                'weight_kg': 'mean', 'emission_kg_co2e': 'mean'
            })
            high_emitters = packaging_avg.sort_values('emission_kg_co2e', ascending=False).head(3)
            for mat in high_emitters.index:
                suggestions.append(f"Material '{mat}' has high average emissions. Consider switching to a lower impact material.")

        if {'warehouse_id', 'energy_type', 'energy_consumed'}.issubset(self.df.columns):
            warehouse_energy = self.df.groupby(['warehouse_id', 'energy_type'])['energy_consumed'].sum().unstack().fillna(0)
            # NOTE(review): when no record has energy_type 'renewable' nothing
            # is flagged at all — confirm that is the intended behaviour.
            if 'renewable' in warehouse_energy.columns:
                # Total is taken before the ratio column is added to the frame.
                total_energy = warehouse_energy.sum(axis=1)
                warehouse_energy['renewable_ratio'] = warehouse_energy['renewable'] / total_energy
                low_renew = warehouse_energy[warehouse_energy['renewable_ratio'] < 0.3]
                for warehouse in low_renew.index:
                    suggestions.append(f"Warehouse {warehouse} has low renewable energy use. Consider solar or wind integration.")

        return suggestions

    def generate_automated_insights(self):
        """Run every analysis and bundle the results.

        Returns:
            Dict with ``"Summary"`` (see ``summarize_emissions``),
            ``"Correlations"`` (see ``correlation_analysis``) and
            ``"Recommendations"`` (see ``find_optimization_opportunities``).
        """
        return {
            "Summary": self.summarize_emissions(),
            "Correlations": self.correlation_analysis(),
            "Recommendations": self.find_optimization_opportunities()
        }

# Example usage:
# df = pd.read_csv("emissions_dataset.csv")
# eda = CarbonEDA(df)
# insights = eda.generate_automated_insights()
# print(insights['Summary'])
# print(insights['Correlations'])
# print(insights['Recommendations'])


In [None]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from io import BytesIO
from datetime import datetime

# Load processed EDA class
from eda_carbon_footprint import CarbonEDA

st.set_page_config(layout="wide")
st.title("📊 E-Commerce Carbon Footprint EDA Dashboard")

# --- File Upload --- #
uploaded_file = st.file_uploader("Upload emissions dataset (CSV)", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file, parse_dates=["date"], dayfirst=True)
    # NOTE(review): the insights section below is built from the full upload,
    # so the sidebar filters do not affect it — confirm this is intended.
    eda = CarbonEDA(df)

    # --- Filters --- #
    st.sidebar.header("📌 Filters")
    date_min, date_max = df['date'].min(), df['date'].max()
    date_range = st.sidebar.date_input("Select Date Range", [date_min, date_max])

    # A range picker yields fewer than two dates while the user is still
    # choosing the end date; fall back to the full span instead of indexing
    # past the end of the selection.
    if isinstance(date_range, (list, tuple)) and len(date_range) == 2:
        start_date, end_date = date_range
    else:
        st.sidebar.warning("Select both a start and an end date; showing the full date range.")
        start_date, end_date = date_min, date_max

    # Half-open interval [start 00:00, end + 1 day): records stamped later in
    # the day on the end date are kept rather than cut off at midnight.
    range_start = pd.to_datetime(start_date).normalize()
    range_end = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)
    df_filtered = df[(df['date'] >= range_start) & (df['date'] < range_end)]

    if 'region' in df.columns:
        regions = st.sidebar.multiselect("Select Regions", df['region'].dropna().unique(), default=df['region'].dropna().unique())
        df_filtered = df_filtered[df_filtered['region'].isin(regions)]

    if 'product_category' in df.columns:
        products = st.sidebar.multiselect("Select Product Categories", df['product_category'].dropna().unique(), default=df['product_category'].dropna().unique())
        df_filtered = df_filtered[df_filtered['product_category'].isin(products)]

    if 'transport_mode' in df.columns:
        modes = st.sidebar.multiselect("Select Transport Modes", df['transport_mode'].dropna().unique(), default=df['transport_mode'].dropna().unique())
        df_filtered = df_filtered[df_filtered['transport_mode'].isin(modes)]

    # Detach from `df` so the helper column added below is written to an
    # independent frame instead of a slice of the upload.
    df_filtered = df_filtered.copy()

    # --- Overview Metrics --- #
    st.subheader("📈 Overview Metrics")
    scope_emissions = df_filtered.groupby('scope')['emission_kg_co2e'].sum()
    cols = st.columns(3)
    for i, scope in enumerate([1, 2, 3]):
        cols[i].metric(label=f"Scope {scope} Emissions (kg CO₂e)", value=round(scope_emissions.get(scope, 0), 2))

    # Monthly trend (year-month buckets, e.g. "2024-03")
    df_filtered['month'] = df_filtered['date'].dt.to_period("M").astype(str)
    month_trend = df_filtered.groupby('month')['emission_kg_co2e'].sum().reset_index()
    fig1 = px.line(month_trend, x='month', y='emission_kg_co2e', title="📅 Monthly Emission Trend", markers=True)
    st.plotly_chart(fig1, use_container_width=True)

    # Top 10 emission routes (origin → destination)
    if 'origin' in df.columns and 'destination' in df.columns:
        route_emissions = df_filtered.groupby(['origin', 'destination'])['emission_kg_co2e'].sum().sort_values(ascending=False).head(10).reset_index()
        route_emissions['route'] = route_emissions['origin'] + " → " + route_emissions['destination']
        fig2 = px.bar(route_emissions, x='route', y='emission_kg_co2e', title="🚛 Top 10 High-Emission Routes")
        st.plotly_chart(fig2, use_container_width=True)

    # --- Visualizations --- #
    st.subheader("📊 Visualizations")
    col1, col2 = st.columns(2)

    # Emissions by region (treemap)
    if 'region' in df.columns:
        with col1:
            region_emissions = df_filtered.groupby('region')['emission_kg_co2e'].sum().reset_index()
            fig3 = px.treemap(region_emissions, path=['region'], values='emission_kg_co2e', title="🗺️ Emission Intensity by Region")
            st.plotly_chart(fig3, use_container_width=True)

    # Correlation scatter: distance vs emissions
    with col2:
        if {'distance_km', 'emission_kg_co2e'}.issubset(df_filtered.columns):
            # transport_mode is optional in the upload; colour by it only when present.
            color_column = 'transport_mode' if 'transport_mode' in df_filtered.columns else None
            fig4 = px.scatter(df_filtered, x='distance_km', y='emission_kg_co2e', color=color_column, trendline='ols', title="📍 Distance vs Emissions")
            st.plotly_chart(fig4, use_container_width=True)

    # Distribution of emissions per record
    if 'emission_kg_co2e' in df_filtered.columns:
        st.subheader("📈 Emission Distribution")
        fig5 = px.histogram(df_filtered, x='emission_kg_co2e', nbins=50, title="Distribution of Emissions per Record")
        st.plotly_chart(fig5, use_container_width=True)

    # --- Automated Insights Report --- #
    st.subheader("🧠 Automated Insights & Recommendations")
    insights = eda.generate_automated_insights()
    st.write("### Summary")
    st.dataframe(insights['Summary']['by_scope'].reset_index(name='emissions (kg CO₂e)'))
    st.write("### Top Optimization Opportunities")
    for rec in insights['Recommendations']:
        st.info(rec)

    # Download Report
    def generate_report():
        """Write the insights to an in-memory Excel workbook and return the buffer.

        One sheet each for the scope summary, monthly totals, top routes
        (only when route data exists) and the recommendations.
        """
        buffer = BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            insights['Summary']['by_scope'].to_excel(writer, sheet_name='Scope Summary')
            insights['Summary']['monthly'].to_excel(writer, sheet_name='Monthly Trends')
            if not insights['Summary']['route_emissions'].empty:
                insights['Summary']['route_emissions'].to_frame(name='kg CO₂e').to_excel(writer, sheet_name='Top Routes')
            pd.DataFrame(insights['Recommendations'], columns=["Recommendations"]).to_excel(writer, sheet_name='Recommendations')
        # Rewind so the download starts at the beginning of the workbook.
        buffer.seek(0)
        return buffer

    st.download_button(
        label="📥 Download Insights Report",
        data=generate_report(),
        file_name=f"carbon_eda_report_{datetime.now().strftime('%Y%m%d')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )


ModuleNotFoundError: No module named 'streamlit'

In [None]:

import pandas as pd
from prophet import Prophet
from prophet.diagnostics import performance_metrics, cross_validation
import matplotlib.pyplot as plt
import numpy as np
import logging

logging.basicConfig(level=logging.INFO)

class EmissionForecaster:
    """Forecast emissions with Prophet on a gap-free daily series.

    ``self.df`` holds the prepared training data in Prophet's layout:
    ``ds`` (date), ``y`` (emissions) and one column per extra regressor.
    """

    def __init__(self, df, date_col='date', emission_col='emission_kg_co2e', regressors=None):
        """
        df: DataFrame with time series emissions data
        regressors: list of additional column names to include as external regressors
        """
        self.raw_df = df.copy()
        self.date_col = date_col
        self.emission_col = emission_col
        self.regressors = regressors if regressors else []
        self.model = None

        # Prepare the data for Prophet
        self.df = self.prepare_data()

    def prepare_data(self):
        """Build the daily ``ds``/``y`` training frame from ``self.raw_df``.

        Rows sharing a timestamp are collapsed first (emissions summed,
        regressors averaged) because ``asfreq`` cannot reindex duplicate
        labels. Missing days are then inserted; emissions are linearly
        interpolated and regressors are forward- then back-filled.

        Returns:
            DataFrame with ``ds``, ``y`` and the regressor columns, one row
            per calendar day between the first and last timestamp.
        """
        df = self.raw_df[[self.date_col, self.emission_col] + self.regressors].copy()
        df = df.rename(columns={self.date_col: 'ds', self.emission_col: 'y'})
        df['ds'] = pd.to_datetime(df['ds'])

        if df['ds'].duplicated().any():
            grouped = df.groupby('ds')
            # min_count=1 keeps an all-null group null instead of turning it into 0.
            collapsed = grouped[['y']].sum(min_count=1)
            for reg in self.regressors:
                collapsed[reg] = grouped[reg].mean()
            df = collapsed.reset_index()

        # Fill missing dates
        df = df.set_index('ds').asfreq('D')
        # Assign the results back: chained in-place calls on a column
        # selection are not guaranteed to modify the parent frame.
        df['y'] = df['y'].interpolate(method='linear')

        for reg in self.regressors:
            # Back-fill afterwards so leading gaps do not survive as NaN.
            df[reg] = df[reg].ffill().bfill()

        return df.reset_index()

    def add_holidays(self):
        """Return the holiday table (peak shipping dates) passed to Prophet."""
        # Example shipping holidays (can be expanded/customized)
        return pd.DataFrame({
            'holiday': 'peak_shipping',
            'ds': pd.to_datetime([
                '2022-11-25', '2022-12-15', '2023-11-24', '2023-12-15',
                '2024-11-29', '2024-12-15'
            ]),
            'lower_window': -1,
            'upper_window': 2
        })

    def train_model(self):
        """Create the Prophet model, register regressors and fit it on ``self.df``."""
        # NOTE(review): prepare_data() resamples to one row per day, so the
        # daily seasonal component may not be identifiable — confirm that
        # daily_seasonality=True is intended.
        self.model = Prophet(
            daily_seasonality=True,
            weekly_seasonality=True,
            yearly_seasonality=True,
            holidays=self.add_holidays()
        )

        for reg in self.regressors:
            self.model.add_regressor(reg)

        self.model.fit(self.df)
        logging.info("Prophet model trained successfully.")

    def forecast(self, months=6):
        """Predict over the training period plus ``months * 30`` future days.

        Regressors are extended by repeating their last observed value.

        Returns:
            The DataFrame produced by ``self.model.predict``.
        """
        future = self.model.make_future_dataframe(periods=months*30)

        # Add external regressors if needed
        for reg in self.regressors:
            last_val = self.df[reg].iloc[-1]
            future[reg] = last_val  # simple constant extrapolation

        forecast = self.model.predict(future)
        return forecast

    def plot_forecast(self, forecast):
        """Plot the forecast with the model's built-in plot and show it."""
        self.model.plot(forecast)
        plt.title("Carbon Emissions Forecast")
        plt.show()

    def detect_anomalies(self, forecast):
        """Flag observed days that deviate strongly from the forecast.

        The residual is ``observed y - yhat`` on dates present in both the
        training data and ``forecast``; a day is anomalous when its absolute
        residual exceeds two standard deviations of all residuals.
        ``forecast`` itself is not modified.

        Args:
            forecast: Frame with ``ds``, ``yhat``, ``yhat_lower``, ``yhat_upper``.

        Returns:
            DataFrame of anomalous days with columns ``ds``, ``yhat``,
            ``yhat_lower``, ``yhat_upper`` and the observed ``y``.
        """
        predicted = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
        observed = self.df[['ds', 'y']].dropna(subset=['y'])
        merged = predicted.merge(observed, on='ds', how='inner')

        residual = merged['y'] - merged['yhat']
        threshold = 2 * residual.std()
        # With fewer than two points std() is NaN and nothing is flagged.
        anomalies = merged[residual.abs() > threshold]
        return anomalies[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'y']]

    def evaluate_model(self, horizon='90 days'):
        """Cross-validate the fitted model and average its error metrics.

        Returns:
            Series with the mean of ``mae``, ``mape`` and ``rmse`` — limited
            to the metrics actually present in Prophet's metrics table.
        """
        df_cv = cross_validation(self.model, horizon=horizon)
        df_p = performance_metrics(df_cv)
        available = [metric for metric in ('mae', 'mape', 'rmse') if metric in df_p.columns]
        return df_p[available].mean()

# Example Usage:
# df = pd.read_csv('emissions_time_series.csv')
# forecaster = EmissionForecaster(df, regressors=['economic_index', 'supply_disruption'])
# forecaster.train_model()
# forecast_df = forecaster.forecast(months=12)
# print(forecaster.evaluate_model())
# anomalies = forecaster.detect_anomalies(forecast_df)
# forecaster.plot_forecast(forecast_df)


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import shap

class CarbonLSTMPredictor:
    """Sequence model that predicts the next emission value from past features.

    Features and target are scaled together by one ``MinMaxScaler`` (target in
    the last column). Each training sample is a window of ``sequence_length``
    consecutive feature rows; its label is the scaled target of the row that
    follows the window.
    """

    def __init__(self, sequence_length=30, feature_columns=None):
        """Configure the window length and, optionally, the feature columns.

        Args:
            sequence_length: Number of consecutive rows per input window.
            feature_columns: Feature column names; when None, every column
                except the target is used (decided in ``preprocess_data``).
        """
        self.sequence_length = sequence_length
        self.feature_columns = feature_columns
        self.model = None
        self.scaler = MinMaxScaler()
        self.history = None

    def preprocess_data(self, df, target_column):
        """Scale the data and cut it into sliding windows.

        Rows containing any null value are dropped first. The scaler is
        (re)fitted on the feature columns followed by the target column.

        Args:
            df: Source DataFrame, ordered in time.
            target_column: Name of the column to predict.

        Returns:
            ``(X, y)`` where ``X`` has shape
            ``(samples, sequence_length, n_features)`` and ``y`` has shape
            ``(samples,)``.

        Raises:
            ValueError: If fewer than ``sequence_length + 1`` complete rows
                remain, i.e. not even one window with a label can be built.
        """
        df = df.dropna()

        if self.feature_columns is None:
            self.feature_columns = [col for col in df.columns if col != target_column]

        window = self.sequence_length
        if len(df) <= window:
            raise ValueError(
                f"Need more than {window} complete rows to build a sequence, got {len(df)}"
            )

        columns = list(self.feature_columns) + [target_column]
        scaled = self.scaler.fit_transform(df[columns])

        # Window ending just before row `end` -> label is the target at row `end`.
        X = np.array([scaled[end - window:end, :-1] for end in range(window, len(scaled))])
        y = scaled[window:, -1].copy()
        return X, y

    def build_model(self, input_shape):
        """Build and compile the two-layer LSTM regressor into ``self.model``.

        Args:
            input_shape: ``(sequence_length, n_features)``, e.g. ``X.shape[1:]``.
        """
        model = Sequential()
        model.add(LSTM(64, return_sequences=True, input_shape=input_shape))
        model.add(Dropout(0.2))
        model.add(LSTM(32))
        model.add(Dropout(0.2))
        model.add(Dense(1))

        def custom_loss(y_true, y_pred):
            # Blend of squared error (weight 0.7) and absolute error (weight 0.3).
            return tf.reduce_mean(tf.square(y_true - y_pred)) * 0.7 + tf.reduce_mean(tf.abs(y_true - y_pred)) * 0.3

        model.compile(optimizer='adam', loss=custom_loss, metrics=['mae'])
        self.model = model

    def train(self, X, y, epochs=50, batch_size=32, val_split=0.2):
        """Fit the model and keep the Keras history in ``self.history``.

        Uses early stopping, learning-rate reduction on plateau and a
        checkpoint written to ``best_lstm_model.h5``, all driven by
        ``val_loss``.
        """
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', patience=5, factor=0.5, verbose=1),
            ModelCheckpoint("best_lstm_model.h5", save_best_only=True, monitor="val_loss")
        ]
        self.history = self.model.fit(
            X, y,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=val_split,
            callbacks=callbacks,
            verbose=1
        )

    def predict(self, X):
        """Return the model's predictions for ``X`` in scaled units."""
        preds = self.model.predict(X)
        return preds

    def _to_original_units(self, last_step_features, scaled_target):
        """Undo the scaling of a target vector.

        The scaler was fitted on ``[features..., target]``, so the target is
        re-attached to a full-width feature block before inverting; only the
        target column of the result is returned.
        """
        target_column = np.asarray(scaled_target).reshape(-1, 1)
        restored = self.scaler.inverse_transform(np.hstack([last_step_features, target_column]))
        return restored[:, -1]

    def evaluate(self, X, y_true):
        """Compute MAE and RMSE in the target's original units.

        Args:
            X: Input windows as returned by ``preprocess_data``.
            y_true: Scaled targets matching ``X``.

        Returns:
            ``(mae, rmse)``.
        """
        y_pred = self.model.predict(X)
        # X holds feature columns only (the target was already excluded when
        # the windows were built), so every column of the last step is kept.
        last_step = X[:, -1, :]
        y_true = self._to_original_units(last_step, y_true)
        y_pred = self._to_original_units(last_step, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        # Take the root explicitly; the `squared` keyword is not available in
        # every scikit-learn release.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        return mae, rmse

    def explain_feature_importance(self, X_sample):
        """Plot SHAP values for up to the first 100 windows of ``X_sample``.

        NOTE(review): indexing ``shap_values[0]`` assumes the explainer
        returns one array per model output, and the summary plot receives the
        3-D window array as-is — confirm both against the installed shap
        version.
        """
        explainer = shap.DeepExplainer(self.model, X_sample[:100])
        shap_values = explainer.shap_values(X_sample[:100])
        shap.summary_plot(shap_values[0], X_sample[:100], feature_names=self.feature_columns)

# Example usage:
# df = pd.read_csv("carbon_data.csv")
# lstm_model = CarbonLSTMPredictor(sequence_length=30, feature_columns=["temperature", "economic_index"])
# X, y = lstm_model.preprocess_data(df, target_column="emission_kg_co2e")
# lstm_model.build_model(X.shape[1:])
# lstm_model.train(X, y)
# predictions = lstm_model.predict(X[-10:])
# mae, rmse = lstm_model.evaluate(X, y)
# lstm_model.explain_feature_importance(X[-100:])


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

class SupplierCarbonOptimizer:
    """Cluster suppliers by carbon-related features and suggest switches.

    Typical call order: ``preprocess`` → ``determine_optimal_k`` (optional) →
    ``fit_clusters`` → ``label_clusters`` → ``recommend_switch`` /
    ``plot_clusters``.
    """

    def __init__(self, df):
        """Copy the supplier table and set up the scaler and feature list."""
        self.df = df.copy()
        self.scaler = StandardScaler()
        self.kmeans = None
        self.features = [
            'carbon_intensity_per_unit',
            'transport_distance_km',
            'transport_mode_score',
            'renewable_energy_percent',
            'packaging_sustainability_score'
        ]

    def preprocess(self):
        """Drop rows with missing features and standardise the rest into ``self.X``."""
        self.df = self.df.dropna(subset=self.features)
        self.X = self.scaler.fit_transform(self.df[self.features])

    def determine_optimal_k(self, max_k=10):
        """Plot inertia against the number of clusters (elbow method).

        Args:
            max_k: Largest cluster count to try; capped at the number of
                samples, since k-means cannot form more clusters than that.

        Returns:
            List of inertia values for k = 1 .. the largest k tried.
        """
        upper_k = min(max_k, len(self.X))
        k_values = range(1, upper_k + 1)
        distortions = []
        for k in k_values:
            # n_init is pinned because its default differs between
            # scikit-learn releases, which would change the curve.
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(self.X)
            distortions.append(kmeans.inertia_)

        plt.figure(figsize=(8, 4))
        plt.plot(k_values, distortions, 'bo-')
        plt.xlabel('Number of clusters (k)')
        plt.ylabel('Inertia')
        plt.title('Elbow Method for Optimal k')
        plt.grid(True)
        plt.show()
        return distortions

    def fit_clusters(self, n_clusters):
        """Fit k-means on ``self.X`` and store each row's cluster in ``self.df['cluster']``."""
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.df['cluster'] = self.kmeans.fit_predict(self.X)

    def label_clusters(self):
        """Name each cluster from the sign of its centroid's first two features.

        Features are standardised, so a negative centroid coordinate means
        "below the average supplier". Below average on both carbon intensity
        and transport distance → 'Low Carbon Efficient'; above average on
        both → 'High Carbon Intensive'; anything else → 'Moderate Carbon
        Supplier'. The names are written to ``self.df['cluster_label']``.
        """
        centroids = self.kmeans.cluster_centers_
        labels = []
        for center in centroids:
            if center[0] < 0 and center[1] < 0:  # low intensity, low distance
                labels.append('Low Carbon Efficient')
            elif center[0] > 0 and center[1] > 0:
                labels.append('High Carbon Intensive')
            else:
                labels.append('Moderate Carbon Supplier')
        self.df['cluster_label'] = self.df['cluster'].map(dict(enumerate(labels)))

    def recommend_switch(self, supplier_id_col='supplier_id'):
        """Suggest a low-carbon replacement for every high-carbon supplier.

        Each 'High Carbon Intensive' row is paired with the
        'Low Carbon Efficient' supplier (other than itself) that has the
        lowest carbon intensity, so results are reproducible and show the
        largest available reduction.

        Args:
            supplier_id_col: Name of the supplier identifier column.

        Returns:
            DataFrame with columns ``current_supplier``,
            ``suggested_supplier`` and ``potential_reduction_kg_co2e``
            (empty, but with those columns, when nothing can be recommended).
        """
        columns = ['current_supplier', 'suggested_supplier', 'potential_reduction_kg_co2e']
        labels = self.df['cluster_label']
        # Built once, outside the loop, and ordered best-first.
        low_carbon = self.df[labels == 'Low Carbon Efficient'].sort_values(
            'carbon_intensity_per_unit', kind='stable'
        )
        high_carbon = self.df[labels == 'High Carbon Intensive']

        recommendations = []
        for _, row in high_carbon.iterrows():
            alternatives = low_carbon[low_carbon[supplier_id_col] != row[supplier_id_col]]
            if alternatives.empty:
                continue
            alt = alternatives.iloc[0]
            reduction = row['carbon_intensity_per_unit'] - alt['carbon_intensity_per_unit']
            recommendations.append({
                'current_supplier': row[supplier_id_col],
                'suggested_supplier': alt[supplier_id_col],
                'potential_reduction_kg_co2e': reduction
            })
        return pd.DataFrame(recommendations, columns=columns)

    def plot_clusters(self):
        """Scatter the first two standardised features, coloured by cluster."""
        scaled_df = pd.DataFrame(self.X, columns=self.features)
        # Assign by position: scaled_df has a fresh 0..n-1 index, while
        # self.df keeps its original labels, so index alignment would
        # mismatch or blank out the cluster values.
        scaled_df['cluster'] = self.df['cluster'].to_numpy()
        plt.figure(figsize=(10, 6))
        sns.scatterplot(
            x=scaled_df[self.features[0]],
            y=scaled_df[self.features[1]],
            hue=scaled_df['cluster'],
            palette='Set2'
        )
        plt.title('Supplier Clusters (based on Carbon Optimization)')
        plt.xlabel(self.features[0])
        plt.ylabel(self.features[1])
        plt.grid(True)
        plt.show()

# Example Usage:
# df = pd.read_csv('supplier_emission_data.csv')
# optimizer = SupplierCarbonOptimizer(df)
# optimizer.preprocess()
# optimizer.determine_optimal_k()
# optimizer.fit_clusters(n_clusters=3)
# optimizer.label_clusters()
# recommendations = optimizer.recommend_switch()
# optimizer.plot_clusters()
# print(recommendations)
