In [None]:
!pip install fastapi uvicorn scikit-learn matplotlib seaborn pyngrok streamlit pandas openpyxl



In [None]:
import asyncio
import threading
import time
from typing import Optional

import numpy as np
import pandas as pd
import streamlit as st
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from pyngrok import ngrok
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Global Warming Potential factor for each substance label.
# Labels that are already CO2-equivalent (or aggregates) carry a factor of 1.
# NOTE(review): 25 (CH4) and 298 (N2O) look like the IPCC AR4 100-year values,
# while the key names reference AR5 -- confirm which assessment is intended.
substance_gwp = dict(
    CO2=1,
    CH4=25,
    N2O=298,
    CO2eq_AR5=1,
    CO2eq_AR5_F_gases=1,
    Total_GHG=1,
)

In [None]:
def load_and_preprocess_data(source="GHG_by_sector_and_Country.xlsx"):
    """Load the emissions table and reshape it to one row per
    (Country, Year, Substance) with one column per sector.

    Args:
        source: Path to the Excel workbook (defaults to the original
            hard-coded file name), or an already-loaded wide DataFrame with
            'Substance', 'Sector', 'Country' and one column per year.

    Returns:
        pandas.DataFrame with columns 'Country', 'Year', 'Substance' plus the
        eight sector columns (spaces in sector names replaced by '_'), or
        None when loading/reshaping fails (the error is printed).
    """
    id_columns = ['Substance', 'Sector', 'Country']
    sector_columns = ['Agriculture', 'Buildings', 'Fuel_Exploitation',
                      'Industrial_Combustion', 'Power_Industry', 'Processes',
                      'Transport', 'Waste']

    def _is_year_label(label):
        # Year headers may arrive as ints or as 4-digit strings depending on
        # how the workbook stores its header row.
        if isinstance(label, bool):
            return False
        if isinstance(label, (int, np.integer)):
            return True
        if isinstance(label, str):
            text = label.strip()
            return len(text) == 4 and text.isdigit()
        return False

    try:
        # Accept either a path or a ready-made frame; copy so the caller's
        # frame is never mutated.
        if isinstance(source, pd.DataFrame):
            data = source.copy()
        else:
            data = pd.read_excel(source)

        year_labels = [col for col in data.columns if _is_year_label(col)]
        if not year_labels:
            raise ValueError("no year columns found in the emissions table")

        # Normalise every year header to a plain int so 'Year' is numeric.
        data = data.rename(columns={col: int(col) for col in year_labels})
        year_columns = [int(col) for col in year_labels]

        # Melt the dataframe to convert years to rows
        melted_data = pd.melt(
            data,
            id_vars=id_columns,
            value_vars=year_columns,
            var_name='Year',
            value_name='Emissions'
        )
        melted_data['Year'] = melted_data['Year'].astype(int)

        # Coerce emissions to numeric and treat missing/unparseable cells as 0
        melted_data['Emissions'] = pd.to_numeric(
            melted_data['Emissions'], errors='coerce'
        ).fillna(0)

        # Create pivot table to get sectors as columns
        pivot_data = melted_data.pivot_table(
            index=['Country', 'Year', 'Substance'],
            columns='Sector',
            values='Emissions',
            aggfunc='sum'
        ).reset_index()

        # Clean column names
        pivot_data.columns.name = None
        pivot_data.columns = pivot_data.columns.str.strip().str.replace(' ', '_')

        # A sector absent from the file is treated as zero emissions instead
        # of raising KeyError in the fillna below.
        for sector in sector_columns:
            if sector not in pivot_data.columns:
                pivot_data[sector] = 0.0
        pivot_data[sector_columns] = pivot_data[sector_columns].fillna(0)

        return pivot_data

    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"Error loading data: {e}")
        return None

In [None]:
def create_pipeline(data):
    """Assemble the unfitted preprocessing + random-forest pipeline.

    Args:
        data: Accepted for call-site compatibility; it is not read here.

    Returns:
        tuple: (pipeline, numerical_features, categorical_features) where the
        two lists name the columns routed to the scaler and to the one-hot
        encoder respectively.
    """
    numerical_features = [
        'Agriculture', 'Buildings', 'Fuel_Exploitation',
        'Industrial_Combustion', 'Power_Industry', 'Processes',
        'Transport', 'Waste',
    ]
    categorical_features = ['Country', 'Substance']

    # Sector columns are standardised; Country/Substance are one-hot encoded.
    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse_output=True, handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', scaler, numerical_features),
            ('cat', encoder, categorical_features),
        ],
        n_jobs=-1,
    )

    # Random forest settings, gathered in one place.
    forest_params = dict(
        n_estimators=100,
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    )
    forest = RandomForestRegressor(**forest_params)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ])

    return pipeline, numerical_features, categorical_features

In [None]:
# Pydantic model for input validation
def _sector_emission_field(description):
    # Shared declaration for the per-sector inputs: default 0.0, must be >= 0.
    return Field(0.0, ge=0, description=description)


# Request schema for a prediction: country, year (1970-2023), substance and
# one non-negative emission value per sector.
class EmissionInput(BaseModel):
    # Required identifying fields
    country: str = Field(..., description="Country name")
    year: int = Field(..., ge=1970, le=2023, description="Year of prediction")
    substance: str = Field(..., description="Type of greenhouse gas (CO2, GWP_100_AR5_CH4, etc.)")

    # Optional per-sector emissions
    agriculture: float = _sector_emission_field("Agriculture emissions")
    buildings: float = _sector_emission_field("Buildings emissions")
    fuel_exploitation: float = _sector_emission_field("Fuel exploitation emissions")
    industrial_combustion: float = _sector_emission_field("Industrial combustion emissions")
    power_industry: float = _sector_emission_field("Power industry emissions")
    processes: float = _sector_emission_field("Process emissions")
    transport: float = _sector_emission_field("Transport emissions")
    waste: float = _sector_emission_field("Waste emissions")

In [None]:
# FastAPI endpoint
# No visible cell creates the application object, so create it here unless an
# earlier cell already did (keeps any routes registered elsewhere).
if "app" not in globals():
    app = FastAPI()


@app.post("/predict")
async def predict_emissions(input_data: EmissionInput):
    """Predict total emissions for one country/year/substance and derive credits.

    The fitted pipeline (`model`, created by the training cell) is trained on
    log1p(total emissions), so its output is mapped back with expm1 before it
    is returned or compared with the submitted sector totals.

    Args:
        input_data: Validated request payload.

    Returns:
        dict with the echoed country/year/substance, the predicted emissions
        (original scale), the sum of the submitted sector emissions and the
        non-negative difference between the two.

    Raises:
        HTTPException: status 400 carrying the error text if anything fails.
    """
    try:
        sector_values = {
            'Agriculture': input_data.agriculture,
            'Buildings': input_data.buildings,
            'Fuel_Exploitation': input_data.fuel_exploitation,
            'Industrial_Combustion': input_data.industrial_combustion,
            'Power_Industry': input_data.power_industry,
            'Processes': input_data.processes,
            'Transport': input_data.transport,
            'Waste': input_data.waste,
        }

        # Single-row frame using the same column names the pipeline was fitted on
        df = pd.DataFrame({
            'Country': [input_data.country],
            'Year': [input_data.year],
            'Substance': [input_data.substance],
            **{column: [value] for column, value in sector_values.items()},
        })

        # The model predicts on the log1p scale; undo it so the prediction is
        # comparable with the linear-scale totals below.
        log_prediction = model.predict(df)[0]
        prediction = float(np.expm1(log_prediction))

        # Calculate total current emissions
        total_emissions = sum(sector_values.values())

        # Calculate carbon credits (never negative)
        carbon_credits = max(0, total_emissions - prediction)

        return {
            "country": input_data.country,
            "year": input_data.year,
            "substance": input_data.substance,
            "predicted_emissions": float(prediction),
            "total_current_emissions": float(total_emissions),
            "potential_carbon_credits": float(carbon_credits)
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


In [None]:
# Streamlit interface
def create_streamlit_app():
    """Render the Streamlit front end for the emissions predictor.

    The sidebar collects exactly the fields declared on `EmissionInput`
    (country, year, substance and the eight sector emissions). Country and
    substance choices come from the module-level `data` frame. On "Calculate"
    the request is validated, passed to `predict_emissions`, and the returned
    fields are displayed; any failure is shown with `st.error`.

    The former "Baseline Emissions" input is not rendered because neither
    `EmissionInput` nor the prediction result has a baseline field.
    """
    # EmissionInput field name -> sidebar label
    sector_labels = {
        'agriculture': "Agriculture",
        'buildings': "Buildings",
        'fuel_exploitation': "Fuel Exploitation",
        'industrial_combustion': "Industrial Combustion",
        'power_industry': "Power Industry",
        'processes': "Processes",
        'transport': "Transport",
        'waste': "Waste",
    }

    st.title("Carbon Credits Prediction System")

    # Sidebar inputs
    st.sidebar.header("Input Parameters")

    country = st.sidebar.selectbox("Country", sorted(data['Country'].unique()))
    substance = st.sidebar.selectbox("Substance", sorted(data['Substance'].unique()))
    # Bounds mirror the ge/le constraints on EmissionInput.year
    year = st.sidebar.number_input("Year", min_value=1970, max_value=2023, value=2023)

    st.sidebar.subheader("Sector Emissions")
    sector_inputs = {
        field: st.sidebar.number_input(label, min_value=0.0, value=0.0)
        for field, label in sector_labels.items()
    }

    if st.sidebar.button("Calculate"):
        try:
            # Built inside the try so validation errors reach st.error too
            input_data = EmissionInput(
                country=country,
                year=int(year),
                substance=substance,
                **sector_inputs
            )

            # predict_emissions is a coroutine function; run it to completion
            result = asyncio.run(predict_emissions(input_data))

            st.subheader("Results")
            st.write(f"**Country:** {result['country']}")
            st.write(f"**Year:** {result['year']}")
            st.write(f"**Substance:** {result['substance']}")
            st.write(f"**Predicted Emissions:** {result['predicted_emissions']:.2f}")
            st.write(f"**Total Current Emissions:** {result['total_current_emissions']:.2f}")
            st.write(f"**Potential Carbon Credits:** {result['potential_carbon_credits']:.2f}")

        except Exception as e:
            st.error(f"An error occurred: {e}")

In [None]:
# Main execution: load data, train the pipeline, evaluate it and report
# feature importances.
if __name__ == "__main__":
    # Load and preprocess data
    print("Loading data...")
    data = load_and_preprocess_data()
    if data is None:
        # The loader reports failures by returning None; stop with a clear
        # message instead of failing later with an unrelated TypeError.
        raise RuntimeError("Emissions data could not be loaded; see the error printed above.")

    # Create pipeline and get features
    pipeline, numerical_features, categorical_features = create_pipeline(data)

    # Prepare features and target with log transformation for emissions
    print("\nPreparing features and target...")
    X = data[categorical_features + numerical_features]
    # Log transform the target (emissions) to handle skewness
    # NOTE(review): the target is the sum of the same sector columns that are
    # fed to the model as features, so the reported scores measure how well the
    # forest reproduces that sum -- confirm this is the intended task.
    y = np.log1p(data[numerical_features].sum(axis=1))

    # Split data with stratification by Substance
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=X['Substance']  # Stratify by substance type
    )

    # Train model
    print("\nTraining model...")
    model = pipeline.fit(X_train, y_train)

    # Evaluate model
    print("\nEvaluating model...")
    y_pred = model.predict(X_test)

    # Convert predictions back from log scale
    y_test_orig = np.expm1(y_test)
    y_pred_orig = np.expm1(y_pred)

    # Calculate metrics
    r2 = r2_score(y_test_orig, y_pred_orig)
    mse = mean_squared_error(y_test_orig, y_pred_orig)
    rmse = np.sqrt(mse)

    print("\nModel Performance:")
    print(f"R2 Score: {r2:.3f}")
    print(f"MSE: {mse:.3f}")
    print(f"RMSE: {rmse:.3f}")

    # Get feature importances
    print("\nCalculating feature importances...")
    try:
        cat_features = (model.named_steps['preprocessor']
                        .named_transformers_['cat']
                        .get_feature_names_out(categorical_features))
        # The ColumnTransformer emits its blocks in declaration order:
        # 'num' (sector columns) first, then 'cat' (one-hot columns). The
        # names must follow the same order or every importance is mislabelled.
        feature_names = np.concatenate([numerical_features, cat_features])

        importances = model.named_steps['regressor'].feature_importances_
        if len(feature_names) != len(importances):
            raise ValueError(
                f"{len(feature_names)} feature names for {len(importances)} importances"
            )
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        # Group importances by feature type
        print("\nTop Features by Category:")

        # Country importances
        print("\nTop 5 Country Importances:")
        country_imp = feature_importance[feature_importance['feature'].str.startswith('Country_')].head()
        print(country_imp)

        # Substance importances
        print("\nSubstance Importances:")
        substance_imp = feature_importance[feature_importance['feature'].str.startswith('Substance_')]
        print(substance_imp)

        # Sector importances
        print("\nSector Importances:")
        sector_imp = feature_importance[feature_importance['feature'].isin(numerical_features)]
        print(sector_imp)

    except Exception as e:
        print(f"Could not calculate feature importances: {e}")

Loading data...

Preparing features and target...
Splitting data...

Training model...

Evaluating model...

Model Performance:
R2 Score: 0.968
MSE: 26432.170
RMSE: 162.580

Calculating feature importances...

Top Features by Category:

Top 5 Country Importances:
             feature  importance
1    Country_Algeria    0.167014
2     Country_Angola    0.144973
3   Country_Anguilla    0.106788
5  Country_Argentina    0.103368
6    Country_Armenia    0.102235

Substance Importances:
                           feature  importance
213      Substance_GWP_100_AR5_N2O    0.000275
211      Substance_GWP_100_AR5_CH4    0.000107
212  Substance_GWP_100_AR5_F-gases    0.000005

Sector Importances:
                   feature  importance
219              Processes    0.014451
220              Transport    0.013526
221                  Waste    0.007823
217  Industrial_Combustion    0.000127
214            Agriculture    0.000107
218         Power_Industry    0.000055
216      Fuel_Exploitation    0.