In [1]:
from pathlib import Path
import sys

import pandas as pd
import numpy as np

REPO_ROOT = Path.cwd().resolve().parents[3]
BACKEND_ROOT = REPO_ROOT / "backend"
DATA_DIR = BACKEND_ROOT / "app" / "data" / "impact_training_data"

# Load the four inputs (use str(Path) for pandas compatibility)
re_ee = pd.read_csv(str(DATA_DIR / "re_ee.csv"))
transport = pd.read_csv(str(DATA_DIR / "transport.csv"))
water = pd.read_csv(str(DATA_DIR / "water_urban_infra.csv"))
blue = pd.read_csv(str(DATA_DIR / "blue.csv"))

for name, df in [("re_ee", re_ee), ("transport", transport),
                ("water", water), ("blue", blue)]:
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")

re_ee: 83 rows, 16 columns
transport: 64 rows, 9 columns
water: 19 rows, 9 columns
blue: 19 rows, 10 columns


In [2]:
# Cell 2: helper functions

import re

def find_target_results_column(df: pd.DataFrame) -> str | None:
    """
    For transport/water/blue: try to find a 'Target Results' style column.
    """
    for col in df.columns:
        if "target result" in col.lower():
            return col
    return None


def extract_co2_from_text(s: str):
    """
    Extract a numeric CO2 value in tons from free text like:
    'Avoids 15,000 tCO2e per year' or '10,500 tons of CO2 avoided'
    Returns float or None.
    """
    if not isinstance(s, str):
        return None

    text = s.lower()

    # look for patterns like '15,000', '10.5', etc. near 't' or 'ton'
    match = re.search(r"([\d.,]+)\s*(?:t|ton)", text)
    if not match:
        return None

    num_str = match.group(1).replace(",", "")
    try:
        return float(num_str)
    except ValueError:
        return None


In [3]:
# Cell 3: process RE/EE data (has explicit numeric label)

# find description column (if helper exists)
try:
    re_ee_desc_col = find_description_column(re_ee)
except NameError:
    re_ee_desc_col = None

# Known label names we want to map to normalized names
re_ee_label_col = 'Annual GHG Emission Avoided (ton of CO2 Equivalent)b'
proj_candidate = None
# Try to find the project/name/description column robustly
for c in re_ee.columns:
    cl = c.lower()
    if ('project name' in cl and 'description' in cl) or ('project' in cl and 'description' in cl):
        proj_candidate = c
        break
# Fallback to the exact known header if present
if proj_candidate is None and 'Project Name \n(Number/Year Loan Approved) \nand Description'.lower() in '\n'.join(re_ee.columns).lower():
    proj_candidate = 'Project Name \n(Number/Year Loan Approved) \nand Description'

# Build list of columns to keep (only keep those that exist)
keep_cols = []
if proj_candidate and proj_candidate in re_ee.columns:
    keep_cols.append(proj_candidate)
if re_ee_label_col in re_ee.columns:
    keep_cols.append(re_ee_label_col)
# keep the Ocean Finance Framework focus area if present
focus_col = None
for c in re_ee.columns:
    if 'ocean finance framework - primary focus area' in c.lower():
        focus_col = c
        keep_cols.append(c)

if not keep_cols:
    raise ValueError('No expected columns found in re_ee to build output dataframe')

# Create new dataframe with only the kept columns and rename them
col_renames = {}
if proj_candidate:
    col_renames[proj_candidate] = 'project_description'
if re_ee_label_col in re_ee.columns:
    col_renames[re_ee_label_col] = 'actual_co2_tons'
if focus_col:
    col_renames[focus_col] = 'primary_focus_area'

re_ee_clean = re_ee[keep_cols].rename(columns=col_renames).copy()

# Clean the actual_co2_tons column: remove non-numeric characters and convert to nullable Int64
if 'actual_co2_tons' in re_ee_clean.columns:
    re_ee_clean['actual_co2_tons'] = re_ee_clean['actual_co2_tons'].astype(str).str.replace('[^0-9.\-]', '', regex=True)
    re_ee_clean['actual_co2_tons'] = pd.to_numeric(re_ee_clean['actual_co2_tons'], errors='coerce')
    # Optionally drop rows missing numeric CO2 values
    re_ee_clean = re_ee_clean[re_ee_clean['actual_co2_tons'].notna()].copy()
    # Convert to integer using pandas nullable integer type
    re_ee_clean['actual_co2_tons'] = re_ee_clean['actual_co2_tons'].astype('Int64')

# Show result
print('Output columns:', list(re_ee_clean.columns))
re_ee_clean

Output columns: ['project_description', 'actual_co2_tons']


Unnamed: 0,project_description,actual_co2_tons
0,Indonesia: Java–Bali Electricity Distribution ...,330000
1,"China, People's Republic of: Integrated Renewa...",1000000
2,Papua New Guinea: Town Electrification Investm...,35000
4,"China, People’s Republic of:Agricultural and\n...",255200
5,Regional: Southeast Asia Energy Efficiency Pro...,90000
...,...,...
78,Bhutan: Distributed Solar for Public Infrastru...,39735
79,Solomon Islands: Renewable Energy Development ...,5600
80,Azerbaijan: Bilasuvar Solar Power Project (453...,426152
81,Azerbaijan: Banka Solar Power Project (4536/FY...,302972


In [4]:
# Cell 4: process transport / water / blue via text extraction

def process_text_dataset(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Try to detect a description/project column and extract CO2 from a target-results column.

    Detection strategy (in order):
    - column containing both 'project' and 'description'
    - column containing 'project' and 'name' or 'title'
    - any column containing 'description'
    - fallback to first object-typed column
    """
    # detect description column
    desc_col = None
    cols = list(df.columns)
    for c in cols:
        lc = c.lower()
        if 'project' in lc and 'description' in lc:
            desc_col = c
            break
    if desc_col is None:
        for c in cols:
            lc = c.lower()
            if 'project' in lc and ('name' in lc or 'title' in lc):
                desc_col = c
                break
    if desc_col is None:
        for c in cols:
            if 'description' in c.lower():
                desc_col = c
                break
    if desc_col is None:
        # fallback: first object/string column
        obj_cols = [c for c in cols if df[c].dtype == 'object']
        if obj_cols:
            desc_col = obj_cols[0]

    if desc_col is None:
        print(f"[{name}] Could not detect a description/project column. Available columns:\n  {cols}")
        return pd.DataFrame(columns=["project_description", "actual_co2_tons"])

    target_col = find_target_results_column(df)
    if target_col is None:
        print(f"[{name}] No 'Target Results' style column found. Skipping labels.")
        return pd.DataFrame(columns=["project_description", "actual_co2_tons"])

    # select columns safely
    if desc_col not in df.columns or target_col not in df.columns:
        print(f"[{name}] Required columns not found: desc={desc_col in df.columns}, target={target_col in df.columns}")
        return pd.DataFrame(columns=["project_description", "actual_co2_tons"])

    temp = df[[desc_col, target_col]].copy()
    temp = temp.rename(columns={desc_col: "project_description"})

    temp["actual_co2_tons"] = temp[target_col].apply(extract_co2_from_text)
    before = temp.shape[0]
    temp = temp.dropna(subset=["actual_co2_tons"]) 
    after = temp.shape[0]

    print(f"[{name}] kept {after}/{before} rows with numeric CO2 extracted.")
    return temp[["project_description", "actual_co2_tons"]]


transport_clean = process_text_dataset(transport, "transport")

transport_clean

[transport] kept 19/64 rows with numeric CO2 extracted.


Unnamed: 0,project_description,actual_co2_tons
0,Turkmenistan: North–South Railway (2737/FY2011...,26800.0
4,Bangladesh: Greater Dhaka Sustainable Urban Tr...,40000.0
7,"China, People's Republic of: Railway Energy Ef...",2008.0
9,"China, People's Republic of: Railway Energy Ef...",2009.0
10,Bangladesh: SASEC Railway Connectivity—Akhaura...,14.0
20,"Thailand: Bangkok Mass Rapid Transit (3669, 36...",50000.0
22,Uzbekistan: Railway Efficiency Improvement Pro...,900000.0
26,Pakistan: Karachi Bus Rapid Transit Red Line P...,77979.0
34,"China, People's Republic of: Jilin Yanji Low-C...",60000.0
38,India: Delhi–Meerut Regional Rapid Transit Sys...,258035.0


In [None]:
# Cell 5: combine all datasets & add log transform

all_data = pd.concat(
    [re_ee_clean, transport_clean],
    ignore_index=True
)

all_data["log_co2"] = np.log1p(all_data["actual_co2_tons"])

print("all_data shape:", all_data.shape)
print(all_data["actual_co2_tons"].describe())

all_data.head()


all_data shape: (96, 3)
count             96.0
mean     346930.320844
std      807911.577863
min                0.0
25%            29570.0
50%            88984.0
75%           381250.0
max          7000000.0
Name: actual_co2_tons, dtype: Float64


Unnamed: 0,project_description,actual_co2_tons,log_co2
0,Indonesia: Java–Bali Electricity Distribution ...,330000.0,12.706851
1,"China, People's Republic of: Integrated Renewa...",1000000.0,13.815512
2,Papua New Guinea: Town Electrification Investm...,35000.0,10.463132
3,"China, People’s Republic of:Agricultural and\n...",255200.0,12.449807
4,Regional: Southeast Asia Energy Efficiency Pro...,90000.0,11.407576


In [None]:
output_path = BACKEND_ROOT / 'app' / 'data' / 'impact_training_dataset.csv'
all_data.to_csv(output_path, index=False)
output_path

OSError: Cannot save file into a non-existent directory: 'data'