In [1]:
import os
import warnings

import numpy as np
import pandas as pd

In [2]:
# All locations are resolved relative to the notebook's working directory.
SCRIPT_DIR_PATH = os.getcwd()
PARENT_DIR_PATH = os.path.split(SCRIPT_DIR_PATH)[0]
SSP_MODELING_DIR_PATH = os.path.split(PARENT_DIR_PATH)[0]

# "data" sits beside the notebook folder; "ssp_run_output" sits one level higher.
CW_DIR_PATH = os.path.join(PARENT_DIR_PATH, "data")
SSP_OUTPUT_DIR_PATH = os.path.join(SSP_MODELING_DIR_PATH, "ssp_run_output")

In [3]:
# Region name: used to build the emission-target file names and to filter output rows.
REGION = "morocco"
# Name of the run folder inside SSP_OUTPUT_DIR_PATH whose results are loaded.
RUN_ID = "sisepuede_results_sisepuede_run_2025-09-29T15;34;07.428105"

## Load DFs

In [4]:
# Wide-format results of the selected run. The file name is derived from RUN_ID
# so that the run folder and the file inside it can never refer to different runs.
output_df = pd.read_csv(
    os.path.join(SSP_OUTPUT_DIR_PATH, RUN_ID, f"{RUN_ID}_WIDE_INPUTS_OUTPUTS.csv")
)
output_df.head()

Unnamed: 0,primary_id,region,time_period,area_agrc_crops_bevs_and_spices,area_agrc_crops_cereals,area_agrc_crops_fibers,area_agrc_crops_fruits,area_agrc_crops_herbs_and_other_perennial_crops,area_agrc_crops_nuts,area_agrc_crops_other_annual,...,yf_agrc_herbs_and_other_perennial_crops_tonne_ha,yf_agrc_nuts_tonne_ha,yf_agrc_other_annual_tonne_ha,yf_agrc_other_woody_perennial_tonne_ha,yf_agrc_pulses_tonne_ha,yf_agrc_rice_tonne_ha,yf_agrc_sugar_cane_tonne_ha,yf_agrc_tubers_tonne_ha,yf_agrc_vegetables_and_vines_tonne_ha,yf_lndu_supremum_pastures_tonne_per_ha
0,0,morocco,0,0.0,4609331.0,4085.468561,1190441.0,28242.323335,152999.613918,950343.230957,...,12.059233,2.9468,6.177415,0.0,2.755621,8.636027,73.140598,37.5471,28.821448,92.81
1,0,morocco,1,0.0,4560555.0,4042.236066,1177843.0,27943.462606,151380.569494,940286.683299,...,12.5398,3.455483,7.165802,0.0,3.259699,8.228317,71.163825,39.0001,29.765171,92.81
2,0,morocco,2,0.0,4672405.0,4141.374219,1206731.0,28628.792023,155093.264619,963347.752475,...,12.161233,3.041817,11.119348,0.0,3.958685,8.302446,89.448975,39.119475,28.720595,92.81
3,0,morocco,3,0.0,4759936.0,4218.957338,1229337.0,29165.114227,157998.72991,981394.786726,...,12.289167,3.375283,5.436126,0.0,4.167037,8.846059,90.190264,37.352525,29.482348,92.81
4,0,morocco,4,0.0,4709498.0,4174.251507,1216311.0,28856.068519,156324.509507,970995.518476,...,12.3377,2.992183,6.177415,0.0,3.226093,7.882382,84.012849,38.55185,32.769776,92.81


In [5]:
# Emission targets table for the selected region.
emission_targets_path = os.path.join(CW_DIR_PATH, f"emission_targets_{REGION}_2022.csv")
emission_targets_df = pd.read_csv(emission_targets_path)
emission_targets_df.head()

Unnamed: 0,Subsector,Gas,Edgar_Sector,Edgar_Subsector,Edgar_Subsector_Synthetic,Vars,id,MAR,Edgar_Class
0,agrc,CH4,Agriculture,AG - Crops,AG - Crops,emission_co2e_ch4_agrc_anaerobicdom_rice:emiss...,AG - Crops - CH4,0.026607,AG - Crops:CH4
1,agrc,CO2,Agriculture,AG - Crops,AG - Crops,emission_co2e_co2_agrc_biomass_bevs_and_spices...,AG - Crops - CO2,0.611888,AG - Crops:CO2
2,agrc,N2O,Agriculture,AG - Crops,AG - Crops,emission_co2e_n2o_agrc_biomass_burning:emissio...,AG - Crops - N2O,4.621083,AG - Crops:N2O
3,lvst,CH4,Agriculture,AG - Livestock,AG - Livestock,emission_co2e_ch4_lvst_entferm_buffalo:emissio...,AG - Livestock - CH4,4.577545,AG - Livestock:CH4
4,lsmm,CH4,Agriculture,AG - Livestock,AG - Livestock,emission_co2e_ch4_lsmm_anaerobic_digester:emis...,AG - Livestock - CH4,4.577545,AG - Livestock:CH4


In [6]:
# Keep only the target rows whose Subsector is lndu, frst, or soil,
# renumbering the index from zero.
emission_targets_filtered = (
    emission_targets_df
    .loc[emission_targets_df["Subsector"].isin(["lndu", "frst", "soil"])]
    .reset_index(drop=True)
)
emission_targets_filtered

Unnamed: 0,Subsector,Gas,Edgar_Sector,Edgar_Subsector,Edgar_Subsector_Synthetic,Vars,id,MAR,Edgar_Class
0,lndu,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_ch4_lndu_wetlands,LULUCF - Deforestation - CH4,0.0,LULUCF - Deforestation:CH4
1,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_co2_lndu_conversion_forests_mang...,LULUCF - Deforestation - CO2,0.0,LULUCF - Deforestation:CO2
2,frst,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land,emission_co2e_ch4_frst_methane_mangroves:emiss...,LULUCF - Forest Land - CH4,0.0,LULUCF - Forest Land:CH4
3,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Removals,emission_co2e_co2_entc_bmass_processing_and_re...,LULUCF - Forest Land - CO2,-0.875,LULUCF - Forest Land:CO2
4,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Sequestration,emission_co2e_co2_frst_sequestration_mangroves...,LULUCF - Forest Land - CO2,-0.875,LULUCF - Forest Land:CO2
5,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - HWP,LULUCF - HWP,emission_co2e_co2_frst_harvested_wood_products,LULUCF - HWP - CO2,0.0,LULUCF - HWP:CO2
6,soil,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_co2_soil_lime_use:emission_co2e_...,LULUCF - Organic Soil - CO2,0.0,LULUCF - Organic Soil:CO2
7,soil,N2O,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_n2o_soil_fertilizer:emission_co2...,LULUCF - Organic Soil - N2O,0.0,LULUCF - Organic Soil:N2O
8,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Other Land,LULUCF - Other Land,emission_co2e_co2_lndu_conversion_croplands_to...,LULUCF - Other Land - CO2,0.0,LULUCF - Other Land:CO2


In [7]:
# Keep the output rows for primary_id 0 at time_period 7, renumbering the index from zero.
is_selected_row = output_df["primary_id"].eq(0) & output_df["time_period"].eq(7)
output_filtered = output_df.loc[is_selected_row].reset_index(drop=True)
output_filtered

Unnamed: 0,primary_id,region,time_period,area_agrc_crops_bevs_and_spices,area_agrc_crops_cereals,area_agrc_crops_fibers,area_agrc_crops_fruits,area_agrc_crops_herbs_and_other_perennial_crops,area_agrc_crops_nuts,area_agrc_crops_other_annual,...,yf_agrc_herbs_and_other_perennial_crops_tonne_ha,yf_agrc_nuts_tonne_ha,yf_agrc_other_annual_tonne_ha,yf_agrc_other_woody_perennial_tonne_ha,yf_agrc_pulses_tonne_ha,yf_agrc_rice_tonne_ha,yf_agrc_sugar_cane_tonne_ha,yf_agrc_tubers_tonne_ha,yf_agrc_vegetables_and_vines_tonne_ha,yf_lndu_supremum_pastures_tonne_per_ha
0,0,morocco,7,0.0,4834381.0,4284.941251,1248564.0,29621.252611,160469.80836,996743.666491,...,12.023165,2.949341,6.177415,0.0,3.474771,8.253027,87.719298,39.935028,30.906144,92.81


## Obtain Values

In [8]:
def sum_vars_from_df2(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    vars_col: str = "Vars",
    out_col: str = "df2_total",
    record_missing_col: str | None = "missing_vars",
    df2_filter: dict | None = None,
) -> pd.DataFrame:
    """
    For each row in df1, split the colon-separated strings in `vars_col`,
    find those columns in df2, sum their values (over rows & columns), and
    write the total to `out_col` in df1.

    Parameters
    ----------
    df1 : DataFrame
        Must contain a string column `vars_col` with colon-separated names.
    df2 : DataFrame
        Wide table whose columns include the names referenced by `df1[vars_col]`.
    vars_col : str
        Column in df1 with colon-separated variable names.
    out_col : str
        New column to create in df1 with totals from df2.
    record_missing_col : str | None
        If provided, creates a column listing any missing vars for each row.
    df2_filter : dict | None
        Optional filters to reduce df2 before summing, e.g.
        {"region": "egypt", "time_period": 7}

    Returns
    -------
    DataFrame
        df1 with new column `out_col` (and `record_missing_col` if requested).
    """
    # Optionally filter df2 by key=value pairs (e.g., region/time_period)
    if df2_filter:
        mask = pd.Series(True, index=df2.index)
        for k, v in df2_filter.items():
            mask &= (df2[k] == v)
        df2_view = df2.loc[mask]
    else:
        df2_view = df2

    # Ensure we only operate on numeric data when summing
    numeric_cols = set(df2_view.select_dtypes(include=[np.number]).columns)

    def _total_for_vars(vars_str: str):
        if pd.isna(vars_str) or not str(vars_str).strip():
            return np.nan, []

        # Split, strip, and deduplicate while preserving order
        raw = [s.strip() for s in str(vars_str).split(":") if s.strip()]
        seen = set()
        cols = [c for c in raw if not (c in seen or seen.add(c))]

        present = [c for c in cols if c in df2_view.columns and c in numeric_cols]
        missing = [c for c in cols if c not in df2_view.columns or c not in numeric_cols]

        if not present or df2_view.empty:
            return np.nan, missing

        # Sum over all filtered rows & all present columns
        vals = df2_view[present].to_numpy(dtype=float, copy=False)
        total = np.nansum(vals)
        return float(total), missing

    totals, missings = [], []
    for v in df1[vars_col].astype("string"):
        total, missing = _total_for_vars(v)
        totals.append(total)
        missings.append(missing)

    df1 = df1.copy()
    df1[out_col] = totals
    if record_missing_col is not None:
        df1[record_missing_col] = missings

    return df1


# -----------------------------
# Example usage
# -----------------------------

# If DF2 has a single row for the target (e.g., region="egypt", a specific time_period):
# df2_filter = {"region": "egypt"}              # or {"region": "egypt", "time_period": 7}
# If you want to sum across all rows of DF2, set df2_filter = None.

# df1_result = sum_vars_from_df2(DF1, DF2, vars_col="Vars",
#                                out_col="DF2_total",
#                                record_missing_col="Missing_in_DF2",
#                                df2_filter={"region": "egypt"})
# print(df1_result.head())


In [9]:
# For every filtered target row, total the output columns listed in its `Vars` entry.
emission_targets_extended_result = sum_vars_from_df2(
    emission_targets_filtered,
    output_filtered,
    vars_col="Vars",
    out_col="DF2_total",
    record_missing_col="Missing_in_DF2",
    df2_filter={"region": REGION},
)
emission_targets_extended_result

Unnamed: 0,Subsector,Gas,Edgar_Sector,Edgar_Subsector,Edgar_Subsector_Synthetic,Vars,id,MAR,Edgar_Class,DF2_total,Missing_in_DF2
0,lndu,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_ch4_lndu_wetlands,LULUCF - Deforestation - CH4,0.0,LULUCF - Deforestation:CH4,0.011666,[]
1,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_co2_lndu_conversion_forests_mang...,LULUCF - Deforestation - CO2,0.0,LULUCF - Deforestation:CO2,4.545796,[]
2,frst,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land,emission_co2e_ch4_frst_methane_mangroves:emiss...,LULUCF - Forest Land - CH4,0.0,LULUCF - Forest Land:CH4,0.162925,[]
3,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Removals,emission_co2e_co2_entc_bmass_processing_and_re...,LULUCF - Forest Land - CO2,-0.875,LULUCF - Forest Land:CO2,10.137287,[]
4,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Sequestration,emission_co2e_co2_frst_sequestration_mangroves...,LULUCF - Forest Land - CO2,-0.875,LULUCF - Forest Land:CO2,-12.049629,[]
5,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - HWP,LULUCF - HWP,emission_co2e_co2_frst_harvested_wood_products,LULUCF - HWP - CO2,0.0,LULUCF - HWP:CO2,-2.426299,[]
6,soil,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_co2_soil_lime_use:emission_co2e_...,LULUCF - Organic Soil - CO2,0.0,LULUCF - Organic Soil:CO2,3.09711,[]
7,soil,N2O,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_n2o_soil_fertilizer:emission_co2...,LULUCF - Organic Soil - N2O,0.0,LULUCF - Organic Soil:N2O,17.205768,[]
8,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Other Land,LULUCF - Other Land,emission_co2e_co2_lndu_conversion_croplands_to...,LULUCF - Other Land - CO2,0.0,LULUCF - Other Land:CO2,-2.279446,[]


In [10]:
def apply_totals_to_original(
    original_df: pd.DataFrame,
    totals_df: pd.DataFrame,
    key_cols=("Subsector", "Gas", "Edgar_Class"),
    source_col="DF2_total",
    target_col="EGY",
    keep_old_as="EGY_old",
    on_duplicate=None,
):
    """
    Overwrite `original_df[target_col]` with totals looked up in `totals_df`.

    `totals_df[source_col]` is left-joined onto `original_df` using `key_cols`;
    wherever a non-NA total is found it replaces the existing target value,
    otherwise the existing value is kept.

    Parameters
    ----------
    original_df : DataFrame
        Table to update. It is not modified; a new frame is returned.
    totals_df : DataFrame
        Table providing `key_cols` and `source_col`. Rows whose `source_col`
        is NA are ignored.
    key_cols : sequence of str
        Columns that identify matching rows in both tables.
    source_col : str
        Column of `totals_df` holding the new values.
    target_col : str
        Column of `original_df` to overwrite.
    keep_old_as : str | None
        When truthy, the previous target values are preserved under this name.
    on_duplicate : {None, "last", "first", "sum", "error"}
        What to do when several `totals_df` rows share the same key:
        - None (default): keep the last row, as before, but emit a
          UserWarning because the other rows' totals are discarded.
        - "last" / "first": keep that row, without a warning.
        - "sum": add the totals of all rows sharing the key.
        - "error": raise ValueError.

    Returns
    -------
    DataFrame
        `original_df`'s columns (plus `keep_old_as` when requested) with
        `target_col` updated. The result carries a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `on_duplicate` is not a recognised policy, or if it is "error" and
        duplicate keys are present.
    """
    if on_duplicate not in (None, "last", "first", "sum", "error"):
        raise ValueError(
            "on_duplicate must be None, 'last', 'first', 'sum' or 'error'; "
            f"got {on_duplicate!r}"
        )

    keys = list(key_cols)
    totals = totals_df.dropna(subset=[source_col])[keys + [source_col]]

    # The join needs exactly one total per key; resolve clashes explicitly
    # instead of silently throwing rows away.
    clashing = totals.duplicated(subset=keys, keep=False)
    if clashing.any():
        n_clashing = int(clashing.sum())
        if on_duplicate == "error":
            raise ValueError(
                f"{n_clashing} rows of totals_df share the same {keys} key"
            )
        if on_duplicate is None:
            warnings.warn(
                f"{n_clashing} rows of totals_df share the same {keys} key; "
                "only the last total per key is applied. Pass on_duplicate="
                "'last', 'first', 'sum' or 'error' to choose explicitly.",
                UserWarning,
                stacklevel=2,
            )
        if on_duplicate == "sum":
            # dropna=False keeps keys containing NaN, matching drop_duplicates.
            totals = (
                totals.groupby(keys, sort=False, dropna=False)[source_col]
                .sum()
                .reset_index()
            )
        else:
            totals = totals.drop_duplicates(subset=keys, keep=on_duplicate or "last")

    # Join under a temporary name so an existing `source_col` in original_df
    # neither collides with the incoming values nor gets dropped afterwards.
    helper_col = "__incoming_total__"
    while helper_col in original_df.columns:
        helper_col += "_"
    right = totals.rename(columns={source_col: helper_col})

    out = original_df.merge(right, on=keys, how="left")
    if keep_old_as:
        out[keep_old_as] = out[target_col]

    # Overwrite only where a new total was actually found.
    has_new = out[helper_col].notna()
    out[target_col] = np.where(has_new, out[helper_col], out[target_col])

    return out.drop(columns=[helper_col])

# Example usage:
# updated_df = apply_totals_to_original(original_df=original_df, totals_df=df_with_DF2_total)


In [11]:
# Write the simulated totals into the MAR column, keeping the previous values as MAR_old.
# NOTE(review): rows are matched on (Subsector, Gas, Edgar_Class) only. The two
# "LULUCF - Forest Land:CO2" rows share that key, so both receive the same total
# even though their individual DF2_total values differ — confirm this is intended.
updated_df = apply_totals_to_original(
    original_df=emission_targets_df,
    totals_df=emission_targets_extended_result,
    target_col="MAR",
    keep_old_as="MAR_old",
)
updated_df

Unnamed: 0,Subsector,Gas,Edgar_Sector,Edgar_Subsector,Edgar_Subsector_Synthetic,Vars,id,MAR,Edgar_Class,MAR_old
0,agrc,CH4,Agriculture,AG - Crops,AG - Crops,emission_co2e_ch4_agrc_anaerobicdom_rice:emiss...,AG - Crops - CH4,0.026607,AG - Crops:CH4,0.026607
1,agrc,CO2,Agriculture,AG - Crops,AG - Crops,emission_co2e_co2_agrc_biomass_bevs_and_spices...,AG - Crops - CO2,0.611888,AG - Crops:CO2,0.611888
2,agrc,N2O,Agriculture,AG - Crops,AG - Crops,emission_co2e_n2o_agrc_biomass_burning:emissio...,AG - Crops - N2O,4.621083,AG - Crops:N2O,4.621083
3,lvst,CH4,Agriculture,AG - Livestock,AG - Livestock,emission_co2e_ch4_lvst_entferm_buffalo:emissio...,AG - Livestock - CH4,4.577545,AG - Livestock:CH4,4.577545
4,lsmm,CH4,Agriculture,AG - Livestock,AG - Livestock,emission_co2e_ch4_lsmm_anaerobic_digester:emis...,AG - Livestock - CH4,4.577545,AG - Livestock:CH4,4.577545
...,...,...,...,...,...,...,...,...,...,...
69,waso,CH4,Waste,Waste - Solid Waste,Waste - Solid Waste,emission_co2e_ch4_waso_biogas_food:emission_co...,Waste - Solid Waste - CH4,18.920253,Waste - Solid Waste:CH4,18.920253
70,waso,CO2,Waste,Waste - Solid Waste,Waste - Solid Waste,emission_co2e_co2_waso_incineration,Waste - Solid Waste - CO2,0.000000,Waste - Solid Waste:CO2,0.000000
71,waso,N2O,Waste,Waste - Solid Waste,Waste - Solid Waste,emission_co2e_n2o_waso_compost_food:emission_c...,Waste - Solid Waste - N2O,0.006684,Waste - Solid Waste:N2O,0.006684
72,trww,CH4,Waste,Waste - Wastewater Treatment,Waste - Wastewater Treatment,emission_co2e_ch4_trww_treated_advanced_aerobi...,Waste - Wastewater Treatment - CH4,5.047132,Waste - Wastewater Treatment:CH4,5.047132


## Check results and save

In [12]:
# Show the lndu / frst / soil rows so the new MAR values can be compared with MAR_old.
updated_df.loc[updated_df["Subsector"].isin(["lndu", "frst", "soil"])]


Unnamed: 0,Subsector,Gas,Edgar_Sector,Edgar_Subsector,Edgar_Subsector_Synthetic,Vars,id,MAR,Edgar_Class,MAR_old
59,lndu,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_ch4_lndu_wetlands,LULUCF - Deforestation - CH4,0.011666,LULUCF - Deforestation:CH4,0.0
60,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Deforestation,LULUCF - Deforestation,emission_co2e_co2_lndu_conversion_forests_mang...,LULUCF - Deforestation - CO2,4.545796,LULUCF - Deforestation:CO2,0.0
61,frst,CH4,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land,emission_co2e_ch4_frst_methane_mangroves:emiss...,LULUCF - Forest Land - CH4,0.162925,LULUCF - Forest Land:CH4,0.0
62,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Removals,emission_co2e_co2_entc_bmass_processing_and_re...,LULUCF - Forest Land - CO2,-12.049629,LULUCF - Forest Land:CO2,-0.875
63,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Forest Land,LULUCF - Forest Land Sequestration,emission_co2e_co2_frst_sequestration_mangroves...,LULUCF - Forest Land - CO2,-12.049629,LULUCF - Forest Land:CO2,-0.875
64,frst,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - HWP,LULUCF - HWP,emission_co2e_co2_frst_harvested_wood_products,LULUCF - HWP - CO2,-2.426299,LULUCF - HWP:CO2,0.0
65,soil,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_co2_soil_lime_use:emission_co2e_...,LULUCF - Organic Soil - CO2,3.09711,LULUCF - Organic Soil:CO2,0.0
66,soil,N2O,"Land Use, Land Use Change, and Forestry",LULUCF - Organic Soil,LULUCF - Organic Soil,emission_co2e_n2o_soil_fertilizer:emission_co2...,LULUCF - Organic Soil - N2O,17.205768,LULUCF - Organic Soil:N2O,0.0
67,lndu,CO2,"Land Use, Land Use Change, and Forestry",LULUCF - Other Land,LULUCF - Other Land,emission_co2e_co2_lndu_conversion_croplands_to...,LULUCF - Other Land - CO2,-2.279446,LULUCF - Other Land:CO2,0.0


In [13]:
# Drop the backup column before saving. errors="ignore" keeps this cell
# re-runnable: a second execution finds the column already gone instead of
# raising KeyError.
updated_df = updated_df.drop(columns=["MAR_old"], errors="ignore")

In [14]:
# Save the updated targets beside the input file, without writing the index.
lulucf_update_path = os.path.join(CW_DIR_PATH, f"emission_targets_{REGION}_2022_LULUCF_update.csv")
updated_df.to_csv(lulucf_update_path, index=False)