# Calculate emission factors and confidence intervals

# Imports and Set-up

In [1]:
# Standard Imports
import sys
import pandas as pd

# Google Cloud Imports
import pandas_gbq

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import (
    GCP_PROJ_ID,
    CARBON_STOCK_OUTDIR,
    CARBON_POOLS_OUTDIR,
    PC_PLOT_LOOKUP_CSV,
)

from src.biomass_equations import calculate_statistics

In [3]:
import datetime

# Variables
# Local CSV paths. The load cells below read these files when they exist and
# otherwise query BigQuery and write the result to the same path.
PLOT_INFO_CSV = CARBON_POOLS_OUTDIR / "plot_info.csv"
LITTER_CSV = CARBON_STOCK_OUTDIR / "litter_carbon_stock.csv"
NTV_CSV = CARBON_STOCK_OUTDIR / "ntv_carbon_stock.csv"
DEADWOOD_CSV = CARBON_STOCK_OUTDIR / "deadwood_carbon_stock.csv"
TREES_CSV = CARBON_STOCK_OUTDIR / "trees_carbon_stock.csv"
# NOTE(review): SAPLINGS_CSV is not referenced in the visible cells and sits
# under CARBON_POOLS_OUTDIR unlike the other *_carbon_stock files -- confirm.
SAPLINGS_CSV = CARBON_POOLS_OUTDIR / "saplings_carbon_stock.csv"

# Version Control
# VERSION is today's date as YYYYMMDD; it suffixes the exported CSV file names
# and BigQuery table names.
today = datetime.date.today()
VERSION = today.strftime("%Y%m%d")

# BigQuery Variables
SRC_DATASET_ID = "biomass_inventory"  # dataset the load cells query from
DATASET_ID = "carbon_stock"  # dataset the result tables are uploaded to
IF_EXISTS = "replace"  # passed as if_exists to pandas_gbq.to_gbq

## Load data

### Plot Data

In [4]:
# Use the cached CSV when present; otherwise pull the table from BigQuery
# and cache it locally for the next run.
if not PLOT_INFO_CSV.exists():
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.plot_info"""

    plot_info = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)
    plot_info.to_csv(PLOT_INFO_CSV, index=False)
else:
    plot_info = pd.read_csv(PLOT_INFO_CSV)

In [5]:
# Inspect column names, non-null counts and dtypes of the plot table
plot_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  671 non-null    object 
 1   data_recorder              671 non-null    object 
 2   team_no                    671 non-null    int64  
 3   plot_code_nmbr             671 non-null    int64  
 4   plot_type                  671 non-null    object 
 5   sub_plot                   671 non-null    object 
 6   yes_no                     671 non-null    object 
 7   sub_plot_shift             633 non-null    object 
 8   GPS_waypt                  633 non-null    float64
 9   GPS_id                     633 non-null    float64
 10  GPS                        576 non-null    object 
 11  GPS_latitude               576 non-null    float64
 12  GPS_longitude              576 non-null    float64
 13  GPS_altitude               576 non-null    float64

In [6]:
# The unique_id -> Strata lookup has no BigQuery fallback and is required by
# the merge that follows. Fail immediately with a clear error instead of
# printing and leaving `plot_info_lookup` undefined (or stale from an earlier
# kernel run).
if not PC_PLOT_LOOKUP_CSV.exists():
    raise FileNotFoundError(f"PC Plot Lookup CSV not found: {PC_PLOT_LOOKUP_CSV}")

plot_info_lookup = pd.read_csv(PC_PLOT_LOOKUP_CSV)

In [7]:
# Inspect the unique_id -> Strata lookup table
plot_info_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2910 entries, 0 to 2909
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   unique_id  2910 non-null   object
 1   Strata     2910 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.6+ KB


In [8]:
# Attach Strata to every subplot; subplots missing from the lookup keep NaN
plot_info = plot_info.merge(plot_info_lookup, on="unique_id", how="left")

### Trees

In [9]:
# Use the cached CSV when present; otherwise pull the table from BigQuery
# and cache it locally for the next run.
# NOTE(review): the table is queried from SRC_DATASET_ID ("biomass_inventory")
# although a "carbon_stock" dataset is also configured -- confirm the dataset.
if not TREES_CSV.exists():
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.trees_carbon_stock"""

    trees = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)
    trees.to_csv(TREES_CSV, index=False)
else:
    trees = pd.read_csv(TREES_CSV)

In [10]:
# Inspect the tree carbon-stock table
trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519 entries, 0 to 518
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unique_id                      519 non-null    object 
 1   aboveground_CO2e_per_ha        519 non-null    float64
 2   aboveground_tC_per_ha          519 non-null    float64
 3   belowground_CO2e_per_ha        519 non-null    float64
 4   belowground_tC_per_ha          519 non-null    float64
 5   sapling_CO2e_per_ha            519 non-null    float64
 6   saplings_tC_per_ha             519 non-null    float64
 7   total_aboveground_CO2e_per_ha  519 non-null    float64
dtypes: float64(7), object(1)
memory usage: 32.6+ KB


### Deadwood

In [11]:
# Use the cached CSV when present; otherwise pull the table from BigQuery
# and cache it locally for the next run.
# NOTE(review): the table is queried from SRC_DATASET_ID ("biomass_inventory")
# although a "carbon_stock" dataset is also configured -- confirm the dataset.
if not DEADWOOD_CSV.exists():
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.deadwood_carbon_stock"""

    deadwood = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)
    deadwood.to_csv(DEADWOOD_CSV, index=False)
else:
    deadwood = pd.read_csv(DEADWOOD_CSV)

In [12]:
# Inspect the deadwood carbon-stock table
deadwood.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   unique_id                    569 non-null    object 
 1   stumps_tonnes_dry_matter_ha  440 non-null    float64
 2   ldw_tonnes_dry_matter_ha     476 non-null    float64
 3   sdw_tonnes_dry_matter_ha     190 non-null    float64
 4   all_tonnes_dry_matter_ha     569 non-null    float64
 5   deadwood_tC_per_ha           569 non-null    float64
 6   deadwood_CO2e_per_ha         569 non-null    float64
dtypes: float64(6), object(1)
memory usage: 31.2+ KB


### Litter

In [13]:
# Use the cached CSV when present; otherwise pull the table from BigQuery
# and cache it locally for the next run.
# NOTE(review): the table is queried from SRC_DATASET_ID ("biomass_inventory")
# although a "carbon_stock" dataset is also configured -- confirm the dataset.
if not LITTER_CSV.exists():
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.litter_carbon_stock"""

    litter = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)
    litter.to_csv(LITTER_CSV, index=False)
else:
    litter = pd.read_csv(LITTER_CSV)

In [14]:
# Inspect the litter carbon-stock table
litter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             671 non-null    object 
 1   litter_biomass_kg     619 non-null    float64
 2   litter_kg_dry_matter  619 non-null    float64
 3   litter_CO2e_per_ha    619 non-null    float64
dtypes: float64(3), object(1)
memory usage: 21.1+ KB


### Non-tree Vegetation

In [15]:
# Use the cached CSV when present; otherwise pull the table from BigQuery
# and cache it locally for the next run.
# NOTE(review): the table is queried from SRC_DATASET_ID ("biomass_inventory")
# although a "carbon_stock" dataset is also configured -- confirm the dataset.
if not NTV_CSV.exists():
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.ntv_carbon_stock"""

    ntv = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)
    ntv.to_csv(NTV_CSV, index=False)
else:
    ntv = pd.read_csv(NTV_CSV)

In [16]:
# Inspect the non-tree vegetation carbon-stock table
ntv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   unique_id          671 non-null    object 
 1   ntv_biomass_kg     619 non-null    float64
 2   ntv_kg_dry_matter  619 non-null    float64
 3   ntv_CO2e_per_ha    619 non-null    float64
dtypes: float64(3), object(1)
memory usage: 21.1+ KB


# Create plot level summary

In [17]:
# Start from the subplot identifiers and left-join each carbon pool in turn,
# so every subplot is kept even when a pool has no record for it.
merged_df = plot_info[["unique_id", "plot_code_nmbr", "Strata"]]
for pool_df in (trees, deadwood, ntv, litter):
    merged_df = merged_df.merge(pool_df, on="unique_id", how="left")

In [18]:
# Inspect the merged subplot-level table
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unique_id                      671 non-null    object 
 1   plot_code_nmbr                 671 non-null    int64  
 2   Strata                         568 non-null    float64
 3   aboveground_CO2e_per_ha        519 non-null    float64
 4   aboveground_tC_per_ha          519 non-null    float64
 5   belowground_CO2e_per_ha        519 non-null    float64
 6   belowground_tC_per_ha          519 non-null    float64
 7   sapling_CO2e_per_ha            519 non-null    float64
 8   saplings_tC_per_ha             519 non-null    float64
 9   total_aboveground_CO2e_per_ha  519 non-null    float64
 10  stumps_tonnes_dry_matter_ha    440 non-null    float64
 11  ldw_tonnes_dry_matter_ha       476 non-null    float64
 12  sdw_tonnes_dry_matter_ha       190 non-null    flo

In [19]:
# Preview the first two merged subplot rows
merged_df.head(2)

Unnamed: 0,unique_id,plot_code_nmbr,Strata,aboveground_CO2e_per_ha,aboveground_tC_per_ha,belowground_CO2e_per_ha,belowground_tC_per_ha,sapling_CO2e_per_ha,saplings_tC_per_ha,total_aboveground_CO2e_per_ha,...,sdw_tonnes_dry_matter_ha,all_tonnes_dry_matter_ha,deadwood_tC_per_ha,deadwood_CO2e_per_ha,ntv_biomass_kg,ntv_kg_dry_matter,ntv_CO2e_per_ha,litter_biomass_kg,litter_kg_dry_matter,litter_CO2e_per_ha
0,308D1,308,2.0,166.732961,45.472626,60.023866,16.370145,0.119829,0.032681,166.852789,...,,0.456559,0.214583,0.786803,0.8,0.12,8.272,0.8,0.12,6.512
1,308A1,308,,,,,,,,,...,,1.660009,0.780204,2.860748,0.06,0.009,0.6204,0.06,0.009,0.4884


In [20]:
# Number of subplot rows per plot; the count lands in the "unique_id" column.
# NOTE(review): counted per plot only, while the plot means are grouped by
# plot AND Strata -- confirm this is the intended count.
plot_count = merged_df.groupby("plot_code_nmbr")["unique_id"].count().reset_index()

In [21]:
# The subplot identifier is non-numeric and must not enter the group means
merged_df = merged_df.drop(columns=["unique_id"])

In [22]:
# Average the subplot values within each (plot, Strata) pair.
# Rows whose Strata is NaN are left out (pandas groupby default dropna=True).
subplots_by_plot = merged_df.groupby(["plot_code_nmbr", "Strata"])
plot_CO2e_ha = subplots_by_plot.mean().reset_index()

In [23]:
# Attach the per-plot subplot count and give the count column (named
# "unique_id" by the earlier count) a descriptive name.
plot_CO2e_ha = plot_CO2e_ha.merge(
    plot_count, on="plot_code_nmbr", how="left"
).rename(columns={"unique_id": "subplot_count"})

In [24]:
# Keep only rows that have a value for every carbon pool listed here;
# a row missing any one of them is dropped.
required_pools = [
    "aboveground_CO2e_per_ha",
    "belowground_CO2e_per_ha",
    "deadwood_CO2e_per_ha",
    "ntv_CO2e_per_ha",
    "litter_CO2e_per_ha",
]
plot_CO2e_ha = plot_CO2e_ha.dropna(subset=required_pools)

# Saplings and aboveground are already summarized in total_aboveground,
# so their separate columns are removed.
redundant_cols = ["sapling_CO2e_per_ha", "aboveground_CO2e_per_ha"]
plot_CO2e_ha = plot_CO2e_ha.drop(columns=redundant_cols)

In [25]:
# Inspect the plot-level table after filtering
plot_CO2e_ha.info()

<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, 0 to 210
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   plot_code_nmbr                 193 non-null    int64  
 1   Strata                         193 non-null    float64
 2   aboveground_tC_per_ha          193 non-null    float64
 3   belowground_CO2e_per_ha        193 non-null    float64
 4   belowground_tC_per_ha          193 non-null    float64
 5   saplings_tC_per_ha             193 non-null    float64
 6   total_aboveground_CO2e_per_ha  193 non-null    float64
 7   stumps_tonnes_dry_matter_ha    175 non-null    float64
 8   ldw_tonnes_dry_matter_ha       178 non-null    float64
 9   sdw_tonnes_dry_matter_ha       111 non-null    float64
 10  all_tonnes_dry_matter_ha       193 non-null    float64
 11  deadwood_tC_per_ha             193 non-null    float64
 12  deadwood_CO2e_per_ha           193 non-null    float64


## Export data and Upload to BQ

In [26]:
# Write the plot-level table to a dated CSV and upload it to BigQuery.
# An empty table is refused rather than exported.
if len(plot_CO2e_ha) == 0:
    raise ValueError("Dataframe is empty.")

plot_CO2e_ha.to_csv(
    CARBON_STOCK_OUTDIR / f"plot_emission_factors_{VERSION}.csv", index=False
)
pandas_gbq.to_gbq(
    plot_CO2e_ha,
    f"{DATASET_ID}.plot_emission_factors_{VERSION}",
    project_id=GCP_PROJ_ID,
    if_exists=IF_EXISTS,
    progress_bar=True,
)

100%|██████████| 1/1 [00:00<00:00, 7194.35it/s]


# Create Strata Level Summary

In [27]:
# Columns holding per-hectare CO2e values for each carbon pool
CO2e_ha_cols = plot_CO2e_ha.filter(like="CO2e_per_ha").columns
# Identifier columns first, then the CO2e columns. Built as a plain list:
# Index.insert() is documented for a single item, so handing it a list only
# works through incidental behaviour.
subset_cols = ["plot_code_nmbr", "Strata", "subplot_count", *CO2e_ha_cols]

In [28]:
# Work on an independent copy so later edits do not touch plot_CO2e_ha
data = plot_CO2e_ha.loc[:, subset_cols].copy()

In [29]:
# From here on the total aboveground value is simply called "aboveground"
data = data.rename(
    columns={"total_aboveground_CO2e_per_ha": "aboveground_CO2e_per_ha"}
)

In [30]:
# Preview the first two rows of the strata-level input table
data.head(2)

Unnamed: 0,plot_code_nmbr,Strata,subplot_count,belowground_CO2e_per_ha,aboveground_CO2e_per_ha,deadwood_CO2e_per_ha,ntv_CO2e_per_ha,litter_CO2e_per_ha
0,1,1.0,4,303.074187,842.121495,1.625643,0.7755,0.6105
1,1,2.0,4,244.564595,679.741039,3.540323,3.532833,2.781167


In [31]:
# Number of plot rows in each Strata; the count lands in "plot_code_nmbr".
# This rebinds `plot_count`, which earlier held the per-plot subplot count.
plots_per_strata = data.groupby("Strata")["plot_code_nmbr"].count()
plot_count = plots_per_strata.reset_index()

In [32]:
# Carbon pools summarised for each Strata
columns = [
    "belowground_CO2e_per_ha",
    "aboveground_CO2e_per_ha",
    "deadwood_CO2e_per_ha",
    "ntv_CO2e_per_ha",
    "litter_CO2e_per_ha",
]

# Column order of the final table, chosen for readability
ordered_columns = [
    "Strata",
    "tCO2e_per_ha",
    "weighted_mean",
    "confidence_interval_lower",
    "confidence_interval_upper",
    "uncertainty_90",
    "uncertainty_95",
    "margin_of_error",
    "weighted_std",
    "standard_error",
    "standard_error_perc_mean",
]

# One row of statistics per (Strata, pool) pair. calculate_statistics is a
# project helper; its result is used here as a dict-like record.
results_list = []
for strata, group in data.groupby("Strata"):
    for column in columns:
        stats = calculate_statistics(group, column)
        stats["Strata"] = strata
        # The pool name is the text before the first underscore
        stats["tCO2e_per_ha"] = column.partition("_")[0]
        results_list.append(stats)

results_df = pd.DataFrame(results_list)[ordered_columns]

In [33]:
# Preview the first two rows of the per-pool statistics
results_df.head(2)

Unnamed: 0,Strata,tCO2e_per_ha,weighted_mean,confidence_interval_lower,confidence_interval_upper,uncertainty_90,uncertainty_95,margin_of_error,weighted_std,standard_error,standard_error_perc_mean
0,1.0,belowground,179.37325,163.982878,194.763623,8.580082,10.223799,15.390373,85.243432,9.356682,5.21632
1,1.0,aboveground,499.279664,456.692287,541.867042,8.529764,10.163841,42.587377,235.880851,25.891287,5.185728


In [34]:
# Order rows by Strata, then alphabetically by pool name
results_df = results_df.sort_values(by=["Strata", "tCO2e_per_ha"])

In [35]:
# Show only the headline statistics for each (Strata, pool) pair
summary_columns = [
    "Strata",
    "tCO2e_per_ha",
    "weighted_mean",
    "confidence_interval_lower",
    "confidence_interval_upper",
    "uncertainty_90",
    "uncertainty_95",
    "standard_error_perc_mean",
]
results_df[summary_columns]

Unnamed: 0,Strata,tCO2e_per_ha,weighted_mean,confidence_interval_lower,confidence_interval_upper,uncertainty_90,uncertainty_95,standard_error_perc_mean
1,1.0,aboveground,499.279664,456.692287,541.867042,8.529764,10.163841,5.185728
0,1.0,belowground,179.37325,163.982878,194.763623,8.580082,10.223799,5.21632
2,1.0,deadwood,8.789399,7.764351,9.814447,11.662324,13.896516,7.09019
4,1.0,litter,1.3075,1.156398,1.458601,11.55651,13.770431,7.025859
3,1.0,ntv,1.651327,1.457983,1.844671,11.708416,13.951438,7.118211
6,2.0,aboveground,313.727324,297.483437,329.97121,5.177709,6.16962,3.147823
5,2.0,belowground,112.697749,106.849317,118.546181,5.189484,6.183652,3.154982
7,2.0,deadwood,9.97695,9.286511,10.66739,6.920344,8.246099,4.207271
9,2.0,litter,1.359433,1.282333,1.436534,5.671548,6.758067,3.448057
8,2.0,ntv,1.725878,1.627951,1.823805,5.674048,6.761045,3.449576


## Get confidence intervals by Strata (all pools combined)

In [36]:
# Carbon pools that make up the combined total
columns = [
    "belowground_CO2e_per_ha",
    "aboveground_CO2e_per_ha",
    "deadwood_CO2e_per_ha",
    "ntv_CO2e_per_ha",
    "litter_CO2e_per_ha",
]
# Row-wise sum across the pool columns gives the all-pools value per row
data["all_CO2e_per_ha"] = data.loc[:, columns].sum(axis="columns")

In [46]:
# Statistics of the combined (all pools) CO2e value for each Strata.
total_column = "all_CO2e_per_ha"

strata_list = []

for strata, group in data.groupby("Strata"):
    # calculate_statistics is a project helper; its result is used here as a
    # dict-like record.
    stats = calculate_statistics(group, total_column)
    stats["Strata"] = strata
    # Label the row from the column actually summarised ("all"). The previous
    # code read `column`, a variable left over from an earlier loop, so the
    # label depended on cell execution order.
    stats["tCO2e_per_ha"] = total_column.split("_")[0]
    strata_list.append(stats)

strata_df = pd.DataFrame(strata_list)

# Reordering columns for better readability
strata_df = strata_df[
    [
        "Strata",
        "tCO2e_per_ha",
        "weighted_mean",
        "confidence_interval_lower",
        "confidence_interval_upper",
        "uncertainty_90",
        "uncertainty_95",
        "margin_of_error",
        "weighted_std",
        "standard_error",
        "standard_error_perc_mean",
    ]
]

In [52]:
# Display the strata-level statistics table
strata_df

Unnamed: 0,Strata,tCO2e_per_ha,weighted_mean,confidence_interval_lower,confidence_interval_upper,uncertainty_90,uncertainty_95,margin_of_error,weighted_std,standard_error,standard_error_perc_mean,plot_code_nmbr
0,1.0,all,690.401141,632.738875,748.063407,8.351995,9.952015,57.662266,319.376896,35.056169,5.077652,22
1,2.0,all,439.487334,417.379622,461.595046,5.030341,5.994021,22.107712,310.298836,13.440535,3.05823,147
2,3.0,all,288.35058,256.459146,320.242015,11.059952,13.178745,31.891435,169.026031,19.388616,6.723973,20
3,4.0,all,-1039.654993,-1039.654993,-1039.654993,-0.0,-0.0,0.0,0.0,0.0,-0.0,1
4,5.0,all,-209.006951,-287.143822,-130.870079,-37.38482,-44.54676,78.136871,134.361163,47.503845,-22.728356,2
5,6.0,all,-775.372414,-775.372414,-775.372414,-0.0,-0.0,0.0,0.0,0.0,-0.0,1


In [50]:
# Join on the columns the two frames share (no explicit key is given).
# This adds "plot_code_nmbr", which in plot_count holds a count of plots
# per Strata, not a plot code.
strata_df = pd.merge(strata_df, plot_count)

In [51]:
# Inspect the strata-level table after adding the plot count
strata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Strata                     6 non-null      float64
 1   tCO2e_per_ha               6 non-null      object 
 2   weighted_mean              6 non-null      float64
 3   confidence_interval_lower  6 non-null      float64
 4   confidence_interval_upper  6 non-null      float64
 5   uncertainty_90             6 non-null      float64
 6   uncertainty_95             6 non-null      float64
 7   margin_of_error            6 non-null      float64
 8   weighted_std               6 non-null      float64
 9   standard_error             6 non-null      float64
 10  standard_error_perc_mean   6 non-null      float64
 11  plot_code_nmbr             6 non-null      int64  
dtypes: float64(10), int64(1), object(1)
memory usage: 708.0+ bytes


## Export data and Upload to BQ

In [None]:
# Write the per-pool strata statistics to a dated CSV and upload them to
# BigQuery. An empty table is refused rather than exported.
# NOTE(review): only results_df is exported; the all-pools strata_df is not
# written anywhere in the visible cells -- confirm that is intended.
if len(results_df) == 0:
    raise ValueError("Dataframe is empty.")

results_df.to_csv(
    CARBON_STOCK_OUTDIR / f"strata_emission_factors_{VERSION}.csv", index=False
)
pandas_gbq.to_gbq(
    results_df,
    f"{DATASET_ID}.strata_emission_factors_{VERSION}",
    project_id=GCP_PROJ_ID,
    if_exists=IF_EXISTS,
    progress_bar=True,
)