# Calculate emission factors and confidence intervals

# Imports and Set-up

In [41]:
# Standard Imports
import sys
import pandas as pd

# Google Cloud Imports
import pandas_gbq

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import (
    GCP_PROJ_ID,
    CARBON_STOCK_OUTDIR,
    CARBON_POOLS_OUTDIR,
    PC_PLOT_LOOKUP_CSV,
)

from src.biomass_equations import calculate_statistics

In [3]:
import datetime

# Variables
# Local CSV caches for the inputs; each is read instead of BigQuery when the
# file already exists.
PLOT_INFO_CSV = CARBON_POOLS_OUTDIR / "plot_info.csv"
TREES_CSV = CARBON_STOCK_OUTDIR / "trees_carbon_stock.csv"
DEADWOOD_CSV = CARBON_STOCK_OUTDIR / "deadwood_carbon_stock.csv"
NTV_CSV = CARBON_STOCK_OUTDIR / "ntv_carbon_stock.csv"
LITTER_CSV = CARBON_STOCK_OUTDIR / "litter_carbon_stock.csv"
# NOTE(review): not referenced by any visible cell, and the only
# *_carbon_stock.csv path rooted at CARBON_POOLS_OUTDIR -- confirm intended.
SAPLINGS_CSV = CARBON_POOLS_OUTDIR / "saplings_carbon_stock.csv"

# Version Control
# Date stamp (YYYYMMDD) appended to exported file names and BigQuery tables.
today = datetime.date.today()
VERSION = today.strftime("%Y%m%d")

# BigQuery Variables
SRC_DATASET_ID = "biomass_inventory"  # dataset the input tables are queried from
DATASET_ID = "carbon_stock"  # dataset the result tables are uploaded to
IF_EXISTS = "replace"  # passed as `if_exists` to pandas_gbq.to_gbq

## Load data

### Plot Data

In [4]:
# Load plot metadata, preferring the local CSV cache over BigQuery.
if PLOT_INFO_CSV.exists():
    plot_info = pd.read_csv(PLOT_INFO_CSV)
else:
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.plot_info"""

    # Read the BigQuery table into a dataframe
    plot_info = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing parent directories; make sure the cache
    # directory exists so a fresh checkout does not fail after the query ran.
    PLOT_INFO_CSV.parent.mkdir(parents=True, exist_ok=True)
    plot_info.to_csv(PLOT_INFO_CSV, index=False)

In [5]:
# Inspect column dtypes and non-null counts of the plot metadata.
plot_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  671 non-null    object 
 1   data_recorder              671 non-null    object 
 2   team_no                    671 non-null    int64  
 3   plot_code_nmbr             671 non-null    int64  
 4   plot_type                  671 non-null    object 
 5   sub_plot                   671 non-null    object 
 6   yes_no                     671 non-null    object 
 7   sub_plot_shift             633 non-null    object 
 8   GPS_waypt                  633 non-null    float64
 9   GPS_id                     633 non-null    float64
 10  GPS                        576 non-null    object 
 11  GPS_latitude               576 non-null    float64
 12  GPS_longitude              576 non-null    float64
 13  GPS_altitude               576 non-null    float64

In [6]:
# Load the lookup that provides the Strata for each unique_id.
# This input has no BigQuery fallback, so fail fast with a clear error:
# merely printing would let the following cells die with a NameError on
# `plot_info_lookup` (or silently reuse a stale kernel variable).
if not PC_PLOT_LOOKUP_CSV.exists():
    raise FileNotFoundError(f"PC Plot Lookup CSV not found: {PC_PLOT_LOOKUP_CSV}")
plot_info_lookup = pd.read_csv(PC_PLOT_LOOKUP_CSV)

In [7]:
# Inspect column dtypes and non-null counts of the strata lookup.
plot_info_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3508 entries, 0 to 3507
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   unique_id  3508 non-null   object
 1   Strata     3508 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 54.9+ KB


In [8]:
# Attach the Strata of each unique_id to the plot metadata.
# - Dropping an already-present "Strata" column keeps the cell idempotent:
#   without it, a second run would produce Strata_x / Strata_y and the later
#   selection of "Strata" would fail.
# - Exact duplicate (unique_id, Strata) rows in the lookup are removed so the
#   left merge cannot emit the same subplot more than once because of them.
plot_info = pd.merge(
    plot_info.drop(columns=["Strata"], errors="ignore"),
    plot_info_lookup.drop_duplicates(subset=["unique_id", "Strata"]),
    on="unique_id",
    how="left",
)

### Trees

In [9]:
# Load the tree carbon stock table, preferring the local CSV cache.
if TREES_CSV.exists():
    trees = pd.read_csv(TREES_CSV)
else:
    # NOTE(review): queried from SRC_DATASET_ID, while results are uploaded to
    # DATASET_ID -- confirm this table lives in the source dataset.
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.trees_carbon_stock"""

    # Read the BigQuery table into a dataframe
    trees = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing parent directories; make sure the cache
    # directory exists so a fresh checkout does not fail after the query ran.
    TREES_CSV.parent.mkdir(parents=True, exist_ok=True)
    trees.to_csv(TREES_CSV, index=False)

In [10]:
# Inspect column dtypes and non-null counts of the tree carbon stock table.
trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unique_id                      618 non-null    object 
 1   aboveground_CO2e_per_ha        618 non-null    float64
 2   aboveground_tC_per_ha          618 non-null    float64
 3   belowground_CO2e_per_ha        618 non-null    float64
 4   belowground_tC_per_ha          618 non-null    float64
 5   sapling_CO2e_per_ha            618 non-null    float64
 6   saplings_tC_per_ha             618 non-null    float64
 7   total_aboveground_CO2e_per_ha  618 non-null    float64
dtypes: float64(7), object(1)
memory usage: 38.8+ KB


### Deadwood

In [11]:
# Load the deadwood carbon stock table, preferring the local CSV cache.
if DEADWOOD_CSV.exists():
    deadwood = pd.read_csv(DEADWOOD_CSV)
else:
    # NOTE(review): queried from SRC_DATASET_ID, while results are uploaded to
    # DATASET_ID -- confirm this table lives in the source dataset.
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.deadwood_carbon_stock"""

    # Read the BigQuery table into a dataframe
    deadwood = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing parent directories; make sure the cache
    # directory exists so a fresh checkout does not fail after the query ran.
    DEADWOOD_CSV.parent.mkdir(parents=True, exist_ok=True)
    deadwood.to_csv(DEADWOOD_CSV, index=False)

In [12]:
# Inspect column dtypes and non-null counts of the deadwood table.
deadwood.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   unique_id                    570 non-null    object 
 1   stumps_tonnes_dry_matter_ha  440 non-null    float64
 2   ldw_tonnes_dry_matter_ha     476 non-null    float64
 3   sdw_tonnes_dry_matter_ha     191 non-null    float64
 4   all_tonnes_dry_matter_ha     570 non-null    float64
 5   deadwood_tC_per_ha           570 non-null    float64
 6   deadwood_CO2e_per_ha         570 non-null    float64
dtypes: float64(6), object(1)
memory usage: 31.3+ KB


### Litter

In [13]:
# Load the litter carbon stock table, preferring the local CSV cache.
if LITTER_CSV.exists():
    litter = pd.read_csv(LITTER_CSV)
else:
    # NOTE(review): queried from SRC_DATASET_ID, while results are uploaded to
    # DATASET_ID -- confirm this table lives in the source dataset.
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.litter_carbon_stock"""

    # Read the BigQuery table into a dataframe
    litter = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing parent directories; make sure the cache
    # directory exists so a fresh checkout does not fail after the query ran.
    LITTER_CSV.parent.mkdir(parents=True, exist_ok=True)
    litter.to_csv(LITTER_CSV, index=False)

In [14]:
# Inspect column dtypes and non-null counts of the litter table.
litter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             671 non-null    object 
 1   litter_biomass_kg     619 non-null    float64
 2   litter_kg_dry_matter  619 non-null    float64
 3   litter_CO2e_per_ha    619 non-null    float64
dtypes: float64(3), object(1)
memory usage: 21.1+ KB


### Non-tree Vegetation

In [15]:
# Load the non-tree vegetation carbon stock table, preferring the CSV cache.
if NTV_CSV.exists():
    ntv = pd.read_csv(NTV_CSV)
else:
    # NOTE(review): queried from SRC_DATASET_ID, while results are uploaded to
    # DATASET_ID -- confirm this table lives in the source dataset.
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.ntv_carbon_stock"""

    # Read the BigQuery table into a dataframe
    ntv = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing parent directories; make sure the cache
    # directory exists so a fresh checkout does not fail after the query ran.
    NTV_CSV.parent.mkdir(parents=True, exist_ok=True)
    ntv.to_csv(NTV_CSV, index=False)

In [16]:
# Inspect column dtypes and non-null counts of the non-tree vegetation table.
ntv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   unique_id          671 non-null    object 
 1   ntv_biomass_kg     619 non-null    float64
 2   ntv_kg_dry_matter  619 non-null    float64
 3   ntv_CO2e_per_ha    619 non-null    float64
dtypes: float64(3), object(1)
memory usage: 21.1+ KB


# Create Plot-Level Summary

In [17]:
# One row per plot_info row, carrying the plot code, the Strata and the
# carbon stock columns of every pool. Left joins on unique_id keep rows that
# have no match in a pool table; their pool columns are NaN.
merged_df = (
    plot_info[["unique_id", "plot_code_nmbr", "Strata"]]
    .merge(trees, on="unique_id", how="left")
    .merge(deadwood, on="unique_id", how="left")
    .merge(ntv, on="unique_id", how="left")
    .merge(litter, on="unique_id", how="left")
)

In [18]:
# Inspect the merged table. Left joins keep the row count of plot_info only
# if no joined table repeats a unique_id.
# NOTE(review): the recorded output shows 674 rows against 671 in plot_info,
# so at least one joined table appears to contain duplicate unique_id values
# -- confirm and de-duplicate upstream.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674 entries, 0 to 673
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   unique_id                      674 non-null    object 
 1   plot_code_nmbr                 674 non-null    int64  
 2   Strata                         674 non-null    int64  
 3   aboveground_CO2e_per_ha        618 non-null    float64
 4   aboveground_tC_per_ha          618 non-null    float64
 5   belowground_CO2e_per_ha        618 non-null    float64
 6   belowground_tC_per_ha          618 non-null    float64
 7   sapling_CO2e_per_ha            618 non-null    float64
 8   saplings_tC_per_ha             618 non-null    float64
 9   total_aboveground_CO2e_per_ha  618 non-null    float64
 10  stumps_tonnes_dry_matter_ha    442 non-null    float64
 11  ldw_tonnes_dry_matter_ha       479 non-null    float64
 12  sdw_tonnes_dry_matter_ha       190 non-null    flo

In [19]:
# Preview the first two merged rows.
merged_df.head(2)

Unnamed: 0,unique_id,plot_code_nmbr,Strata,aboveground_CO2e_per_ha,aboveground_tC_per_ha,belowground_CO2e_per_ha,belowground_tC_per_ha,sapling_CO2e_per_ha,saplings_tC_per_ha,total_aboveground_CO2e_per_ha,...,sdw_tonnes_dry_matter_ha,all_tonnes_dry_matter_ha,deadwood_tC_per_ha,deadwood_CO2e_per_ha,ntv_biomass_kg,ntv_kg_dry_matter,ntv_CO2e_per_ha,litter_biomass_kg,litter_kg_dry_matter,litter_CO2e_per_ha
0,308D1,308,2,166.732961,45.472626,60.023866,16.370145,11.98285,3.26805,178.715811,...,,16.323138,7.671875,28.130208,0.8,0.12,8.272,0.8,0.12,6.512
1,308A1,308,2,75.508237,20.593155,27.182965,7.413536,0.0,0.0,75.508237,...,,148.919571,69.992199,256.638062,0.06,0.009,0.6204,0.06,0.009,0.4884


In [20]:
# Number of non-null unique_id values per plot code (the subplot count).
# NOTE(review): counted per plot_code_nmbr only, while the means computed
# later are grouped by (plot_code_nmbr, Strata) -- confirm a plot never spans
# more than one Strata.
plot_count = (
    merged_df.groupby("plot_code_nmbr")["unique_id"]
    .count()
    .reset_index()
)

In [21]:
# Remove the text identifier so only the grouping keys and numeric columns
# remain for the per-plot mean. errors="ignore" makes the cell safe to re-run:
# the previous in-place drop raised KeyError on a second execution.
merged_df = merged_df.drop(columns=["unique_id"], errors="ignore")

In [22]:
# Average every remaining column over the rows of each (plot, Strata) group;
# the grouping keys stay as regular columns.
plot_CO2e_ha = merged_df.groupby(
    ["plot_code_nmbr", "Strata"], as_index=False
).mean()

In [23]:
# Append the per-plot subplot count; the counted column arrives named
# "unique_id" and is exposed as "subplot_count".
plot_CO2e_ha = plot_CO2e_ha.merge(
    plot_count, on="plot_code_nmbr", how="left"
).rename(columns={"unique_id": "subplot_count"})

In [24]:
# Keep only plots that have a value for every pool listed below. dropna
# defaults to how="any", so a plot missing a single pool is removed.
# NOTE(review): if only plots without any recorded data should be dropped,
# this needs how="all" -- confirm the intended rule.
#
# The sapling and aboveground CO2e columns are then removed because both are
# summarized in total_aboveground_CO2e_per_ha.
plot_CO2e_ha = plot_CO2e_ha.dropna(
    subset=[
        "aboveground_CO2e_per_ha",
        "belowground_CO2e_per_ha",
        "deadwood_CO2e_per_ha",
        "ntv_CO2e_per_ha",
        "litter_CO2e_per_ha",
    ]
).drop(columns=["sapling_CO2e_per_ha", "aboveground_CO2e_per_ha"])

In [25]:
# Inspect the plot-level table after filtering.
plot_CO2e_ha.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 0 to 188
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   plot_code_nmbr                 181 non-null    int64  
 1   Strata                         181 non-null    int64  
 2   aboveground_tC_per_ha          181 non-null    float64
 3   belowground_CO2e_per_ha        181 non-null    float64
 4   belowground_tC_per_ha          181 non-null    float64
 5   saplings_tC_per_ha             181 non-null    float64
 6   total_aboveground_CO2e_per_ha  181 non-null    float64
 7   stumps_tonnes_dry_matter_ha    173 non-null    float64
 8   ldw_tonnes_dry_matter_ha       173 non-null    float64
 9   sdw_tonnes_dry_matter_ha       121 non-null    float64
 10  all_tonnes_dry_matter_ha       181 non-null    float64
 11  deadwood_tC_per_ha             181 non-null    float64
 12  deadwood_CO2e_per_ha           181 non-null    float64


## Export plot-level data and upload to BQ

In [26]:
# Refuse to export or upload an empty table.
if len(plot_CO2e_ha) == 0:
    raise ValueError("Dataframe is empty.")

# Date-stamped local CSV copy of the plot-level table.
plot_CO2e_ha.to_csv(
    CARBON_STOCK_OUTDIR / f"plot_emission_factors_{VERSION}.csv", index=False
)

# Upload to BQ
pandas_gbq.to_gbq(
    plot_CO2e_ha,
    f"{DATASET_ID}.plot_emission_factors_{VERSION}",
    project_id=GCP_PROJ_ID,
    if_exists=IF_EXISTS,
    progress_bar=True,
)

100%|██████████| 1/1 [00:00<00:00, 9892.23it/s]


# Create Strata-Level Summary

In [33]:
# Columns whose name contains "CO2e_per_ha".
CO2e_ha_cols = plot_CO2e_ha.filter(like="CO2e_per_ha").columns

# Identifier columns first, followed by the CO2e columns.
# Index.insert is documented to insert a single item; passing it a list only
# works through an implementation detail, so the Index is built with append.
subset_cols = pd.Index(["plot_code_nmbr", "Strata", "subplot_count"]).append(
    CO2e_ha_cols
)

In [34]:
# Independent copy restricted to the identifier and CO2e columns, so later
# changes to `data` do not touch plot_CO2e_ha.
data = plot_CO2e_ha.loc[:, subset_cols].copy()

In [66]:
# Expose the total aboveground column under the plain "aboveground" name used
# by the strata-level statistics.
data = data.rename(
    columns={"total_aboveground_CO2e_per_ha": "aboveground_CO2e_per_ha"}
)

In [None]:
# Preview the first two rows of the strata-level input.
data.head(2)

Unnamed: 0,plot_code_nmbr,Strata,subplot_count,belowground_CO2e_per_ha,total_aboveground_CO2e_per_ha,deadwood_CO2e_per_ha,ntv_CO2e_per_ha,litter_CO2e_per_ha
0,1,1,4,259.191993,755.817271,135.094665,2.8435,2.2385
1,3,1,4,104.337814,365.517107,516.08927,2.55915,2.035


In [68]:
# CO2e columns of `data` to summarise, one per carbon pool.
columns = [
    "belowground_CO2e_per_ha",
    "aboveground_CO2e_per_ha",
    "deadwood_CO2e_per_ha",
    "ntv_CO2e_per_ha",
    "litter_CO2e_per_ha",
]

# One record per (Strata, pool): whatever calculate_statistics returns for
# that group and column, plus the two identifying labels.
results_list = []
for stratum, stratum_rows in data.groupby("Strata"):
    for pool_column in columns:
        record = calculate_statistics(stratum_rows, pool_column)
        record["Strata"] = stratum
        # Pool label = column name up to the first underscore,
        # e.g. "deadwood_CO2e_per_ha" -> "deadwood".
        record["tCO2e_per_ha"] = pool_column.partition("_")[0]
        results_list.append(record)

# Reordering columns for better readability
results_df = pd.DataFrame(results_list)[
    [
        "Strata",
        "tCO2e_per_ha",
        "weighted_mean",
        "confidence_interval_lower",
        "confidence_interval_upper",
        "uncertainty_90",
        "uncertainty_95",
        "margin_of_error",
        "weighted_std",
        "standard_error",
        "standard_error_perc_mean",
    ]
]

In [70]:
# Preview the first two strata-level records.
results_df.head(2)

Unnamed: 0,Strata,tCO2e_per_ha,weighted_mean,confidence_interval_lower,confidence_interval_upper,uncertainty_90,uncertainty_95,margin_of_error,weighted_std,standard_error,standard_error_perc_mean
0,1,belowground,144.767099,136.314343,153.219856,5.838866,6.957437,8.452756,85.528524,5.138911,3.549778
1,1,aboveground,458.834899,434.806932,482.862866,5.236735,6.239954,24.027967,243.125021,14.607967,3.183709


In [71]:
# Order the records by Strata, then alphabetically by pool label.
results_df = results_df.sort_values(by=["Strata", "tCO2e_per_ha"])

In [74]:
# Show the headline statistics only (margin of error, weighted std and
# standard error are left out of this view).
results_df.loc[
    :,
    [
        "Strata",
        "tCO2e_per_ha",
        "weighted_mean",
        "confidence_interval_lower",
        "confidence_interval_upper",
        "uncertainty_90",
        "uncertainty_95",
        "standard_error_perc_mean",
    ],
]

Unnamed: 0,Strata,tCO2e_per_ha,weighted_mean,confidence_interval_lower,confidence_interval_upper,uncertainty_90,uncertainty_95,standard_error_perc_mean
1,1,aboveground,458.834899,434.806932,482.862866,5.236735,6.239954,3.183709
0,1,belowground,144.767099,136.314343,153.219856,5.838866,6.957437,3.549778
2,1,deadwood,662.658117,603.924793,721.391441,8.863292,10.561264,5.388499
4,1,litter,1.392513,1.298771,1.486255,6.73189,8.021542,4.092699
3,1,ntv,1.765882,1.646693,1.885071,6.749545,8.042579,4.103432
6,2,aboveground,317.891018,301.433932,334.348103,5.176958,6.168726,3.147367
5,2,belowground,88.53749,83.306287,93.768693,5.908461,7.040366,3.592089
7,2,deadwood,788.277743,714.787962,861.767523,9.322828,11.108835,5.667877
9,2,litter,1.422163,1.318398,1.525929,7.296335,8.694119,4.435856
8,2,ntv,1.8059,1.674181,1.937619,7.29383,8.691135,4.434334


## Export strata-level data and upload to BQ

In [80]:
# Refuse to export or upload an empty table.
if len(results_df) == 0:
    raise ValueError("Dataframe is empty.")

# Date-stamped local CSV copy of the strata-level table.
results_df.to_csv(
    CARBON_STOCK_OUTDIR / f"strata_emission_factors_{VERSION}.csv", index=False
)

# Upload to BQ
pandas_gbq.to_gbq(
    results_df,
    f"{DATASET_ID}.strata_emission_factors_{VERSION}",
    project_id=GCP_PROJ_ID,
    if_exists=IF_EXISTS,
    progress_bar=True,
)

100%|██████████| 1/1 [00:00<00:00, 4364.52it/s]
