# Preprocess ODK data to organized tables

This script downloads the biomass inventory data collected on the ground using ODK and processes it to extract individual measurements for each carbon pool, per plot

# Imports and Set-up

In [1]:
# Standard Imports
import sys
import urllib.request
import pandas as pd
import numpy as np
from math import atan

# geospatial imports
import geopandas as gpd

# Google Cloud Imports
import pandas_gbq

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR, GCP_PROJ_ID, CARBON_POOLS_OUTDIR
from src.odk_data_parsing import (
    extract_trees,
    extract_stumps,
    extract_dead_trees_class1,
    extract_dead_trees_class2s,
    extract_dead_trees_class2t,
    extract_ldw_with_hollow,
    extract_ldw_wo_hollow,
)

In [3]:
# Variables
URL = "https://api.ona.io/api/v1/data/763932.csv"
FILE_RAW = DATA_DIR / "csv" / "biomass_inventory_raw.csv"
NESTS = [2, 3, 4]

# BigQuery Variables
DATASET_ID = "biomass_inventory"
IF_EXISTS = "replace"

## Get Data from ONA

In [4]:
column_types = {
    col: str
    for col in (
        28,
        399,
        400,
        407,
        408,
        415,
        416,
        845,
        846,
        853,
        854,
        861,
        862,
        869,
        870,
        877,
        878,
        885,
        886,
        893,
        894,
        901,
        902,
        909,
        910,
        1179,
        1180,
        1187,
        1188,
        1195,
        1196,
        1203,
        1204,
        1211,
        1212,
        1219,
        1220,
        1286,
        1337,
        1342,
        1347,
        1352,
        1357,
        1362,
        1378,
        1392,
    )
}

In [5]:
if FILE_RAW.exists():
    data = pd.read_csv(FILE_RAW, dtype=column_types)
else:
    urllib.request.urlretrieve(URL, FILE_RAW)
    data = pd.read_csv(FILE_RAW, dtype=column_types)

## Add a unique ID

In [6]:
plot_types = {"primary": 1, "backup": 2}

In [7]:
# Create a new column with "1" for Primary and "2" for Backup
data["plot_type_short"] = data["plot_info/plot_type"].replace(plot_types)

# Extract subplot letters (assuming they are included in the 'plot_info.sub_plot' column)
data["subplot_letter"] = data["plot_info/sub_plot"].str.replace("sub_plot", "")

# Create the unique ID by concatenating the specified columns
data["unique_id"] = (
    data["plot_info/plot_code_nmbr"].astype(str)
    + data["subplot_letter"]
    + data["plot_type_short"].astype(str)
)

  data["plot_type_short"] = data["plot_info/plot_type"].replace(plot_types)


## Check for duplicate plot IDs

In [8]:
data[
    data.unique_id.isin(
        data.loc[data.duplicated(subset="unique_id"), "unique_id"].unique()
    )
].sort_values("unique_id")

Unnamed: 0,start,end,today,deviceid,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,...,_version,_duration,_submitted_by,_total_media,_media_count,_media_all_received,_xform_id,plot_type_short,subplot_letter,unique_id
397,2024-01-25T13:26:36.663+08:00,2024-01-25T20:16:28.902+08:00,2024-01-25,collect:0WR9wt7aHF26K47U,Rodney,5,17,backup,sub_plotD,yes,...,13,24592.0,clearwind,2.0,2.0,True,763932.0,2,D,17D2
394,2024-01-25T09:56:23.141+08:00,2024-01-25T19:33:21.013+08:00,2024-01-25,collect:0WR9wt7aHF26K47U,Rodney,5,17,backup,sub_plotD,yes,...,13,34618.0,clearwind,2.0,2.0,True,763932.0,2,D,17D2
391,2024-01-25T11:36:54.122+08:00,2024-01-25T12:16:15.249+08:00,2024-01-25,collect:iLqyO6ooVrqKuBLN,Christmel,4,198,primary,sub_plotA,yes,...,13,2361.0,clearwind,2.0,2.0,True,763932.0,1,A,198A1
475,2024-02-09T13:26:18.048+08:00,2024-02-09T13:28:17.765+08:00,2024-02-09,collect:PidmSuC0Q54OLneq,Sham,2,198,primary,sub_plotA,no,...,13,119.0,clearwind,0.0,0.0,True,763932.0,1,A,198A1
228,2023-12-02T10:54:21.245+08:00,2023-12-03T15:23:56.573+08:00,2023-12-02,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotA,yes,...,13,102575.0,clearwind,2.0,2.0,True,763932.0,1,A,224A1
221,2023-12-01T14:18:02.307+08:00,2023-12-01T14:30:26.364+08:00,2023-12-01,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotA,yes,...,13,744.0,clearwind,2.0,2.0,True,763932.0,1,A,224A1
230,2023-12-02T13:15:53.287+08:00,2023-12-03T15:24:09.825+08:00,2023-12-02,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotB,yes,...,13,94096.0,clearwind,2.0,2.0,True,763932.0,1,B,224B1
220,2023-12-01T13:06:16.217+08:00,2023-12-01T13:23:39.449+08:00,2023-12-01,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotB,yes,...,13,1043.0,clearwind,2.0,2.0,True,763932.0,1,B,224B1
219,2023-12-01T10:49:57.082+08:00,2023-12-01T11:25:20.448+08:00,2023-12-01,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotC,yes,...,13,2123.0,clearwind,2.0,2.0,True,763932.0,1,C,224C1
227,2023-12-02T09:57:49.379+08:00,2023-12-03T15:23:50.879+08:00,2023-12-02,collect:49ETbl4W1cfU6fo2,Steve,1,224,primary,sub_plotC,yes,...,13,105961.0,clearwind,2.0,2.0,True,763932.0,1,C,224C1


### Assign corrected plot IDs to each duplicate
The subset of duplicates were manually inspected to correct the issue. The common source of the duplicates were typo errors in the plot ID or sublot letter, there were other instances that abandoned subplots persisted in the dataset

In [9]:
# load dataframe with manually annotated plot id corrections
plot_id_corrections = gpd.read_file(
    DATA_DIR / "gpkg" / "duplicate_plots_corrected.gpkg"
)

In [10]:
plot_id_corrections.head(2)

Unnamed: 0,field_1,unique_id,data_recorder,team_no,plot_code_nmbr,plot_type,sub_plot,yes_no,sub_plot_shift,GPS_waypt,...,slope,canopy_avg_height,canopy_cover,access_reason,slope_radians,corrected_plot_area_n2_m2,corrected_plot_area_n3_m2,corrected_plot_area_n4_m2,unique_id_updated,geometry
0,397,17D2,Rodney,5,17,backup,sub_plotD,True,no_shift,10.0,...,23.0,29.0,3.0,,0.226068,82.694573,744.251154,1323.113162,17C2,POINT (126.04161 9.16407)
1,394,17D2,Rodney,5,17,backup,sub_plotD,True,no_shift,99.0,...,9.0,30.0,3.0,,0.089758,79.175989,712.5839,1266.815822,17D2,POINT (126.03970 9.16615)


In [11]:
# Drop rows where the geometry is empty, this indicates that these plots were abandoned
plot_id_corrections = plot_id_corrections[~plot_id_corrections.geometry.is_empty].copy()

In [12]:
plot_id_corrections = plot_id_corrections[
    plot_id_corrections["unique_id_updated"] != "flag"
]

In [13]:
plot_id_corrections.shape

(26, 34)

In [14]:
# create a uuid to match duplicates from the original data
plot_id_corrections["uuid"] = (
    plot_id_corrections["unique_id"]
    + plot_id_corrections["slope"].astype(str)
    + plot_id_corrections["team_no"].astype(str)
)

In [15]:
plot_id_corrections.uuid.nunique()

26

In [16]:
# define a dictionary to map the updated unique_id to the uuid
uuid_dict = (
    plot_id_corrections[["unique_id_updated", "uuid"]]
    .set_index("uuid")
    .to_dict()["unique_id_updated"]
)

In [17]:
uuid_dict

{'17D223.05': '17C2',
 '17D29.05': '17D2',
 '198A115.04': '197A1',
 '224A134.01': '224A1',
 '224A110.01': '223A1',
 '224B14.01': '224B1',
 '224B136.01': '223B1',
 '224C145.01': '223C1',
 '224C13.01': '224C1',
 '224D136.01': '224D1',
 '224D14.01': '223D1',
 '266D126.05': '226D1',
 '293A134.01': '293A1',
 '293A122.01': '293B1',
 '303B261.04': '303B2',
 '303B224.04': '303C2',
 '330C155.01': '330C1',
 '334A151.01': '334A1',
 '343B138.05': '343B1',
 '343B111.05': '343D1',
 '366A162.05': '336A1',
 '366A137.01': '366A1',
 '66D124.05': '66D1',
 '66D138.04': '297D1',
 '71B119.05': '71B1',
 '71B113.05': '71D1'}

In [18]:
data["uuid"] = (
    data["unique_id"]
    + data["slope/slope"].astype(str)
    + data["plot_info/team_no"].astype(str)
)

In [19]:
data["unique_id_updated"] = data["uuid"].map(uuid_dict).fillna(np.nan)

In [20]:
# update the unique_id where it is necessary
data["unique_id"] = data["unique_id_updated"].fillna(data["unique_id"])

In [21]:
# check for remaining duplicates
duplicates = data[
    data.unique_id.isin(
        data.loc[data.duplicated(subset="unique_id"), "unique_id"].unique()
    )
].sort_values("unique_id")

In [22]:
# these are duplicate entries where both abandoned and active plots
# are present, drop those without an updated unique_id
duplicates

Unnamed: 0,start,end,today,deviceid,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,...,_submitted_by,_total_media,_media_count,_media_all_received,_xform_id,plot_type_short,subplot_letter,unique_id,uuid,unique_id_updated
171,2023-11-15T12:36:54.645+08:00,2023-11-15T12:38:43.325+08:00,2023-11-15,collect:iLqyO6ooVrqKuBLN,Christmel,4,303,backup,sub_plotB,no,...,clearwind,0.0,0.0,True,763932.0,2,B,303B2,303B2nan4,
172,2023-11-15T14:36:08.751+08:00,2023-11-15T15:06:03.264+08:00,2023-11-15,collect:iLqyO6ooVrqKuBLN,Christmel,4,303,backup,sub_plotB,yes,...,clearwind,2.0,2.0,True,763932.0,2,B,303B2,303B261.04,303B2
122,2023-10-23T10:29:04.246+08:00,2023-10-23T10:58:03.321+08:00,2023-10-23,collect:49ETbl4W1cfU6fo2,Steve,1,330,primary,sub_plotC,yes,...,clearwind,2.0,2.0,True,763932.0,1,C,330C1,330C155.01,330C1
128,2023-10-26T13:28:15.654+08:00,2023-10-26T13:34:48.967+08:00,2023-10-26,collect:49ETbl4W1cfU6fo2,Steve,1,330,primary,sub_plotC,no,...,clearwind,0.0,0.0,True,763932.0,1,C,330C1,330C1nan1,
154,2023-11-14T11:55:58.545+08:00,2023-11-14T12:04:33.726+08:00,2023-11-14,collect:49ETbl4W1cfU6fo2,Steve,1,334,primary,sub_plotA,yes,...,clearwind,2.0,2.0,True,763932.0,1,A,334A1,334A151.01,334A1
155,2023-11-14T13:58:59.204+08:00,2023-11-14T14:08:21.656+08:00,2023-11-14,collect:49ETbl4W1cfU6fo2,Steve,1,334,primary,sub_plotA,yes,...,clearwind,2.0,2.0,True,763932.0,1,A,334A1,334A165.01,


In [23]:
# save uuid to a list, do not drop here yet since it appears that these rows contain data
duplicates_drop = duplicates.loc[duplicates["unique_id_updated"].isna(), "uuid"]

# Extract Plot info

In [24]:
plot_info_cols = [
    "unique_id",
    "plot_info/data_recorder",
    "plot_info/team_no",
    "plot_info/plot_code_nmbr",
    "plot_info/plot_type",
    "plot_info/sub_plot",
    "plot_info/yes_no",
    "plot_shift/sub_plot_shift",
    "plot_GPS/GPS_waypt",
    "plot_GPS/GPS_id",
    "plot_GPS/GPS",
    "plot_GPS/_GPS_latitude",
    "plot_GPS/_GPS_longitude",
    "plot_GPS/_GPS_altitude",
    "plot_GPS/_GPS_precision",
    "plot_GPS/photo",
    "access/access_reason/slope",
    "access/access_reason/danger",
    "access/access_reason/distance",
    "access/access_reason/water",
    "access/access_reason/prohibited",
    "access/access_reason/other",
    "access/manual_reason",
    "lc_data/lc_type",
    "lc_class/lc_class",
    "lc_class/lc_class_other",
    "disturbance/disturbance_yesno",
    "disturbance_data/disturbance_type",
    "disturbance_class/disturbance_class",
    "slope/slope",
    "canopy/avg_height",
    "canopy/can_cov",
    "uuid",
]

In [25]:
plot_info = data[plot_info_cols].copy()

In [26]:
# rename columns
plot_info_cols = {
    "plot_info/data_recorder": "data_recorder",
    "plot_info/team_no": "team_no",
    "plot_info/plot_code_nmbr": "plot_code_nmbr",
    "plot_info/plot_type": "plot_type",
    "plot_info/sub_plot": "sub_plot",
    "plot_info/yes_no": "yes_no",
    "plot_shift/sub_plot_shift": "sub_plot_shift",
    "plot_GPS/GPS_waypt": "GPS_waypt",
    "plot_GPS/GPS_id": "GPS_id",
    "plot_GPS/GPS": "GPS",
    "plot_GPS/_GPS_latitude": "GPS_latitude",
    "plot_GPS/_GPS_longitude": "GPS_longitude",
    "plot_GPS/_GPS_altitude": "GPS_altitude",
    "plot_GPS/_GPS_precision": "GPS_precision",
    "plot_GPS/photo": "photo",
    "access/access_reason/slope": "access_reason_slope",
    "access/access_reason/danger": "access_reason_danger",
    "access/access_reason/distance": "access_reason_distance",
    "access/access_reason/water": "access_reason_water",
    "access/access_reason/prohibited": "access_reason_prohibited",
    "access/access_reason/other": "access_reason_other",
    "access/manual_reason": "manual_reason",
    "lc_data/lc_type": "lc_type",
    "lc_class/lc_class": "lc_class",
    "lc_class/lc_class_other": "lc_class_other",
    "disturbance/disturbance_yesno": "disturbance_yesno",
    "disturbance_data/disturbance_type": "disturbance_type",
    "disturbance_class/disturbance_class": "disturbance_class",
    "slope/slope": "slope",
    "canopy/avg_height": "canopy_avg_height",
    "canopy/can_cov": "canopy_cover",
}

In [27]:
plot_info.rename(columns=plot_info_cols, inplace=True)

In [28]:
# drop duplicate plot id here since remaining duplicates
# have empty geometry
plot_info = plot_info[~plot_info["uuid"].isin(duplicates_drop)].copy()

In [29]:
plot_info.drop(columns=["uuid"], inplace=True)

### Set correct data types

In [30]:
column_types = {
    "unique_id": str,
    "data_recorder": str,
    "team_no": int,
    "plot_code_nmbr": int,
    "plot_type": str,
    "sub_plot": str,
    "yes_no": str,
    "sub_plot_shift": str,
    "GPS": str,
    "photo": str,
    "access_reason_slope": str,
    "access_reason_danger": str,
    "access_reason_distance": str,
    "access_reason_water": str,
    "access_reason_prohibited": str,
    "access_reason_other": str,
    "manual_reason": str,
    "lc_type": str,
    "lc_class": str,
    "lc_class_other": str,
    "disturbance_yesno": str,
    "disturbance_type": str,
    "disturbance_class": str,
}

In [31]:
plot_info = plot_info.astype(column_types)

### Compress access reasons to one column

In [32]:
plot_info["access_reason"] = np.nan
plot_info["access_reason"] = plot_info["access_reason"].astype(str)
for index, row in plot_info.iterrows():
    if row["access_reason_slope"] == "True":
        plot_info.loc[index, "access_reason"] = "slope"
    elif row["access_reason_danger"] == "True":
        plot_info.loc[index, "access_reason"] = "danger"
    elif row["access_reason_distance"] == "True":
        plot_info.loc[index, "access_reason"] = "distance"
    elif row["access_reason_water"] == "True":
        plot_info.loc[index, "access_reason"] = "water"
    elif row["access_reason_prohibited"] == "True":
        plot_info.loc[index, "access_reason"] = "prohibited"
    elif row["access_reason_other"] == "True":
        plot_info.loc[index, "access_reason"] = row["manual_reason"]

In [33]:
# Categorize manual reasons
plot_info.loc[plot_info.access_reason == "90 degree slope ", "access_reason"] = "slope"
plot_info.loc[
    plot_info.access_reason == "Slippery due to rainfall and sharp stones..too risky.",
    "access_reason",
] = "danger"
plot_info.loc[
    plot_info.access_reason == "Creek plot and slope 90 degree", "access_reason"
] = "slope"
plot_info.loc[
    plot_info.access_reason == "Near creek 90 degree slope", "access_reason"
] = "slope"

In [34]:
# drop access_reason columns
plot_info.drop(
    columns=[
        "access_reason_slope",
        "access_reason_danger",
        "access_reason_distance",
        "access_reason_water",
        "access_reason_prohibited",
        "access_reason_other",
    ],
    inplace=True,
)

In [35]:
plot_info.access_reason.value_counts()

access_reason
nan       633
slope      29
danger      9
Name: count, dtype: int64

## Calculate corrected plot area

In [36]:
# Convert slope from percentage to radians
plot_info["slope_radians"] = plot_info["slope"].apply(lambda x: atan(x / 100))

In [37]:
# Calculate corrected radius based on slope (in radians)
corrected_radius_n2 = 5 / np.cos(plot_info["slope_radians"])
corrected_radius_n3 = 15 / np.cos(plot_info["slope_radians"])
corrected_radius_n4 = 20 / np.cos(plot_info["slope_radians"])

In [38]:
# Calculate new total subplot area based on corrected radius
plot_info["corrected_plot_area_n2_m2"] = np.pi * corrected_radius_n2**2
plot_info["corrected_plot_area_n3_m2"] = np.pi * corrected_radius_n3**2
plot_info["corrected_plot_area_n4_m2"] = np.pi * corrected_radius_n4**2

In [39]:
plot_info.info(), plot_info.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 671 entries, 0 to 673
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   unique_id                  671 non-null    object 
 1   data_recorder              671 non-null    object 
 2   team_no                    671 non-null    int64  
 3   plot_code_nmbr             671 non-null    int64  
 4   plot_type                  671 non-null    object 
 5   sub_plot                   671 non-null    object 
 6   yes_no                     671 non-null    object 
 7   sub_plot_shift             671 non-null    object 
 8   GPS_waypt                  633 non-null    float64
 9   GPS_id                     633 non-null    float64
 10  GPS                        671 non-null    object 
 11  GPS_latitude               576 non-null    float64
 12  GPS_longitude              576 non-null    float64
 13  GPS_altitude               576 non-null    float64
 14 

(None,
   unique_id data_recorder  team_no  plot_code_nmbr plot_type   sub_plot  \
 0     308D1         Steve        1             308   primary  sub_plotD   
 1     308A1         Steve        1             308   primary  sub_plotA   
 
   yes_no sub_plot_shift  GPS_waypt  GPS_id  ... disturbance_type  \
 0    yes       no_shift        7.0     1.0  ...              nan   
 1    yes       no_shift        8.0     1.0  ...              nan   
 
    disturbance_class  slope  canopy_avg_height  canopy_cover access_reason  \
 0                nan   23.0               12.0           3.0           nan   
 1                nan   13.0                8.0           4.0           nan   
 
   slope_radians corrected_plot_area_n2_m2 corrected_plot_area_n3_m2  \
 0      0.226068                 82.694573                744.251154   
 1      0.129275                 79.867139                718.804253   
 
   corrected_plot_area_n4_m2  
 0               1323.113162  
 1               1277.874228  
 
 [

## Export data and upload to BQ

In [40]:
# Export CSV
if len(plot_info) != 0:
    plot_info.to_csv(CARBON_POOLS_OUTDIR / "plot_info.csv", index=False)

In [41]:
# Upload to BQ
if len(plot_info) != 0:
    pandas_gbq.to_gbq(
        plot_info,
        f"{DATASET_ID}.plot_info",
        project_id=GCP_PROJ_ID,
        if_exists=IF_EXISTS,
    )

100%|██████████| 1/1 [00:00<00:00, 9000.65it/s]


# Extract info per carbon pool

# Saplings, Non tree vegetation and litter

In [81]:
cols = [
    "unique_id",
    "sapling_data/count_saplings",
    "ntv_data/litter_data/litter_bag_weight",
    "ntv_data/litter_data/litter_sample_weight",
    "ntv_data/ntv_bag_weight",
    "ntv_data/ntv_sample_weight",
    "slope/slope",
    "plot_info/team_no",
]

In [82]:
# rename columns
col_names = {
    "sapling_data/count_saplings": "count_saplings",
    "ntv_data/litter_data/litter_bag_weight": "litter_bag_weight",
    "ntv_data/litter_data/litter_sample_weight": "litter_sample_weight",
    "ntv_data/ntv_bag_weight": "ntv_bag_weight",
    "ntv_data/ntv_sample_weight": "ntv_sample_weight",
}

In [83]:
ntv = data[cols].copy()

In [84]:
ntv.rename(columns=col_names, inplace=True)

## remove duplicates

In [86]:
ntv["uuid"] = (
    ntv["unique_id"]
    + ntv["slope/slope"].astype(str)
    + ntv["plot_info/team_no"].astype(str)
)

In [87]:
# drop duplicate plot id here since remaining duplicates
# have empty geometry
ntv = ntv[~ntv["uuid"].isin(duplicates_drop)].copy()

In [90]:
ntv[ntv.duplicated(subset="unique_id")]

Unnamed: 0,unique_id,count_saplings,litter_bag_weight,litter_sample_weight,ntv_bag_weight,ntv_sample_weight,slope/slope,plot_info/team_no,uuid


In [91]:
ntv.drop(columns=["uuid", "slope/slope", "plot_info/team_no"], inplace=True)

In [93]:
ntv.info(), ntv.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 671 entries, 0 to 673
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             671 non-null    object 
 1   count_saplings        589 non-null    float64
 2   litter_bag_weight     619 non-null    float64
 3   litter_sample_weight  619 non-null    float64
 4   ntv_bag_weight        619 non-null    float64
 5   ntv_sample_weight     619 non-null    float64
dtypes: float64(5), object(1)
memory usage: 36.7+ KB


(None,
   unique_id  count_saplings  litter_bag_weight  litter_sample_weight  \
 0     308D1             2.0               70.0                 770.0   
 1     308A1             NaN               50.0                 260.0   
 
    ntv_bag_weight  ntv_sample_weight  
 0            70.0              870.0  
 1            50.0              110.0  )

## Export data and upload to BQ

In [95]:
# Export CSV
if len(ntv) != 0:
    ntv.to_csv(CARBON_POOLS_OUTDIR / "saplings_ntv_litter.csv", index=False)

In [96]:
# Upload to BQ
if len(ntv) != 0:
    pandas_gbq.to_gbq(
        ntv,
        f"{DATASET_ID}.saplings_ntv_litter",
        project_id=GCP_PROJ_ID,
        if_exists=IF_EXISTS,
    )

100%|██████████| 1/1 [00:00<00:00, 7752.87it/s]


# Living Trees

In [49]:
trees = extract_trees(data, NESTS)

In [50]:
trees.info(), trees.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6579 entries, 0 to 6578
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     6579 non-null   object 
 1   nest          6579 non-null   int64  
 2   species_name  5718 non-null   float64
 3   family_name   1330 non-null   float64
 4   DBH           6579 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 257.1+ KB


(None,
   unique_id  nest  species_name  family_name   DBH
 0     308D1     2           NaN         25.0  10.8
 1     308D1     2           NaN         25.0  17.3)

In [51]:
trees.describe()

Unnamed: 0,nest,species_name,family_name,DBH
count,6579.0,5718.0,1330.0,6579.0
mean,2.736434,326.618573,84.354135,40.478231
std,0.713525,269.467916,228.708801,25.398493
min,2.0,2.0,2.0,10.0
25%,2.0,194.0,22.0,19.8
50%,3.0,280.0,22.0,35.9
75%,3.0,313.0,33.0,52.9
max,4.0,999.0,999.0,199.0


## Export data and upload to BQ

In [52]:
# Export to CSV
trees.to_csv(CARBON_POOLS_OUTDIR / "trees.csv", index=False)

In [53]:
# Upload to BQ
pandas_gbq.to_gbq(
    trees, f"{DATASET_ID}.trees", project_id=GCP_PROJ_ID, if_exists=IF_EXISTS
)

100%|██████████| 1/1 [00:00<00:00, 9892.23it/s]


# Tree Stumps

[delete when fixed] Note: removed `'biomass_per_kg_tree': [biomass_per_kg_tree],`. In the original code there was a placeholder column created, this can be added later in the process when biomass per tree is actually calculated

In [54]:
stumps = extract_stumps(data, NESTS)

In [55]:
stumps.info(), stumps.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754 entries, 0 to 1753
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   unique_id      1754 non-null   object 
 1   nest           1754 non-null   int64  
 2   Diam1          1754 non-null   float64
 3   Diam2          1754 non-null   float64
 4   slope          1754 non-null   float64
 5   height         1754 non-null   float64
 6   cut_cl         1754 non-null   object 
 7   hollow_go      1754 non-null   object 
 8   hollow_d1      171 non-null    float64
 9   hollow_d2      171 non-null    float64
 10  stump_density  1754 non-null   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 150.9+ KB


(None,
   unique_id  nest  Diam1  Diam2  slope  height   cut_cl hollow_go  hollow_d1  \
 0     308C1     2   30.0   29.0   43.0    18.0  saw_axe        no        NaN   
 1     249B1     2   15.0   10.0   51.0    80.0  saw_axe        no        NaN   
 
    hollow_d2  stump_density  
 0        NaN            1.0  
 1        NaN            3.0  )

In [56]:
stumps.describe()

Unnamed: 0,nest,Diam1,Diam2,slope,height,hollow_d1,hollow_d2,stump_density
count,1754.0,1754.0,1754.0,1754.0,1754.0,171.0,171.0,1754.0
mean,3.111745,40.57561,36.617423,32.697834,83.215314,36.059532,33.77345,2.128848
std,0.636456,28.664704,24.662826,18.110894,40.670748,23.908597,21.810063,0.791743
min,2.0,10.0,10.0,1.0,1.5,5.0,5.0,1.0
25%,3.0,20.0,18.7,16.0,52.0,18.55,18.25,1.0
50%,3.0,32.0,30.0,33.0,80.0,30.0,30.0,2.0
75%,4.0,53.725,48.875,47.0,110.0,48.0,43.0,3.0
max,4.0,195.0,198.0,80.0,199.1,160.0,150.0,3.0


## Export data and upload to BQ

In [57]:
# Export to CSV
stumps.to_csv(CARBON_POOLS_OUTDIR / "stumps.csv", index=False)

In [58]:
# Upload to BQ
pandas_gbq.to_gbq(
    stumps, f"{DATASET_ID}.stumps", project_id=GCP_PROJ_ID, if_exists=IF_EXISTS
)

100%|██████████| 1/1 [00:00<00:00, 11915.64it/s]


# Dead Trees: Class 1

In [59]:
dead_trees_c1 = extract_dead_trees_class1(data, NESTS)

No class 1 dead trees found in nest 2


In [60]:
if not dead_trees_c1.empty:
    dead_trees_c1["class"] = 1
    dead_trees_c1["subclass"] = np.nan

In [61]:
dead_trees_c1.info(), dead_trees_c1.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     2 non-null      object 
 1   nest          2 non-null      int64  
 2   species_name  2 non-null      float64
 3   DBH_cl1       2 non-null      float64
 4   class         2 non-null      int64  
 5   subclass      0 non-null      float64
dtypes: float64(3), int64(2), object(1)
memory usage: 228.0+ bytes


(None,
   unique_id  nest  species_name  DBH_cl1  class  subclass
 0     290C1     3         145.0     38.0      1       NaN
 1     290C1     4         177.0     58.8      1       NaN)

In [62]:
dead_trees_c1.describe()

Unnamed: 0,nest,species_name,DBH_cl1,class,subclass
count,2.0,2.0,2.0,2.0,0.0
mean,3.5,161.0,48.4,1.0,
std,0.707107,22.627417,14.707821,0.0,
min,3.0,145.0,38.0,1.0,
25%,3.25,153.0,43.2,1.0,
50%,3.5,161.0,48.4,1.0,
75%,3.75,169.0,53.6,1.0,
max,4.0,177.0,58.8,1.0,


# Dead Trees: Class 2 - short

In [63]:
dead_trees_c2s = extract_dead_trees_class2s(data, NESTS)

No dead trees of class 2 found in nest 2
No dead trees of class 2 found in nest 3
No dead trees of class 2 found in nest 4


In [64]:
dead_trees_c2s.info(), dead_trees_c2s.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


(None,
 Empty DataFrame
 Columns: []
 Index: [])

In [65]:
if not dead_trees_c2s.empty:
    dead_trees_c2s["class"] = 2
    dead_trees_c2s["subclass"] = "short"

# Dead Trees: Class 2 - Tall

In [66]:
dead_trees_c2t = extract_dead_trees_class2t(data, NESTS)

In [67]:
dead_trees_c2t.info(), dead_trees_c2t.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     254 non-null    object 
 1   nest          254 non-null    int64  
 2   species_name  201 non-null    float64
 3   family_name   12 non-null     float64
 4   dbh_tall      254 non-null    float64
 5   db_tall       254 non-null    float64
 6   tall_density  254 non-null    float64
 7   slope_t_tall  254 non-null    float64
 8   slope_b_tall  254 non-null    float64
 9   dist_t_tall   254 non-null    float64
 10  class         254 non-null    int64  
dtypes: float64(8), int64(2), object(1)
memory usage: 22.0+ KB


(None,
   unique_id  nest  species_name  family_name  dbh_tall  db_tall  tall_density  \
 0     368A1     2         999.0          NaN      21.9     24.1           1.0   
 1     281A1     2           NaN          NaN      32.6     40.8           1.0   
 
    slope_t_tall  slope_b_tall  dist_t_tall  class  
 0          61.0          56.0         10.0      2  
 1         115.0          20.0          9.3      2  )

In [68]:
if not dead_trees_c2t.empty:
    dead_trees_c2t["class"] = 2
    dead_trees_c2t["subclass"] = "tall"

# Combine into one table

In [69]:
dead_trees = pd.concat([dead_trees_c1, dead_trees_c2s, dead_trees_c2t])

In [70]:
dead_trees.info(), dead_trees.head(2)

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 0 to 253
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     256 non-null    object 
 1   nest          256 non-null    int64  
 2   species_name  203 non-null    float64
 3   DBH_cl1       2 non-null      float64
 4   class         256 non-null    int64  
 5   subclass      254 non-null    object 
 6   family_name   12 non-null     float64
 7   dbh_tall      254 non-null    float64
 8   db_tall       254 non-null    float64
 9   tall_density  254 non-null    float64
 10  slope_t_tall  254 non-null    float64
 11  slope_b_tall  254 non-null    float64
 12  dist_t_tall   254 non-null    float64
dtypes: float64(9), int64(2), object(2)
memory usage: 28.0+ KB


(None,
   unique_id  nest  species_name  DBH_cl1  class subclass  family_name  \
 0     290C1     3         145.0     38.0      1      NaN          NaN   
 1     290C1     4         177.0     58.8      1      NaN          NaN   
 
    dbh_tall  db_tall  tall_density  slope_t_tall  slope_b_tall  dist_t_tall  
 0       NaN      NaN           NaN           NaN           NaN          NaN  
 1       NaN      NaN           NaN           NaN           NaN          NaN  )

## Export data and upload to BQ

In [71]:
# Export CSV
if len(dead_trees) != 0:
    dead_trees.to_csv(CARBON_POOLS_OUTDIR / "dead_trees.csv", index=False)

In [72]:
# Upload to BQ
if len(dead_trees) != 0:
    pandas_gbq.to_gbq(
        dead_trees,
        f"{DATASET_ID}.dead_trees",
        project_id=GCP_PROJ_ID,
        if_exists=IF_EXISTS,
    )

100%|██████████| 1/1 [00:00<00:00, 10754.63it/s]


# Lying Deadwood: Hollow

In [73]:
ldw_hollow = extract_ldw_with_hollow(data)

In [74]:
ldw_hollow.info(), ldw_hollow.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   unique_id   15 non-null     object 
 1   repetition  15 non-null     int64  
 2   type        15 non-null     object 
 3   class       15 non-null     object 
 4   hollow_d1   15 non-null     float64
 5   hollow_d2   15 non-null     float64
 6   diameter    15 non-null     float64
 7   density     15 non-null     float64
dtypes: float64(4), int64(1), object(3)
memory usage: 1.1+ KB


(None,
   unique_id  repetition type class  hollow_d1  hollow_d2  diameter  density
 0     249D1           1  tr2   MDF       12.0       10.0      16.7      3.0
 1     290A1           2  tr2   MCB       68.0       28.0      62.0      1.0)

## Export data and upload to BQ

In [75]:
# Export CSV
if len(ldw_hollow) != 0:
    ldw_hollow.to_csv(CARBON_POOLS_OUTDIR / "lying_deadwood_hollow.csv", index=False)

In [76]:
# Upload to BQ
if len(ldw_hollow) != 0:
    pandas_gbq.to_gbq(
        ldw_hollow,
        f"{DATASET_ID}.lying_deadwood_hollow",
        project_id=GCP_PROJ_ID,
        if_exists=IF_EXISTS,
    )

100%|██████████| 1/1 [00:00<00:00, 11491.24it/s]


# Lying Deadwood without hollow

In [77]:
ldw_wo_hollow = extract_ldw_wo_hollow(data)

In [78]:
ldw_wo_hollow.info(), ldw_wo_hollow.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   unique_id   1826 non-null   object 
 1   repetition  1826 non-null   int64  
 2   type        1826 non-null   object 
 3   class       1826 non-null   object 
 4   diameter    1826 non-null   float64
 5   density     1826 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 85.7+ KB


(None,
   unique_id  repetition type class  diameter  density
 0     308D1           1  tr1    FC      16.5      3.0
 1     308D1           2  tr1    FC      18.3      3.0)

## Export data and upload to BQ

In [79]:
# Export CSV
if len(ldw_wo_hollow) != 0:
    ldw_wo_hollow.to_csv(
        CARBON_POOLS_OUTDIR / "lying_deadwood_wo_hollow.csv", index=False
    )

In [80]:
# Upload to BQ
if len(ldw_wo_hollow) != 0:
    pandas_gbq.to_gbq(
        ldw_wo_hollow,
        f"{DATASET_ID}.lying_deadwood_wo_hollow",
        project_id=GCP_PROJ_ID,
        if_exists=IF_EXISTS,
    )

100%|██████████| 1/1 [00:00<00:00, 10131.17it/s]
