# Preprocess ODK data to organized tables

# Imports and Set-up

In [29]:
# Standard Imports
import sys
import os
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Google Cloud Imports
import pandas_gbq

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR, GCP_PROJ_ID
from src.biomass_inventory import (
    extract_trees,
    extract_stumps,
    extract_dead_trees_class1,
    extract_dead_trees_class2,
    extract_dead_trees_class3,
    extract_ldw_with_hollow,
    extract_ldw_wo_hollow,
)

In [3]:
# Variables
URL = "https://api.ona.io/api/v1/data/763932.csv"
FILE_RAW = DATA_DIR / "csv" / "biomass_inventory_raw.csv"
CARBON_POOLS_OUTDIR = DATA_DIR / "csv" / "carbon_pools"
NESTS = [2, 3, 4]

# BigQuery Variables
DATASET_ID = "biomass_inventory"

In [4]:
# Create output directory
CARBON_POOLS_OUTDIR.mkdir(parents=True, exist_ok=True)

# Get Data from ONA

In [5]:
column_types = {
    col: str
    for col in (
        28,
        399,
        400,
        407,
        408,
        415,
        416,
        845,
        846,
        853,
        854,
        861,
        862,
        869,
        870,
        877,
        878,
        885,
        886,
        893,
        894,
        901,
        902,
        909,
        910,
        1179,
        1180,
        1187,
        1188,
        1195,
        1196,
        1203,
        1204,
        1211,
        1212,
        1219,
        1220,
        1286,
        1337,
        1342,
        1347,
        1352,
        1357,
        1362,
        1378,
        1392,
    )
}

In [6]:
if FILE_RAW.exists():
    data = pd.read_csv(FILE_RAW, dtype=column_types)
else:
    urllib.request.urlretrieve(URL, FILE_RAW)
    data = pd.read_csv(FILE_RAW, dtype=column_types)

## Add a unique ID

In [7]:
# Create a new column with "1" for Primary and "2" for Backup
data["plot_info/plot_type_short"] = data["plot_info/plot_type"].apply(
    lambda x: "1" if x == "Primary" else "2"
)

# Extract subplot letters (assuming they are included in the 'plot_info.sub_plot' column)
data["subplot_letter"] = data["plot_info/sub_plot"].str.replace("sub_plot", "")

# Create the unique ID by concatenating the specified columns
data["unique_id"] = (
    data["plot_info/plot_code_nmbr"].astype(str)
    + data["subplot_letter"]
    + data["plot_info/plot_type_short"]
)

# Extract Plot info

In [8]:
plot_info_cols = [
    "unique_id",
    "plot_info/data_recorder",
    "plot_info/team_no",
    "plot_info/plot_code_nmbr",
    "plot_info/plot_type",
    "plot_info/sub_plot",
    "plot_info/yes_no",
    "plot_shift/sub_plot_shift",
    "plot_GPS/GPS_waypt",
    "plot_GPS/GPS_id",
    "plot_GPS/GPS",
    "plot_GPS/_GPS_latitude",
    "plot_GPS/_GPS_longitude",
    "plot_GPS/_GPS_altitude",
    "plot_GPS/_GPS_precision",
    "plot_GPS/photo",
    "access/access_reason/slope",
    "access/access_reason/danger",
    "access/access_reason/distance",
    "access/access_reason/water",
    "access/access_reason/prohibited",
    "access/access_reason/other",
    "access/manual_reason",
    "lc_data/lc_type",
    "lc_class/lc_class",
    "lc_class/lc_class_other",
    "disturbance/disturbance_yesno",
    "disturbance_data/disturbance_type",
    "disturbance_class/disturbance_class",
    "slope/slope",
    "canopy/avg_height",
    "canopy/can_cov",
]

In [9]:
plot_info = data[plot_info_cols]

In [10]:
plot_info.head(2)

Unnamed: 0,unique_id,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,plot_shift/sub_plot_shift,plot_GPS/GPS_waypt,plot_GPS/GPS_id,...,access/manual_reason,lc_data/lc_type,lc_class/lc_class,lc_class/lc_class_other,disturbance/disturbance_yesno,disturbance_data/disturbance_type,disturbance_class/disturbance_class,slope/slope,canopy/avg_height,canopy/can_cov
0,308D2,Steve,1,308,primary,sub_plotD,yes,no_shift,7.0,1.0,...,,f_plant,FC,,,,,23.0,12.0,3.0
1,308A2,Steve,1,308,primary,sub_plotA,yes,no_shift,8.0,1.0,...,,f_nat,MDF,,no,,,13.0,8.0,4.0


# Extract info per carbon pool

# Living Trees

In [11]:
trees = extract_trees(data, NESTS)

In [12]:
trees.info(), trees.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6579 entries, 0 to 6578
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     6579 non-null   object 
 1   nest          6579 non-null   int64  
 2   species_name  5718 non-null   float64
 3   family_name   1330 non-null   float64
 4   DBH           6579 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 257.1+ KB


(None,
   unique_id  nest  species_name  family_name   DBH
 0     308D2     2           NaN         25.0  10.8
 1     308D2     2           NaN         25.0  17.3)

In [13]:
trees.describe()

Unnamed: 0,nest,species_name,family_name,DBH
count,6579.0,5718.0,1330.0,6579.0
mean,2.736434,326.618573,84.354135,40.478231
std,0.713525,269.467916,228.708801,25.398493
min,2.0,2.0,2.0,10.0
25%,2.0,194.0,22.0,19.8
50%,3.0,280.0,22.0,35.9
75%,3.0,313.0,33.0,52.9
max,4.0,999.0,999.0,199.0


## Export data and upload to BQ

In [14]:
# Export to CSV
trees.to_csv(CARBON_POOLS_OUTDIR / "trees.csv", index=False)

In [None]:
# Upload to BQ
pandas_gbq.to_gbq(trees, f"{DATASET_ID}.trees", project_id=GCP_PROJ_ID)

# Tree Stumps

note (delete when addressed): removed `'biomass_per_kg_tree': [biomass_per_kg_tree],`. In the original code there was a placeholder column created, this can be added later in the process when biomass per tree is actually calculated

In [16]:
stumps = extract_stumps(data, NESTS)

In [17]:
stumps.info(), stumps.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754 entries, 0 to 1753
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   unique_id      1754 non-null   object 
 1   nest           1754 non-null   int64  
 2   Diam1          1754 non-null   float64
 3   Diam2          1754 non-null   float64
 4   slope          1754 non-null   float64
 5   height         1754 non-null   float64
 6   cut_cl         1754 non-null   object 
 7   hollow_go      1754 non-null   object 
 8   hollow_d1      171 non-null    float64
 9   hollow_d2      171 non-null    float64
 10  stump_density  1754 non-null   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 150.9+ KB


(None,
   unique_id  nest  Diam1  Diam2  slope  height   cut_cl hollow_go  hollow_d1  \
 0     308C2     2   30.0   29.0   43.0    18.0  saw_axe        no        NaN   
 1     249B2     2   15.0   10.0   51.0    80.0  saw_axe        no        NaN   
 
    hollow_d2  stump_density  
 0        NaN            1.0  
 1        NaN            3.0  )

In [18]:
stumps.describe()

Unnamed: 0,nest,Diam1,Diam2,slope,height,hollow_d1,hollow_d2,stump_density
count,1754.0,1754.0,1754.0,1754.0,1754.0,171.0,171.0,1754.0
mean,3.111745,40.57561,36.617423,32.697834,83.215314,36.059532,33.77345,2.128848
std,0.636456,28.664704,24.662826,18.110894,40.670748,23.908597,21.810063,0.791743
min,2.0,10.0,10.0,1.0,1.5,5.0,5.0,1.0
25%,3.0,20.0,18.7,16.0,52.0,18.55,18.25,1.0
50%,3.0,32.0,30.0,33.0,80.0,30.0,30.0,2.0
75%,4.0,53.725,48.875,47.0,110.0,48.0,43.0,3.0
max,4.0,195.0,198.0,80.0,199.1,160.0,150.0,3.0


## Export data and upload to BQ

In [None]:
# Export to CSV
stumps.to_csv(CARBON_POOLS_OUTDIR / "stumps.csv", index=False)

In [None]:
# Upload to BQ
pandas_gbq.to_gbq(stumps, f"{DATASET_ID}.stumps", project_id=GCP_PROJ_ID)

# Dead Trees: Class 1

In [19]:
dead_trees_c1 = extract_dead_trees_class1(data, NESTS)

No class 1 dead trees found in nest 2


In [20]:
dead_trees_c1.info(), dead_trees_c1.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     2 non-null      object 
 1   nest          2 non-null      int64  
 2   species_name  2 non-null      float64
 3   DBH_cl1       2 non-null      float64
 4   class         2 non-null      int64  
 5   subclass      2 non-null      object 
dtypes: float64(2), int64(2), object(2)
memory usage: 228.0+ bytes


(None,
   unique_id  nest  species_name  DBH_cl1  class subclass
 0     290C2     3         145.0     38.0      1      n/a
 1     290C2     4         177.0     58.8      1      n/a)

In [21]:
dead_trees_c1.describe()

Unnamed: 0,nest,species_name,DBH_cl1,class
count,2.0,2.0,2.0,2.0
mean,3.5,161.0,48.4,1.0
std,0.707107,22.627417,14.707821,0.0
min,3.0,145.0,38.0,1.0
25%,3.25,153.0,43.2,1.0
50%,3.5,161.0,48.4,1.0
75%,3.75,169.0,53.6,1.0
max,4.0,177.0,58.8,1.0


## Export data and upload to BQ

In [None]:
dead_trees_c1.to_csv(CARBON_POOLS_OUTDIR / "dead_trees_class1.csv", index=False)

In [None]:
# Upload to BQ
pandas_gbq.to_gbq(dead_trees_c1, f"{DATASET_ID}.dead_trees_c1", project_id=GCP_PROJ_ID)

# Dead Trees: Class 2

In [22]:
dead_trees_c2 = extract_dead_trees_class2(data, NESTS)

No dead trees of class 2 found in nest 2
No dead trees of class 2 found in nest 3
No dead trees of class 2 found in nest 4


In [23]:
dead_trees_c2.info(), dead_trees_c2.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


(None,
 Empty DataFrame
 Columns: []
 Index: [])

## Export data and upload to BQ

In [None]:
# Export CSV
if len(dead_trees_c2) != 0:
    dead_trees_c2.to_csv(CARBON_POOLS_OUTDIR / "dead_trees_class2.csv", index=False)

In [None]:
# Upload to BQ
if len(dead_trees_c2) != 0:
    pandas_gbq.to_gbq(
        dead_trees_c2, f"{DATASET_ID}.dead_trees_c2", project_id=GCP_PROJ_ID
    )

# Dead Trees: Class 3

In [None]:
dead_trees_c3 = extract_dead_trees_c3(data, NESTS)

In [None]:
dead_trees_c3.info(), dead_trees_c3.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     254 non-null    object 
 1   nest          254 non-null    int64  
 2   species_name  201 non-null    float64
 3   family_name   12 non-null     float64
 4   dbh_tall      254 non-null    float64
 5   db_tall       254 non-null    float64
 6   tall_density  254 non-null    float64
 7   slope_t_tall  254 non-null    float64
 8   slope_b_tall  254 non-null    float64
 9   dist_t_tall   254 non-null    float64
 10  class         254 non-null    int64  
dtypes: float64(8), int64(2), object(1)
memory usage: 22.0+ KB


(None,
   unique_id  nest  species_name  family_name  dbh_tall  db_tall  tall_density  \
 0     368A2     2         999.0          NaN      21.9     24.1           1.0   
 1     281A2     2           NaN          NaN      32.6     40.8           1.0   
 
    slope_t_tall  slope_b_tall  dist_t_tall  class  
 0          61.0          56.0         10.0      2  
 1         115.0          20.0          9.3      2  )

## Export data and upload to BQ

In [None]:
# Export CSV
if len(dead_trees_c3) != 0:
    dead_trees_c3.to_csv(CARBON_POOLS_OUTDIR / "dead_trees_class3.csv", index=False)

In [None]:
# Upload to BQ
if len(dead_trees_c3) != 0:
    pandas_gbq.to_gbq(
        dead_trees_c3, f"{DATASET_ID}.dead_trees_c3", project_id=GCP_PROJ_ID
    )

100%|██████████| 1/1 [00:00<00:00, 10205.12it/s]


# Lying Deadwood: Hollow

In [None]:
ldw_hollow = extract_ldw_with_hollow(data)

In [None]:
ldw_hollow.info(), ldw_hollow.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   unique_id   15 non-null     object 
 1   repetition  15 non-null     int64  
 2   type        15 non-null     object 
 3   class       15 non-null     object 
 4   hollow_d1   15 non-null     float64
 5   hollow_d2   15 non-null     float64
 6   diameter    15 non-null     float64
 7   density     15 non-null     float64
dtypes: float64(4), int64(1), object(3)
memory usage: 1.1+ KB


(None,
   unique_id  repetition type class  hollow_d1  hollow_d2  diameter  density
 0     249D2           1  tr2   MDF       12.0       10.0      16.7      3.0
 1     290A2           2  tr2   MCB       68.0       28.0      62.0      1.0)

## Export data and upload to BQ

In [None]:
# Export CSV
if len(ldw_hollow) != 0:
    ldw_hollow.to_csv(CARBON_POOLS_OUTDIR / "lying_deadwood_hollow.csv", index=False)

In [None]:
# Upload to BQ
if len(ldw_hollow) != 0:
    pandas_gbq.to_gbq(
        ldw_hollow, f"{DATASET_ID}.lying_deadwood_hollow", project_id=GCP_PROJ_ID
    )

100%|██████████| 1/1 [00:00<00:00, 14665.40it/s]


# Lying Deadwood without hollow

In [39]:
ldw_wo_hollow = extract_ldw_wo_hollow(data)

In [38]:
ldw_wo_hollow.info(), ldw_wo_hollow.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   unique_id   1826 non-null   object 
 1   repetition  1826 non-null   int64  
 2   type        1826 non-null   object 
 3   class       1826 non-null   object 
 4   diameter    1826 non-null   float64
 5   density     1826 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 85.7+ KB


(None,
   unique_id  repetition type class  diameter  density
 0     308D2           1  tr1    FC      16.5      3.0
 1     308D2           2  tr1    FC      18.3      3.0)

## Export data and upload to BQ

In [40]:
# Export CSV
if len(ldw_without_hollow) != 0:
    ldw_wo_hollow.to_csv(
        CARBON_POOLS_OUTDIR / "lying_deadwood_wo_hollow.csv", index=False
    )

In [41]:
# Upload to BQ
if len(ldw_wo_hollow) != 0:
    pandas_gbq.to_gbq(
        ldw_wo_hollow, f"{DATASET_ID}.lying_deadwood_wo_hollow", project_id=GCP_PROJ_ID
    )

100%|██████████| 1/1 [00:00<00:00, 6278.90it/s]
