# Preprocess ODK data into organized tables

# Imports and Set-up

In [1]:
# Standard Imports
import sys
import os
import urllib.request
import pandas as pd
import numpy as np

# Google Cloud Imports
import pandas_gbq

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR, GCP_PROJ_ID
from src.biomass_inventory import (
    extract_trees,
    extract_stumps,
    extract_dead_trees_class1,
)

In [3]:
# Variables
# CSV export endpoint on the ONA API (dataset 763932).
URL = "https://api.ona.io/api/v1/data/763932.csv"
# Local copy of the raw export; when this file exists it is read instead of
# downloading again.
FILE_RAW = DATA_DIR / "csv" / "biomass_inventory_raw.csv"
# Folder that receives the per-table CSV exports (trees.csv, stumps.csv).
CARBON_POOLS_OUTDIR = DATA_DIR / "csv" / "carbon_pools"
# Passed to every extract_* helper. Presumably the nest numbers to read for
# each subplot -- TODO confirm against src.biomass_inventory.
NESTS = [2, 3, 4]

# BigQuery Variables
# Dataset name used as the prefix of every destination table
# (f"{DATASET_ID}.<table>").
DATASET_ID = "biomass_inventory"

In [4]:
# Create output directory
# parents=True also creates DATA_DIR / "csv" when missing, which is the folder
# FILE_RAW is written to; exist_ok=True makes re-running this cell harmless.
CARBON_POOLS_OUTDIR.mkdir(parents=True, exist_ok=True)

# Get Data from ONA

In [5]:
# Download the raw export only when no local copy exists yet, then load it.
if not FILE_RAW.exists():
    urllib.request.urlretrieve(URL, FILE_RAW)
# low_memory=False parses the file in a single pass instead of in chunks.
data = pd.read_csv(FILE_RAW, low_memory=False)

In [6]:
# Preview the first two rows of the raw export.
data.head(2)

Unnamed: 0,start,end,today,deviceid,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,...,_date_modified,_tags,_notes,_version,_duration,_submitted_by,_total_media,_media_count,_media_all_received,_xform_id
0,2023-09-13T08:34:39.601+08:00,2023-10-10T12:36:28.070+08:00,2023-09-13,collect:49ETbl4W1cfU6fo2,Steve,1,308,primary,sub_plotD,yes,...,2023-10-10T04:36:29.296858+00:00,,,13,2347309.0,clearwind,2.0,2.0,True,763932.0
1,2023-09-13T09:51:59.092+08:00,2023-10-10T12:36:09.000+08:00,2023-09-13,collect:49ETbl4W1cfU6fo2,Steve,1,308,primary,sub_plotA,yes,...,2023-10-10T04:36:10.231153+00:00,,,13,2342650.0,clearwind,2.0,2.0,True,763932.0


## Add a unique ID

In [7]:
# Encode the plot type as a one-character suffix: "1" for primary, "2" for
# anything else (backup). The raw export stores the value in lowercase
# ("primary"), so the comparison is made case-insensitively; an exact match
# against "Primary" would give every row the suffix "2".
data["plot_info/plot_type_short"] = (
    data["plot_info/plot_type"]
    .astype(str)
    .str.strip()
    .str.lower()
    .eq("primary")
    .map({True: "1", False: "2"})
)

# Extract the subplot letter by removing the literal "sub_plot" prefix from the
# 'plot_info/sub_plot' column (e.g. "sub_plotD" -> "D"). regex=False keeps the
# pattern a plain string regardless of the pandas version's default.
data["subplot_letter"] = data["plot_info/sub_plot"].str.replace(
    "sub_plot", "", regex=False
)

# Create the unique ID by concatenating plot code, subplot letter and plot
# type suffix, e.g. 308 + "D" + "1" -> "308D1".
data["unique_id"] = (
    data["plot_info/plot_code_nmbr"].astype(str)
    + data["subplot_letter"]
    + data["plot_info/plot_type_short"]
)

# Extract Plot info

In [None]:
# Columns copied from the raw submissions into the plot-level table. The text
# before "/" in each name is the group prefix used in the raw export.
plot_info_cols = [
    # ID built in this notebook from plot code, subplot letter and plot type.
    "unique_id",
    # Plot identification
    "plot_info/data_recorder",
    "plot_info/team_no",
    "plot_info/plot_code_nmbr",
    "plot_info/plot_type",
    "plot_info/sub_plot",
    "plot_info/yes_no",
    "plot_shift/sub_plot_shift",
    # GPS reading and photo
    "plot_GPS/GPS_waypt",
    "plot_GPS/GPS_id",
    "plot_GPS/GPS",
    "plot_GPS/_GPS_latitude",
    "plot_GPS/_GPS_longitude",
    "plot_GPS/_GPS_altitude",
    "plot_GPS/_GPS_precision",
    "plot_GPS/photo",
    # Access reasons
    "access/access_reason/slope",
    "access/access_reason/danger",
    "access/access_reason/distance",
    "access/access_reason/water",
    "access/access_reason/prohibited",
    "access/access_reason/other",
    "access/manual_reason",
    # Land cover
    "lc_data/lc_type",
    "lc_class/lc_class",
    "lc_class/lc_class_other",
    # Disturbance
    "disturbance/disturbance_yesno",
    "disturbance_data/disturbance_type",
    "disturbance_class/disturbance_class",
    # Slope and canopy
    "slope/slope",
    "canopy/avg_height",
    "canopy/can_cov",
]

In [None]:
# Keep every row but only the plot-level columns, in the order listed in
# plot_info_cols.
plot_info = data.loc[:, plot_info_cols]

In [None]:
# Preview the first two rows of the plot-level table.
plot_info.head(2)

# Extract info per carbon pool

# Living Trees

In [None]:
# Build the living-trees table from the raw submissions using the project
# helper imported from src.biomass_inventory.
# NOTE(review): NESTS presumably selects which nests are read -- confirm in
# the helper.
trees = extract_trees(data, NESTS)

In [None]:
# Print the column summary, then display the first two rows.
# These are separate statements because info() prints its report and returns
# None: written as one tuple expression the cell would display
# `(None, <text repr of the frame>)` instead of the rendered table.
trees.info()
trees.head(2)

## Export data and upload to BQ

In [None]:
# Export to CSV
# index=False leaves the row index out of the written file.
trees.to_csv(CARBON_POOLS_OUTDIR / "trees.csv", index=False)

In [None]:
# Upload to BQ
# Destination is the "trees" table of DATASET_ID in project GCP_PROJ_ID.
# NOTE(review): if_exists is left at its default, which per the pandas-gbq
# documentation is "fail" -- re-running this cell once the table exists would
# raise. Confirm whether "replace" or "append" is intended.
pandas_gbq.to_gbq(trees, f"{DATASET_ID}.trees", project_id=GCP_PROJ_ID)

# Tree Stumps

Note (delete when addressed): `'biomass_per_kg_tree': [biomass_per_kg_tree],` was removed. The original code created this as a placeholder column; it can be added back later in the process, once biomass per tree is actually calculated.

In [None]:
# Build the tree-stumps table from the raw submissions using the project
# helper imported from src.biomass_inventory.
# NOTE(review): NESTS presumably selects which nests are read -- confirm in
# the helper.
stumps = extract_stumps(data, NESTS)

In [None]:
# Print the column summary, then display the first two rows.
# These are separate statements because info() prints its report and returns
# None: written as one tuple expression the cell would display
# `(None, <text repr of the frame>)` instead of the rendered table.
stumps.info()
stumps.head(2)

## Export data and upload to BQ

In [None]:
# Export to CSV
# index=False leaves the row index out of the written file.
stumps.to_csv(CARBON_POOLS_OUTDIR / "stumps.csv", index=False)

In [None]:
# Upload to BQ
# Destination is the "stumps" table of DATASET_ID in project GCP_PROJ_ID.
# NOTE(review): if_exists is left at its default, which per the pandas-gbq
# documentation is "fail" -- re-running this cell once the table exists would
# raise. Confirm whether "replace" or "append" is intended.
pandas_gbq.to_gbq(stumps, f"{DATASET_ID}.stumps", project_id=GCP_PROJ_ID)

# Dead Trees: Class 1

In [8]:
# Build the class 1 dead-trees table from the raw submissions using the
# project helper imported from src.biomass_inventory.
# NOTE(review): NESTS presumably selects which nests are read -- confirm in
# the helper.
dead_trees_c1 = extract_dead_trees_class1(data, NESTS)

In [9]:
# Print the column summary and show the first two rows. info() prints its
# report and returns None, which is why the displayed tuple starts with None.
# NOTE(review): species_name comes out as float64 (e.g. 145.0), which looks
# like a numeric species code rather than a name -- confirm this is expected.
dead_trees_c1.info(), dead_trees_c1.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unique_id     2 non-null      object 
 1   nest          2 non-null      int64  
 2   species_name  2 non-null      float64
 3   DBH_cl1       2 non-null      float64
 4   class         2 non-null      int64  
 5   subclass      2 non-null      object 
dtypes: float64(2), int64(2), object(2)
memory usage: 228.0+ bytes


(None,
   unique_id  nest  species_name  DBH_cl1  class subclass
 0     290C2     3         145.0     38.0      1      n/a
 1     290C2     4         177.0     58.8      1      n/a)

## Export data and upload to BQ

In [10]:
# Upload to BQ
# Destination is the "dead_trees_c1" table of DATASET_ID in project GCP_PROJ_ID.
# NOTE(review): the trees and stumps sections write a CSV to
# CARBON_POOLS_OUTDIR before uploading; this table has no such export --
# confirm whether that is intentional.
# NOTE(review): if_exists is left at its default, which per the pandas-gbq
# documentation is "fail" -- re-running this cell once the table exists would
# raise. Confirm whether "replace" or "append" is intended.
pandas_gbq.to_gbq(dead_trees_c1, f"{DATASET_ID}.dead_trees_c1", project_id=GCP_PROJ_ID)

100%|██████████| 1/1 [00:00<00:00, 9597.95it/s]


Carbon pools:
- trees above ground
- trees below ground (roots)
- saplings
- non-tree and litter
- stumps
- lying deadwood
- standing deadwood
- dead trees


Aggregation:
- by subplot
- by plot
- by strata
