# Preprocess ODK data to organized tables

# Imports and Set-up

In [1]:
import sys
import urllib.request
import pandas as pd

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR
from src.biomass_inventory import extract_trees

In [3]:
# Variables
URL = "https://api.ona.io/api/v1/data/763932.csv"
FILE_RAW = DATA_DIR / "csv" / "biomass_inventory_raw.csv"

# Get Data from ONA

In [4]:
if FILE_RAW.exists():
    data = pd.read_csv(FILE_RAW, low_memory=False)
else:
    urllib.request.urlretrieve(URL, FILE_RAW)
    data = pd.read_csv(FILE_RAW, low_memory=False)

In [5]:
data.head(2)

Unnamed: 0,start,end,today,deviceid,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,...,_date_modified,_tags,_notes,_version,_duration,_submitted_by,_total_media,_media_count,_media_all_received,_xform_id
0,2023-09-13T08:34:39.601+08:00,2023-10-10T12:36:28.070+08:00,2023-09-13,collect:49ETbl4W1cfU6fo2,Steve,1,308,primary,sub_plotD,yes,...,2023-10-10T04:36:29.296858+00:00,,,13,2347309.0,clearwind,2.0,2.0,True,763932.0
1,2023-09-13T09:51:59.092+08:00,2023-10-10T12:36:09.000+08:00,2023-09-13,collect:49ETbl4W1cfU6fo2,Steve,1,308,primary,sub_plotA,yes,...,2023-10-10T04:36:10.231153+00:00,,,13,2342650.0,clearwind,2.0,2.0,True,763932.0


## Add a unique ID

In [6]:
# Create a new column with "1" for Primary and "2" for Backup
data["plot_info/plot_type_short"] = data["plot_info/plot_type"].apply(
    lambda x: "1" if x == "Primary" else "2"
)

# Extract subplot letters (assuming they are included in the 'plot_info.sub_plot' column)
data["subplot_letter"] = data["plot_info/sub_plot"].str.replace("sub_plot", "")

# Create the unique ID by concatenating the specified columns
data["unique_id"] = (
    data["plot_info/plot_code_nmbr"].astype(str)
    + data["subplot_letter"]
    + data["plot_info/plot_type_short"]
)

# Extract Plot info

In [13]:
plot_info_cols = [
    "unique_id",
    "plot_info/data_recorder",
    "plot_info/team_no",
    "plot_info/plot_code_nmbr",
    "plot_info/plot_type",
    "plot_info/sub_plot",
    "plot_info/yes_no",
    "plot_shift/sub_plot_shift",
    "plot_GPS/GPS_waypt",
    "plot_GPS/GPS_id",
    "plot_GPS/GPS",
    "plot_GPS/_GPS_latitude",
    "plot_GPS/_GPS_longitude",
    "plot_GPS/_GPS_altitude",
    "plot_GPS/_GPS_precision",
    "plot_GPS/photo",
    "access/access_reason/slope",
    "access/access_reason/danger",
    "access/access_reason/distance",
    "access/access_reason/water",
    "access/access_reason/prohibited",
    "access/access_reason/other",
    "access/manual_reason",
    "lc_data/lc_type",
    "lc_class/lc_class",
    "lc_class/lc_class_other",
    "disturbance/disturbance_yesno",
    "disturbance_data/disturbance_type",
    "disturbance_class/disturbance_class",
    "slope/slope",
    "canopy/avg_height",
    "canopy/can_cov",
]

In [14]:
plot_info = data[plot_info_cols]

In [15]:
plot_info.head(2)

Unnamed: 0,unique_id,plot_info/data_recorder,plot_info/team_no,plot_info/plot_code_nmbr,plot_info/plot_type,plot_info/sub_plot,plot_info/yes_no,plot_shift/sub_plot_shift,plot_GPS/GPS_waypt,plot_GPS/GPS_id,...,access/manual_reason,lc_data/lc_type,lc_class/lc_class,lc_class/lc_class_other,disturbance/disturbance_yesno,disturbance_data/disturbance_type,disturbance_class/disturbance_class,slope/slope,canopy/avg_height,canopy/can_cov
0,308D2,Steve,1,308,primary,sub_plotD,yes,no_shift,7.0,1.0,...,,f_plant,FC,,,,,23.0,12.0,3.0
1,308A2,Steve,1,308,primary,sub_plotA,yes,no_shift,8.0,1.0,...,,f_nat,MDF,,no,,,13.0,8.0,4.0


# Extract info per carbon pool

## Living Trees

In [7]:
extract_trees(data, [2, 3, 4])

Unnamed: 0,unique_id,nest,species_name,family_name,DBH
0,308D2,2,,25.0,10.8
1,308D2,2,,25.0,17.3
2,308D2,2,,25.0,12.8
3,308D2,2,,25.0,28.1
4,308A2,2,999.0,,18.7
...,...,...,...,...,...
6574,38C2,4,278.0,,50.1
6575,38C2,4,292.0,,80.8
6576,4B2,4,205.0,,81.7
6577,4A2,4,289.0,,84.7
