# Extract data from GCS and upload to BigQuery

In [1]:
# Standard Imports
import sys
import os
import re
from tqdm import tqdm
import pandas as pd
import geopandas as gpd

# Google Cloud Imports

In [2]:
# Util imports
sys.path.append("../../../")  # include parent directory
from src.settings import GEOJSON_DATA_DIR

In [3]:
# Variables
# Local folder (under the project's GeoJSON data directory) that holds the
# downloaded SILUP files.
SILUP_DIR = GEOJSON_DATA_DIR / "SILUP"

# GCS Variables
# Bucket prefix the SILUP GeoJSON files are copied from; the trailing slash
# matters because the download cell appends the "*.geojson" glob directly to it.
SILUP_GCS_DIR = "gs://silup-gis/onebase/"

# BigQuery Variables
# NOTE(review): these three are not used in the cells visible here; presumably
# they configure the BigQuery upload later in the notebook - confirm.
SRC_DATASET_ID = "biomass_inventory"
DATASET_ID = "carbon_stock"
IF_EXISTS = "replace"

## Download data

In [4]:
# Create the local download folder. `parents=True` also creates any missing
# parent directories, so the cell works on a fresh checkout where the GeoJSON
# data directory does not exist yet; `exist_ok=True` keeps re-runs idempotent.
SILUP_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Copy every object matching "*.geojson" under SILUP_GCS_DIR into the local
# SILUP_DIR. IPython substitutes the $-prefixed Python variables before the
# command is handed to the shell.
!gsutil -m cp $SILUP_GCS_DIR"*.geojson" $SILUP_DIR

## Combine separate SILUP files and format data

In [5]:
# Collect the downloaded SILUP file names.
# - Keep only ".geojson" entries so stray files in the folder (e.g. OS metadata
#   files) are not handed to the GeoJSON reader.
# - Sort the names because os.listdir() returns entries in arbitrary order;
#   sorting makes the row order of the combined frame reproducible.
file_list = sorted(
    entry for entry in os.listdir(SILUP_DIR) if entry.endswith(".geojson")
)

In [6]:
# Load every SILUP file, keep only the eligibility type and geometry columns,
# and tag each row with the CADT number and version parsed from the file name.
silup_gdf = []

for filename in tqdm(file_list):
    # Parse the file name first so a badly named file fails fast, before the
    # comparatively slow read, and with an error that names the offending file.
    cadt_match = re.search(r"CADT (\d+)", filename)
    if cadt_match is None:
        raise ValueError(f"No CADT number found in file name: {filename!r}")
    # Kept as a string so leading zeros (e.g. "092") are preserved.
    cadt_num = cadt_match.group(1)

    # Extract version from THIS file's name. (Previously the check looked at
    # file_list[2], which stamped every file with the third file's version.)
    version = "v0" if "v0" in filename else "final"

    data = gpd.read_file(SILUP_DIR / filename)
    data = data[["ELI_TYPE", "geometry"]].copy()

    data["cadt_num"] = cadt_num
    data["version"] = version
    silup_gdf.append(data)

100%|██████████| 30/30 [01:02<00:00,  2.08s/it]


In [7]:
# Stack the per-file frames row-wise into a single frame. Each file's own row
# labels are kept (ignore_index=False is the pandas default), so index values
# repeat across files.
combined_gdf = pd.concat(objs=silup_gdf, axis=0, ignore_index=False)

In [8]:
# Show the combined frame as the cell output for a quick visual check of the
# ELI_TYPE, geometry, cadt_num and version columns.
combined_gdf

Unnamed: 0,ELI_TYPE,geometry,cadt_num,version
0,AUD,MULTIPOLYGON Z (((557914.356 1019548.719 0.000...,092,v0
1,AUD,MULTIPOLYGON Z (((558141.679 1019360.271 0.000...,092,v0
2,AUD,MULTIPOLYGON Z (((558250.240 1019380.301 0.000...,092,v0
3,AUD,MULTIPOLYGON Z (((558181.153 1019370.263 0.000...,092,v0
4,AUD,MULTIPOLYGON Z (((557894.296 1019767.274 0.000...,092,v0
...,...,...,...,...
2226,NOT ELIGIBLE,"MULTIPOLYGON (((572304.290 1001588.313, 572300...",256,v0
2227,NOT ELIGIBLE,"MULTIPOLYGON (((577451.745 1001359.358, 577446...",256,v0
2228,NOT ELIGIBLE,"MULTIPOLYGON (((577292.164 1001766.425, 577287...",256,v0
2229,NOT ELIGIBLE,"MULTIPOLYGON (((576884.034 1002663.864, 576852...",256,v0
