# Extract data from GCS and upload to BigQuery

In [25]:
# Standard Imports
import sys
import os
import re
from tqdm import tqdm
import pandas as pd
import geopandas as gpd

# Google Cloud Imports

In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import GEOJSON_DATA_DIR

In [4]:
# Variables
SILUP_DIR = GEOJSON_DATA_DIR / "SILUP"

# GCS Variables
SILUP_GCS_DIR = "gs://silup-gis/onebase/"

# BigQuery Variables
SRC_DATASET_ID = "biomass_inventory"
DATASET_ID = "carbon_stock"
IF_EXISTS = "replace"

## Downlaod data

In [6]:
SILUP_DIR.mkdir(exist_ok=True)

In [None]:
!gsutil -m cp $SILUP_GCS_DIR"*.geojson" $SILUP_DIR

# Combine Separate SILUP and Format data

In [28]:
file_list = os.listdir(SILUP_DIR)

In [32]:
file_list

['CADT 092.geojson',
 'CADT 153.geojson',
 'v0 CADT 117.geojson',
 'CADT 116.geojson',
 'v0 CADT 077.geojson',
 'CADT 048.geojson',
 'v0 CADT 246.geojson',
 'v0 CADT 093.geojson',
 'v0 CADT 142.geojson',
 'CADT 114.geojson',
 'CADT 210.geojson',
 'CADT 254.geojson',
 'CADT 237.geojson',
 'v0 CADT 090.geojson',
 'v0 CADT 255.geojson',
 'CADT 115.geojson',
 'v0 CADT 089.geojson',
 'v0 CADT 078.geojson',
 'v0 CADT 136.geojson',
 'v0 CADT 250.geojson',
 'v0 CADT 223.geojson',
 'CADT 002.geojson',
 'CADT 118.geojson',
 'v0 CADT 079.geojson',
 'CADT 253.geojson',
 'v0 CADT 238.geojson',
 'CADT 239.geojson',
 'v0 CADT 252.geojson',
 'CADT 134.geojson',
 'CADT 256 .geojson']

In [None]:
columns_to_check = ["BAU_0"]

In [82]:
silup_gdf = []

for filename in tqdm(file_list):
    data = gpd.read_file(SILUP_DIR / filename)

    # Extract CADT number
    cadt_num = re.findall(r"CADT (\d+)", filename)[0]

    # Extract version
    if "v0" in file_list[2]:
        version = "v0"
    else:
        version = "final"

    for col in columns_to_check:
        if "BAU" in col:
            data.rename(columns={col: "BAU"}, inplace=True)
        if "PLAN" in col:
            data.rename(columns={col: "PLAN"}, inplace=True)

    data.reset_index(drop=True, inplace=True)
    data["cadt_num"] = cadt_num
    data["version"] = version
    silup_gdf.append(data)

100%|██████████| 30/30 [01:03<00:00,  2.13s/it]


In [84]:
for df in silup_gdf:
    print(df.shape)

(3027, 11)
(6970, 10)
(24922, 12)
(2803, 11)
(10466, 12)
(2346, 12)
(26457, 12)
(17013, 12)
(1773, 12)
(33304, 10)
(19061, 10)
(753, 11)
(341, 11)
(4808, 12)
(29262, 12)
(5408, 12)
(27480, 12)
(15274, 12)
(19186, 12)
(18248, 12)
(18531, 12)
(1793, 11)
(17832, 10)
(9003, 12)
(2183, 11)
(14087, 12)
(2912, 11)
(32134, 12)
(4666, 13)
(2231, 11)


In [65]:
silup_gdf[0].columns

Index(['index', 'fid', 'F_NF', 'LCC_GEN', 'PLAN', 'BAU', 'ELI_TYPE', 'AREA',
       'PROVINCE', 'geometry', 'cadt_num', 'version'],
      dtype='object')

In [67]:
silup_gdf[2].columns

Index(['index', 'fid', 'CADT_No', 'PROVINCE', 'LCC_GEN', 'PLAN', 'BAU',
       'ELI_TYPE', 'AREA', 'CADT_Name', 'geometry', 'cadt_num', 'version'],
      dtype='object')

In [69]:
silup_gdf[3].columns

Index(['index', 'fid', 'LCC_GEN', 'F_NF', 'PLAN', 'BAU', 'ELI_TYPE', 'AREA',
       'PROVINCE', 'geometry', 'cadt_num', 'version'],
      dtype='object')

In [71]:
file_list[3]

'CADT 116.geojson'

In [70]:
silup_gdf[3]

Unnamed: 0,index,fid,LCC_GEN,F_NF,PLAN,BAU,ELI_TYPE,AREA,PROVINCE,geometry,cadt_num,version
0,0,1,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,AUD,0.471283,Surigao Del Sur,"MULTIPOLYGON (((593981.461 995571.964, 593980....",116,v0
1,1,2,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,AUD,9.145701,Surigao Del Sur,"MULTIPOLYGON (((595480.841 984704.305, 595480....",116,v0
2,2,3,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,AUD,0.325669,Surigao Del Sur,"MULTIPOLYGON (((599979.938 992172.819, 599980....",116,v0
3,3,4,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,AUD,0.010772,Surigao Del Sur,"MULTIPOLYGON (((599147.657 993128.575, 599147....",116,v0
4,4,5,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,AUD,0.033285,Surigao Del Sur,"MULTIPOLYGON (((603199.248 993129.817, 603196....",116,v0
...,...,...,...,...,...,...,...,...,...,...,...,...
2798,2798,2799,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,NOT ELIGIBLE,0.318219,Surigao Del Sur,"MULTIPOLYGON (((618157.319 988560.219, 618156....",116,v0
2799,2799,2800,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,NOT ELIGIBLE,0.042018,Surigao Del Sur,"MULTIPOLYGON (((618443.535 988672.718, 618442....",116,v0
2800,2800,2801,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,NOT ELIGIBLE,0.206183,Surigao Del Sur,"MULTIPOLYGON (((618235.647 988808.415, 618215....",116,v0
2801,2801,2802,DIPTEROCARP FOREST,FOREST,CONSERVATION,ARTISANAL MINING,NOT ELIGIBLE,0.738461,Surigao Del Sur,"MULTIPOLYGON (((618512.758 988901.141, 618512....",116,v0


In [83]:
combined_gdf = pd.concat(silup_gdf)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [54]:
combined_gdf.columns

Index(['index', 'fid', 'F_NF', 'LCC_GEN', 'PLAN_2', 'BAU_1', 'ELI_TYPE',
       'AREA', 'PROVINCE', 'geometry', 'cadt_num', 'version', 'PLAN_1',
       'CADT_No', 'PLAN_0', 'BAU_0', 'CADT_Name', 'BAU_1Fin', 'Area_ha',
       'LCC_Gen', 'Shape_Area', 'LCC', 'BAU_0PC', 'BAU_1OLD', 'FOR_NFOR'],
      dtype='object')

In [19]:
filename = file_list[1]
number = re.findall(r"CADT (\d+)", filename)[0]
print(number)

153


In [23]:
if "v0" in file_list[2]:
    print("The filename contains 'v0'")
else:
    print("The filename does not contain 'v0'")

The filename contains 'v0'


In [10]:
test = gpd.read_file(SILUP_DIR / "CADT 002.geojson")

In [13]:
test.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1793 entries, 0 to 1792
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   fid       1793 non-null   int64   
 1   LCC_GEN   1793 non-null   object  
 2   F_NF      1793 non-null   object  
 3   ELI_TYPE  1793 non-null   object  
 4   BAU_1     1793 non-null   object  
 5   PLAN_1    1793 non-null   object  
 6   AREA      1793 non-null   float64 
 7   PROVINCE  1793 non-null   object  
 8   geometry  1793 non-null   geometry
dtypes: float64(1), geometry(1), int64(1), object(6)
memory usage: 126.2+ KB


In [36]:
test2 = gpd.read_file(SILUP_DIR / file_list[3])

In [37]:
test2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2803 entries, 0 to 2802
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   fid       2803 non-null   int64   
 1   LCC_GEN   2803 non-null   object  
 2   F_NF      2803 non-null   object  
 3   PLAN_1    2803 non-null   object  
 4   BAU_1Fin  2803 non-null   object  
 5   ELI_TYPE  2803 non-null   object  
 6   AREA      2803 non-null   float64 
 7   PROVINCE  2803 non-null   object  
 8   geometry  2803 non-null   geometry
dtypes: float64(1), geometry(1), int64(1), object(6)
memory usage: 197.2+ KB


In [None]:
test.explore