In [1]:
# Standard Imports
import sys
import pandas as pd
import geopandas as gpd

# Google Cloud Imports

In [2]:
# Util imports
sys.path.append("../../../")  # include parent directory
from src.settings import PARQUET_DATA_DIR, DATA_DIR, SRC_DIR

In [3]:
# Variables
# Local folder that receives the downloaded pre-strata parquet files
STRATA_DIR = PARQUET_DATA_DIR / "pre-strata"

# GCS Variables
# Bucket prefix the pre-strata parquet files are copied from
STRATA_GCS_DIR = "gs://00_extract_vectors/"

# BigQuery Variables
# NOTE(review): none of the three names below is referenced in the visible cells — confirm they are still needed
SRC_DATASET_ID = "biomass_inventory"
DATASET_ID = "carbon_stock"
IF_EXISTS = "replace"

## Download data

In [4]:
# Create the local download folder. parents=True also creates PARQUET_DATA_DIR
# when it does not exist yet, so the cell works on a fresh checkout.
STRATA_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
# Copy every pre-strata parquet part from the bucket into STRATA_DIR (-m runs the copy in parallel)
!gsutil -m cp $STRATA_GCS_DIR"pre-strata*.parquet.gzip" $STRATA_DIR

If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

Copying gs://00_extract_vectors/pre-strata_1.parquet.gzip...
Copying gs://00_extract_vectors/pre-strata_0.parquet.gzip...
| [2/2 files][293.7 MiB/293.7 MiB] 100% Done                                    
Operation completed over 2 objects/293.7 MiB.                                    


In [6]:
import pathlib

# Collect only the downloaded parquet parts: a bare "*" glob would also pick up
# stray files (e.g. .DS_Store or partial downloads) and break pd.read_parquet.
# Sorting makes the concatenation order reproducible across runs.
file_list = sorted(pathlib.Path(STRATA_DIR).glob("pre-strata*.parquet.gzip"))

In [7]:
# Load each downloaded parquet part into its own dataframe
dfs = list(map(pd.read_parquet, file_list))

# Stack the parts into a single table (the row labels of each part are kept)
combined_df = pd.concat(dfs)

In [8]:
# Parse the "geometry" column from WKT text into geometry objects
combined_df["geometry"] = gpd.GeoSeries.from_wkt(combined_df["geometry"])

In [9]:
# Promote the table to a GeoDataFrame and declare its coordinates as EPSG:4326
combined_df = gpd.GeoDataFrame(combined_df, geometry="geometry", crs="EPSG:4326")

In [10]:
# List the distinct strata labels present in the polygons
combined_df["strata"].unique()

array(['pre_strata_6', 'pre_strata_1', 'pre_strata_2', 'pre_strata_5',
       'pre_strata_3', 'pre_strata_4'], dtype=object)

In [81]:
# Load the primary plot points shapefile and preview its first two rows
primary_plots = gpd.read_file(
    DATA_DIR / "shp" / "subplot_map" / "ALL-recoded_plots_primary.shp"
)
primary_plots.head(2)

Unnamed: 0,ID,Plot_No,Strata_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_type,Letter_CD,Plot_uniqu,Plot_ID,geometry
0,101019,1,Strata_1,Dipterocarp Forest Closed,SURIGAO DEL SUR,LANUZA,Pakwan,Primary_point,A,001A1_SDS_8314_23_1,001A1,POINT (825367.557 1012520.782)
1,101020,2,Strata_1,Dipterocarp Forest Closed,AGUSAN DEL SUR,LA PAZ,Lydia,Primary_point,A,002A1_ADS_8508_23_1,002A1,POINT (773490.067 919489.258)


In [82]:
# Rebuild Plot_ID as <Plot_No><Letter_CD>1, i.e. without the zero padding
# seen in the shapefile's own Plot_ID values (e.g. "001A1")
primary_plots["Plot_ID"] = (
    primary_plots["Plot_No"].astype(str).str.cat(primary_plots["Letter_CD"]) + "1"
)

In [83]:
# Show every row whose rebuilt Plot_ID occurs more than once (keep=False lists all copies)
primary_plots[primary_plots.duplicated(subset="Plot_ID", keep=False)]

Unnamed: 0,ID,Plot_No,Strata_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_type,Letter_CD,Plot_uniqu,Plot_ID,geometry


In [72]:
# Disabled filter: would keep only the rows typed "Primary_point"
# primary_plots = primary_plots.loc[primary_plots.Plot_type == "Primary_point"].copy()

In [70]:
# List the distinct Plot_type values in the primary file
primary_plots.Plot_type.unique()

array(['Primary_point', 'Subplot'], dtype=object)

In [84]:
# Load the backup plot points shapefile and preview its first two rows
backup_plots = gpd.read_file(
    DATA_DIR / "shp" / "subplot_map" / "ALL-recoded_plots_backup.shp"
)
backup_plots.head(2)

Unnamed: 0,ID,Plot_type,Plot_No,Strata_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_uniqu,Plot_ID,geometry
0,99687,Subplot,1,Strata_1,Dipterocarp Forest Closed,SURIGAO DEL SUR,LANUZA,Pakwan,001B2_SDS_8314_23_1,001B2,POINT (825122.697 1012156.979)
1,99688,Subplot,1,Strata_1,Dipterocarp Forest Closed,SURIGAO DEL SUR,LANUZA,Pakwan,001C2_SDS_8314_23_1,001C2,POINT (825302.023 1012006.011)


In [85]:
# Strip leading zeros so backup IDs use the same unpadded form as the rebuilt primary IDs
backup_plots["Plot_ID"] = backup_plots["Plot_ID"].str.lstrip("0")

In [88]:
# Show every backup row whose Plot_ID occurs more than once (keep=False lists all copies)
backup_plots[backup_plots.duplicated(subset="Plot_ID", keep=False)]

Unnamed: 0,ID,Plot_type,Plot_No,Strata_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_uniqu,Plot_ID,geometry
886,100573,Subplot,296,Strata_2,Dipterocarp Forest Closed,SURIGAO DEL SUR,LIANGA,Poblacion,296C2_SDS_8307_23_1,296C2,POINT (838282.061 956050.902)
887,100574,Subplot,296,Strata_2,Dipterocarp Forest Closed,SURIGAO DEL SUR,LIANGA,Poblacion,296C2_SDS_8307_23_1,296C2,POINT (838112.314 955785.591)


In [87]:
# Manual correction: force the Plot_ID of the record with ID 100988 to "425D2"
# NOTE(review): the reason for this override is not recorded in the notebook — confirm against the source data
backup_plots.loc[backup_plots.ID == 100988, "Plot_ID"] = "425D2"

In [74]:
# Keep only the backup rows typed "Subplot" (.copy() gives an independent frame)
# NOTE(review): this cell's execution count (74) predates the reload of backup_plots (84),
# and later outputs still show "Primary" rows — confirm whether this filter is meant to run
backup_plots = backup_plots.loc[backup_plots.Plot_type == "Subplot"].copy()

array(['Subplot', 'Primary'], dtype=object)

In [90]:
# Inspect the backup rows whose Plot_ID ends in "1"
subset = backup_plots[backup_plots["Plot_ID"].str.endswith("1")]
subset

Unnamed: 0,ID,Plot_type,Plot_No,Strata_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_uniqu,Plot_ID,geometry
1143,100830,Subplot,382,Strata_2,Dipterocarp Forest Closed,SURIGAO DEL SUR,MADRID,Bayogo,382B1_SDS_8316_23_1,382B1,POINT (815930.146 1024634.342)
1303,100990,Subplot,401,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,Caimpugan,401C1_ADS_8501_23_1,401C1,POINT (819520.527 929747.432)
1304,100991,Subplot,401,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,Caimpugan,401D1_ADS_8501_23_1,401D1,POINT (819275.332 929867.562)
1305,100992,Subplot,402,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,New Visayas,402B1_ADS_8501_23_1,402B1,POINT (819264.266 932487.699)
1306,100993,Subplot,402,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,New Visayas,402C1_ADS_8501_23_1,402C1,POINT (819219.675 932648.254)
1307,100994,Subplot,402,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,New Visayas,402D1_ADS_8501_23_1,402D1,POINT (818913.314 932440.727)
1755,120387,Primary,402,Strata_6,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,New Visayas,402A1_ADS_8501_23_1,402A1,POINT (819065.056 932539.434)
1756,120388,Primary,403,Strata_6,Swamp forest closed,AGUSAN DEL SUR,ROSARIO,Wasi-an,403A1_ADS_8504_23_1,403A1,POINT (825435.501 921261.078)


In [91]:
# Discard the backup rows whose Plot_ID ends in "1"
backup_plots = backup_plots.loc[
    lambda frame: ~frame["Plot_ID"].str.endswith("1")
]

In [59]:
# Interactive map of the backup points for plot number 296 (the plot with the duplicated ID 296C2)
backup_plots.loc[backup_plots.Plot_No == 296].explore()

In [107]:
# Stack primary and backup points into one table with a fresh 0..n-1 index
plots = pd.concat([primary_plots, backup_plots], ignore_index=True)

In [108]:
# Discard the stratum recorded in the shapefiles; a "strata" column is attached
# again by the spatial join against the strata polygons
plots = plots.drop(columns="Strata_No")

In [109]:
# Reproject the points to EPSG:4326, the CRS declared for the strata polygons
plots = plots.to_crs("EPSG:4326")

In [110]:
# Attach the stratum of every polygon each point intersects. how="left" keeps
# points that match no polygon (their strata is NaN), and a point that
# intersects several polygons yields one row per match.
plots = gpd.sjoin(
    plots,
    combined_df[["strata", "geometry"]],
    how="left",
    predicate="intersects",
)

In [111]:
# Normalise IDs (no leading zeros) and keep only the numeric part of the
# stratum label, e.g. "pre_strata_6" -> "6"
plots["Plot_ID"] = plots["Plot_ID"].str.lstrip("0")
plots["strata"] = plots["strata"].str.extract(r"(\d+)", expand=False)

In [112]:
# Display the joined table
plots

Unnamed: 0,ID,Plot_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_type,Letter_CD,Plot_uniqu,Plot_ID,geometry,index_right,strata
0,101019,1,Dipterocarp Forest Closed,SURIGAO DEL SUR,LANUZA,Pakwan,Primary_point,A,001A1_SDS_8314_23_1,1A1,POINT (125.96015 9.14772),180013.0,2
1,101020,2,Dipterocarp Forest Closed,AGUSAN DEL SUR,LA PAZ,Lydia,Primary_point,A,002A1_ADS_8508_23_1,2A1,POINT (125.48295 8.31057),478223.0,2
2,101021,3,Dipterocarp Forest Closed,SURIGAO DEL SUR,CARMEN,Hinapoyan,Primary_point,A,003A1_SDS_8315_23_1,3A1,POINT (125.89403 9.15614),180013.0,2
3,101022,4,Dipterocarp Forest Closed,SURIGAO DEL SUR,LIANGA,Manyayay,Primary_point,A,004A1_SDS_8307_23_1,4A1,POINT (126.13197 8.71834),180013.0,2
4,101023,5,Dipterocarp Forest Closed,SURIGAO DEL SUR,LIANGA,Ganayon,Primary_point,A,005A1_SDS_8307_23_1,5A1,POINT (126.08604 8.72452),180013.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509,120389,404,Swamp forest closed,AGUSAN DEL SUR,SAN FRANCISCO,New Visayas,Primary,,404A2_ADS_8501_23_1,404A2,POINT (125.88713 8.43701),45.0,6
3510,120390,405,Swamp forest closed,AGUSAN DEL SUR,LA PAZ,Villa Paz,Primary,,405A2_ADS_8508_23_1,405A2,POINT (125.83924 8.24765),13.0,6
3510,120390,405,Swamp forest closed,AGUSAN DEL SUR,LA PAZ,Villa Paz,Primary,,405A2_ADS_8508_23_1,405A2,POINT (125.83924 8.24765),287316.0,6
3511,120391,406,Swamp forest closed,AGUSAN DEL SUR,TALACOGON,Zillovia,Primary,,406A2_ADS_8510_23_1,406A2,POINT (125.75887 8.41506),151970.0,6


In [113]:
# Switch to the column names used by the lookup table
plots = plots.rename(columns={"Plot_ID": "unique_id", "strata": "Strata"})
# plots.to_csv(SRC_DIR / "lookup" / "pc_plot_lookup_20240802.csv", index=False)

In [126]:
# Manual override: assign stratum 1 to the record with ID 118481.
# The value is written as the string "1" because the Strata column holds the
# digit strings produced by str.extract; an int here would leave the column
# with mixed types, so comparisons against "1" would not match this row.
# NOTE(review): the reason for this override is not recorded in the notebook.
plots.loc[plots.ID == 118481, "Strata"] = "1"

In [130]:
# Remove the rows that have no Strata value
plots = plots.dropna(subset=["Strata"])

In [133]:
# Row and column count after removing rows without a Strata value
plots.shape

(2910, 13)

In [134]:
# Keep only the two lookup columns; every other column, including geometry, is dropped
plots = plots[["unique_id", "Strata"]]

In [135]:
# Write the plot -> stratum lookup table.
# The spatial join can return the same plot several times with the same stratum,
# and in file order the de-duplication cell sits after this export, so the
# duplicates are removed here to keep the CSV at one row per (Strata, unique_id)
# on a top-to-bottom run.
plots.drop_duplicates(subset=["Strata", "unique_id"]).to_csv(
    SRC_DIR / "lookup" / "pc_plot_lookup_20240802.csv", index=False
)

In [103]:
# Row and column count of the lookup table
plots.shape

(4142, 2)

In [104]:
# Number of distinct plot IDs, for comparison with the row count
plots.unique_id.nunique()

3511

In [131]:
# Collect every row whose unique_id occurs more than once (keep=False lists all copies)
subset = plots[plots.duplicated(subset="unique_id", keep=False)]

In [132]:
# Display the rows with a repeated unique_id
subset

Unnamed: 0,ID,Plot_No,LCC,Pro_Name,Mun_Name,Bgy_Name,Plot_type,Letter_CD,Plot_uniqu,unique_id,geometry,index_right,Strata


In [124]:
# Save the rows whose unique_id is repeated to a CSV under DATA_DIR / "tmp"
plots[plots.unique_id.isin(subset.unique_id)].to_csv(
    DATA_DIR / "tmp" / "duplicate_plots_new_strata.csv", index=False
)

In [127]:
# Collapse rows that repeat the same (Strata, unique_id) pair, keeping the first
# NOTE(review): in file order this cell comes after the lookup CSV export cell
plots = plots.drop_duplicates(subset=["Strata", "unique_id"])

In [42]:
# Save the strata polygons to DATA_DIR / "gpkg" / "strata.gpkg"
combined_df.to_file(DATA_DIR / "gpkg" / "strata.gpkg")