In [22]:
# Standard Imports
import sys
import pandas as pd

# Google Cloud Imports
import pandas_gbq

In [23]:
# Util imports
# Make the project's `src` package importable. "../../" is a relative entry, so
# it points two levels up from the kernel's working directory.
# NOTE(review): presumably that is the repository root — confirm.
sys.path.append("../../")  # must run before the `src` import below
from src.settings import CARBON_POOLS_OUTDIR, GCP_PROJ_ID, TMP_OUT_DIR

# Local CSV copy of the plot_info table (read when present, written otherwise).
PLOT_INFO_CSV = CARBON_POOLS_OUTDIR / "plot_info.csv"
# BigQuery dataset that the plot_info table is queried from.
SRC_DATASET_ID = "biomass_inventory"

In [24]:
# Load the plot_info table, preferring the local CSV cache over a BigQuery
# round trip. On a cache miss the table is downloaded and then cached.
if PLOT_INFO_CSV.exists():
    plot_info = pd.read_csv(PLOT_INFO_CSV)
else:
    query = f"""
    SELECT
        * 
    FROM {GCP_PROJ_ID}.{SRC_DATASET_ID}.plot_info"""

    # Read the BigQuery table into a dataframe
    plot_info = pandas_gbq.read_gbq(query, project_id=GCP_PROJ_ID)

    # to_csv does not create missing directories, so make sure the cache
    # folder exists before writing (e.g. on a fresh checkout).
    PLOT_INFO_CSV.parent.mkdir(parents=True, exist_ok=True)
    plot_info.to_csv(PLOT_INFO_CSV, index=False)

In [25]:
# Distinct values present in the plot_type column.
plot_info["plot_type"].unique()

array(['primary', 'backup'], dtype=object)

In [26]:
# unique_id values that appear on more than one row of plot_info.
plot_info.loc[plot_info["unique_id"].duplicated(), "unique_id"].unique()

array(['66D1', '293A1', '330C1', '366A1', '71B1', '334A1', '303B2',
       '343B1', '224C1', '224A1', '224D1', '224B1', '17D2', '198A1',
       '266D1'], dtype=object)

In [27]:
# Every row whose unique_id is shared with at least one other row
# (keep=False flags all copies, not just the repeats), sorted so that the
# copies of each id sit next to each other.
duplicates = plot_info[
    plot_info.duplicated(subset="unique_id", keep=False)
].sort_values("unique_id")

In [37]:
# Number of distinct sub_plot_shift values per duplicated unique_id, ascending.
# nunique skips missing values, so a count above 1 means the copies of that id
# record different shifts.
(
    duplicates.groupby(by="unique_id")["sub_plot_shift"]
    .nunique()
    .sort_values()
)

unique_id
17D2     1
198A1    1
224A1    1
224C1    1
224D1    1
266D1    1
293A1    1
330C1    1
343B1    1
366A1    1
224B1    2
303B2    2
334A1    2
66D1     2
71B1     2
Name: sub_plot_shift, dtype: int64

In [38]:
# Show the duplicated rows for manual inspection.
# NOTE(review): the rendered table includes the data_recorder column — confirm
# the output is fine to share before committing the notebook.
duplicates

Unnamed: 0,unique_id,data_recorder,team_no,plot_code_nmbr,plot_type,sub_plot,yes_no,sub_plot_shift,GPS_waypt,GPS_id,...,disturbance_type,disturbance_class,slope,canopy_avg_height,canopy_cover,access_reason,slope_radians,corrected_plot_area_n2_m2,corrected_plot_area_n3_m2,corrected_plot_area_n4_m2
397,17D2,Rodney,5,17,backup,sub_plotD,yes,no_shift,10.0,5.0,...,,,23.0,29.0,3.0,,0.226068,82.694573,744.251154,1323.113162
394,17D2,Rodney,5,17,backup,sub_plotD,yes,no_shift,99.0,5.0,...,dist_hum,sawmills,9.0,30.0,3.0,,0.089758,79.175989,712.5839,1266.815822
391,198A1,Christmel,4,198,primary,sub_plotA,yes,no_shift,167.0,4.0,...,,,15.0,10.0,3.0,,0.14889,80.306962,722.76266,1284.911395
475,198A1,Sham,2,198,primary,sub_plotA,no,,,,...,,,,,,slope,,,,
228,224A1,Steve,1,224,primary,sub_plotA,yes,no_shift,91.0,1.0,...,,,34.0,15.0,3.0,,0.327739,87.619019,788.571172,1401.904306
221,224A1,Steve,1,224,primary,sub_plotA,yes,no_shift,89.0,1.0,...,dist_hum,local_cutting,10.0,12.0,4.0,,0.099669,79.325215,713.926931,1269.203432
230,224B1,Steve,1,224,primary,sub_plotB,yes,east,95.0,1.0,...,,,4.0,4.0,3.0,,0.039979,78.66548,707.98932,1258.647681
220,224B1,Steve,1,224,primary,sub_plotB,yes,no_shift,87.0,1.0,...,dist_hum,sawmills,36.0,10.0,2.0,,0.345556,88.718577,798.467189,1419.497225
219,224C1,Steve,1,224,primary,sub_plotC,yes,no_shift,86.0,1.0,...,,,45.0,7.0,2.0,,0.422854,94.444129,849.997162,1511.106066
227,224C1,Steve,1,224,primary,sub_plotC,yes,no_shift,90.0,1.0,...,,,3.0,4.0,2.0,,0.029991,78.610502,707.49452,1257.768035


In [29]:
# Save the duplicated rows for follow-up. The index is written as well (the
# pandas default), so each row keeps the label it had in plot_info.
duplicates.to_csv(path_or_buf=TMP_OUT_DIR / "duplicate_plot_ids.csv", index=True)