# Process BasinATLAS Attributes

### Prepare Workspace

In [1]:
# Import system libraries
import os
import sys

# Import data manipulation libraries
import pandas as pd
import numpy as np

# Import geospatial libraries
from shapely.geometry import Polygon, mapping
import geopandas as gpd

# Switch to the project checkout; the data files read and written further
# down are addressed with paths relative to this directory.
project_root = '/Users/jessicarapson/Documents/GitHub/water-supply-forecast'
os.chdir(project_root)

### Extract Attributes

In [105]:
# Load in site geospatial data
gdf_sites = gpd.read_file('assets/data/geospatial.gpkg')

# Load in attribute geospatial data (this takes a while)
gdf_basins = gpd.read_file('assets/data/hydrobasins/hydroBASINS.gpkg')

# Represent every basin by its centroid, so that a basin is attributed to a
# site only when its centre point falls inside the site geometry.
# NOTE(review): centroids computed in a geographic (lat/lon) CRS are only
# approximate; if that precision matters, project to a planar CRS first.
basin_centroids = gdf_basins.copy()
basin_centroids['geometry'] = basin_centroids['geometry'].centroid

# The spatial join compares raw coordinates, so both layers must share a CRS.
# Only reproject when both CRSs are known and actually differ.
if (gdf_sites.crs is not None and basin_centroids.crs is not None
        and gdf_sites.crs != basin_centroids.crs):
    basin_centroids = basin_centroids.to_crs(gdf_sites.crs)

# Attach to each site the basins whose centroid it contains. how='left' keeps
# sites that contain no centroid (their basin attributes come back as NaN).
# `predicate` replaces the `op` keyword, which GeoPandas deprecated and
# subsequently removed.
merged_basins = gpd.sjoin(gdf_sites, basin_centroids, how='left',
                          predicate='contains')

# Select columns of interest
# The three lists below decide how each basin attribute is collapsed to one
# value per site: mean, sum, or mode. List order is also the column order of
# the per-site summary tables built from them.

# Attributes averaged (mean) over the basins joined to a site
cols_avg = ['inu_pc_smn','inu_pc_smx','inu_pc_slt','inu_pc_umn','inu_pc_umx',
            'inu_pc_ult','lka_pc_sse','lka_pc_use','dor_pc_pva','slp_dg_sav',
            'slp_dg_uav','sgr_dk_sav','tmp_dc_uyr','ari_ix_sav','ari_ix_uav',
            'cmi_ix_uyr','snw_pc_uyr','glc_pc_s01','glc_pc_s02','glc_pc_s03',
            'glc_pc_s04','glc_pc_s05','glc_pc_s06','glc_pc_s07','glc_pc_s08',
            'glc_pc_s09','glc_pc_s10','glc_pc_s11','glc_pc_s12','glc_pc_s13',
            'glc_pc_s14','glc_pc_s15','glc_pc_s16','glc_pc_s17','glc_pc_s18',
            'glc_pc_s19','glc_pc_s20','glc_pc_s21','glc_pc_s22','glc_pc_u01',
            'glc_pc_u02','glc_pc_u03','glc_pc_u04','glc_pc_u05','glc_pc_u06',
            'glc_pc_u07','glc_pc_u08','glc_pc_u09','glc_pc_u10','glc_pc_u11',
            'glc_pc_u12','glc_pc_u13','glc_pc_u14','glc_pc_u15','glc_pc_u16',
            'glc_pc_u17','glc_pc_u18','glc_pc_u19','glc_pc_u20','glc_pc_u21',
            'glc_pc_u22','wet_pc_sg1','wet_pc_sg2','wet_pc_ug1','wet_pc_ug2',
            'for_pc_sse','for_pc_use','crp_pc_sse','crp_pc_use','pst_pc_sse',
            'pst_pc_use','ire_pc_sse','ire_pc_use','gla_pc_sse','gla_pc_use',
            'prm_pc_sse','prm_pc_use','pac_pc_sse','pac_pc_use','cly_pc_sav',
            'cly_pc_uav','slt_pc_sav','slt_pc_uav','snd_pc_sav','snd_pc_uav',
            'soc_th_sav','soc_th_uav','swc_pc_syr','swc_pc_uyr','swc_pc_s01',
            'swc_pc_s02','swc_pc_s03','swc_pc_s04','swc_pc_s05','swc_pc_s06',
            'swc_pc_s07','swc_pc_s08','swc_pc_s09','swc_pc_s10','swc_pc_s11',
            'swc_pc_s12','kar_pc_sse','kar_pc_use','ero_kh_sav','ero_kh_uav',
            'ppd_pk_sav','ppd_pk_uav','urb_pc_sse','urb_pc_use','nli_ix_sav',
            'nli_ix_uav','rdd_mk_sav','rdd_mk_uav','hft_ix_s93','hft_ix_u93',
            'hft_ix_s09','hft_ix_u09']
# Attributes totalled (sum) over the basins joined to a site
# NOTE(review): several names here carry what look like depth units (_mm_,
# _cm_) or upstream-aggregate suffixes; confirm that adding them across
# basins is the intended aggregation rather than averaging.
cols_sum = ['dis_m3_pyr','dis_m3_pmn','dis_m3_pmx','run_mm_syr','lkv_mc_usu',
            'rev_mc_usu','ria_ha_ssu','ria_ha_usu','riv_tc_ssu','riv_tc_usu',
            'gwt_cm_sav','pre_mm_uyr','pet_mm_syr','pet_mm_s01', 'pet_mm_s02',
            'pet_mm_s03','pet_mm_s04','pet_mm_s05','pet_mm_s06','pet_mm_s07',
            'pet_mm_s08','pet_mm_s09','pet_mm_s10','pet_mm_s11','pet_mm_s12',
            'pet_mm_uyr','aet_mm_syr','aet_mm_s01','aet_mm_s02','aet_mm_s03',
            'aet_mm_s04','aet_mm_s05','aet_mm_s06','aet_mm_s07','aet_mm_s08',
            'aet_mm_s09','aet_mm_s10','aet_mm_s11','aet_mm_s12','aet_mm_uyr',
            'pop_ct_ssu','pop_ct_usu']
# Attributes summarised by their most frequent value (mode) per site; the
# value -9999 is replaced with NaN in these columns before the mode is taken
# (presumably class-coded columns, given the `_cl_` naming; verify).
cols_cat = ['clz_cl_smj','cls_cl_smj','glc_cl_smj','pnv_cl_smj','wet_cl_smj',
            'tbi_cl_smj','tec_cl_smj','fmh_cl_smj','fec_cl_smj','lit_cl_smj']

# Helper that picks the most frequent value of a series
def get_mode(series):
    """Return the most frequent value in ``series``, or ``None`` if there is none.

    ``series.mode()`` returns every value tied for most frequent; the first
    entry of that result is used. When the result is empty (for example the
    series is empty or holds only missing values) ``None`` is returned.
    """
    modes = series.mode()
    if len(modes) == 0:
        return None
    return modes.iloc[0]

# Calculate summary statistics over basins
# Each aggregation groups the joined table on 'site_id' and yields one row per
# site; reset_index() turns 'site_id' back into a regular column for merging.
avg_basins = (merged_basins[['site_id'] + cols_avg]
              .groupby('site_id').mean().reset_index())

# min_count=1 keeps the total as NaN when a site has no valid values at all
# (e.g. a site that matched no basin in the left join); the default would
# report a misleading 0 for such sites.
sum_basins = (merged_basins[['site_id'] + cols_sum]
              .groupby('site_id').sum(min_count=1).reset_index())

# -9999 is mapped to NaN first so that it can never be chosen as the mode.
# NOTE(review): the averaged and summed columns are not screened for sentinel
# values; confirm none are present in those columns.
cat_basins = (merged_basins[['site_id'] + cols_cat]
              .replace(-9999, np.nan)
              .groupby('site_id').agg(get_mode).reset_index())

# Merge the three per-site summaries into one table and save it
result = (avg_basins
          .merge(sum_basins, on='site_id', how='left')
          .merge(cat_basins, on='site_id', how='left'))
result.to_csv('assets/data/hydrobasins/hydrobasins_summary.csv', index=False)


  basin_centroids['geometry'] = basin_centroids['geometry'].centroid
  if await self.run_code(code, result, async_=asy):
