# Notebook to extract variables out of NHD database


Originally 2021.02.23_ExportImagestoGCS_MR1_Copy.ipynb

# Authentication

In [12]:
import pandas as pd
import numpy as np
import time
import ee
# Start the Earth Engine client session used by the ee.* calls in this notebook.
# NOTE(review): assumes credentials are already stored on this machine
# (e.g. via a prior ee.Authenticate()) — confirm.
ee.Initialize()



# Set up bounding box

In [13]:
def square(lat, lon, size):
  """
  Build the bounding box of a buffered point.

  lat, lon: coordinates of the point, given in EPSG:4326
  size: distance handed to .buffer() (presumably metres — TODO confirm)

  Returns the .bounds() of the buffered point.
  """
  centre = ee.Geometry.Point([lon, lat], proj="EPSG:4326")
  buffered = centre.buffer(size)
  return buffered.bounds()

# NHDPlus

In [14]:
def fc(nhd_folder_num, feature): # fc for feature collection
  """
  Load the GEE asset(s) corresponding to the shapefiles in one subfolder of
  the NHDPlus V2 dataset.

  nhd_folder_num: number indicating which subfolder in the NHDPlus dataset;
                  zero-padded to two digits when building the asset name
  feature: asset name suffix, e.g. "Waterbodies" or "Flowlines"

  Subfolder 3 is stored as separate N/S/W assets and subfolder 10 as separate
  L/U assets; those parts are merged so a single collection is returned.
  """

  # subfolders that were uploaded in several parts, with the suffix of each part
  split_suffixes = {3: ['N', 'S', 'W'], 10: ['L', 'U']}

  # convert '1' into '01' etc
  num = str(nhd_folder_num).zfill(2)

  # start with an empty feature collection; [None] would instead seed the
  # merge with one null-geometry feature
  merged = ee.FeatureCollection([])

  # every other subfolder is a single asset with no extra suffix
  for suffix in split_suffixes.get(nhd_folder_num, ['']):
    merged = merged.merge(ee.FeatureCollection(
        "users/madhukarreddy/NHDPlus" + num + suffix + "_" + feature))
  return merged

In [15]:
def merge_fc(feature):
  """
  Join the shapefiles of all NHDPlus subfolders (1 through 18) across the US
  into one feature collection.

  feature: "Waterbodies" or "Flowlines"
  """
  # [] is an empty collection; [None] would add one null-geometry feature
  merged_fc_ = ee.FeatureCollection([])
  for folder_num in range(1, 19):
    merged_fc_ = merged_fc_.merge(fc(folder_num, feature))
  return merged_fc_

# Build the merged collections once at module level; nhd_vars filters them
# by location on every call.
fc_wb = merge_fc("Waterbodies")
fc_fl = merge_fc("Flowlines")

# NHD parameter retrieval

In [16]:
# waterbodies
# key important properties: COMID, FTYPE, GNIS_NAME, AREASQKM

# How are these combined across the features found?
# FTYPE: join the strings for later one-hot encoding (OHE)
# GNIS_NAME: one-hot encode name present vs absent, then sum over all features
# AREASQKM: sum over all features

# Given a lat, lon, find the above parameters

In [94]:
# return multiple fields
def nhd_vars(feature="Waterbodies", lat=41.638, lon=-122.0048, size=1000):
  """
  For a given lat, lon, return multiple variables aggregated over the
  features found around that point.

  feature: "Waterbodies" queries fc_wb; any other value queries fc_fl
  lat, lon: centre of the search box
  size: passed to square() to size the search box

  Returns the tuple (comid_list, ftype_str, gnis_count, wb_area, fl_length):
    comid_list: COMID of every feature found
    ftype_str: the FTYPE values concatenated, each followed by "+"
    gnis_count: number of features with a non-empty GNIS_ID
    wb_area: sum of AREASQKM for "Waterbodies", otherwise nan
    fl_length: sum of LENGTHKM for "Flowlines", otherwise nan
  When no feature is found, wb_area and fl_length both stay 0.
  On any error the problem is printed and np.nan is returned instead.
  """

  comid_list = []
  ftype_str = ""
  gnis_count = 0
  wb_area = 0
  fl_length = 0

  try:
    source = fc_wb if feature == "Waterbodies" else fc_fl
    nearby = source.filterBounds(square(lat, lon, size))

    # fetch the result once; each getInfo() call is a separate server request
    features = nearby.getInfo().get('features')

    for feat in features:
      props = feat.get('properties')

      comid_list.append(props.get('COMID'))
      ftype_str += props.get('FTYPE') + "+"

      # a missing or empty GNIS_ID counts as "not named" rather than raising
      # and discarding the whole location
      gnis_count += bool(props.get('GNIS_ID'))

      if feature == "Waterbodies":
        wb_area += props.get('AREASQKM')
      else:
        wb_area = np.nan

      if feature == "Flowlines":
        fl_length += props.get('LENGTHKM')
      else:
        fl_length = np.nan

    return comid_list, ftype_str, gnis_count, wb_area, fl_length
  except Exception as e:
    # best effort: report the failing location and let the batch continue
    print(e)
    print("Issue with {0} at lat={1}, lon={2}".format(feature, lat, lon))
    return np.nan

In [50]:
# return multiple fields
def nhd_vars(feature="Waterbodies", lat=41.638, lon=-122.0048, size=1000):
  """
  For a given lat, lon, return multiple GEE variables, one list entry per
  feature found around that point.

  feature: "Waterbodies" queries fc_wb; any other value queries fc_fl
  lat, lon: centre of the search box
  size: passed to square() to size the search box

  Returns the tuple (comid_list, ftype_str, gnis_id, wb_area, fl_length),
  five lists of equal length:
    comid_list: COMID of each feature
    ftype_str: FTYPE of each feature
    gnis_id: GNIS_ID of each feature (None when the property is absent)
    wb_area: AREASQKM of each feature for "Waterbodies", otherwise nan
    fl_length: LENGTHKM of each feature for "Flowlines", otherwise nan
  On any error the problem is printed and np.nan is returned instead.
  """

  comid_list = [] # numerical
  ftype_str = []
  fl_length = []
  wb_area = []
  gnis_id = [] # n-digit number

  try:
    if feature == "Waterbodies":
      nearby = fc_wb.filterBounds(square(lat, lon, size))
    else:
      nearby = fc_fl.filterBounds(square(lat, lon, size))

    # fetch the result once; each getInfo() call is a separate server request
    features = nearby.getInfo().get('features')

    for feat in features:
      props = feat.get('properties')

      comid_list.append(props.get('COMID'))
      ftype_str.append(props.get('FTYPE'))
      # the property key is 'GNIS_ID'; with a trailing colon .get() misses it
      gnis_id.append(props.get('GNIS_ID'))

      wb_area.append(props.get('AREASQKM') if feature == "Waterbodies" else np.nan)
      fl_length.append(props.get('LENGTHKM') if feature == "Flowlines" else np.nan)

    return comid_list, ftype_str, gnis_id, wb_area, fl_length
  except Exception as e:
    # best effort: report the failing location and let the batch continue
    print(e)
    print("Issue with {0} at lat={1}, lon={2}".format(feature, lat, lon))
    return np.nan

In [102]:
# return multiple fields
def nhd_vars(feature="Waterbodies", lat=41.638, lon=-122.0048, size=1000):
  """
  For a given lat, lon, return multiple GEE variables, one list entry per
  feature found around that point.

  feature: "Waterbodies" queries fc_wb; any other value queries fc_fl
  lat, lon: centre of the search box
  size: passed to square() to size the search box

  Returns the tuple (comid_list, ftype_str, gnis_id, wb_area, fl_length),
  five lists of equal length:
    comid_list: COMID of each feature
    ftype_str: FTYPE of each feature
    gnis_id: GNIS_ID of each feature (None when the property is absent)
    wb_area: AREASQKM of each feature for "Waterbodies", otherwise nan
    fl_length: LENGTHKM of each feature for "Flowlines", otherwise nan
  On any error the problem is printed and np.nan is returned instead.
  """

  try:
    collection = fc_wb if feature == "Waterbodies" else fc_fl
    nearby = collection.filterBounds(square(lat, lon, size))

    # a single getInfo() download serves every lookup below
    fc_getInfo = nearby.getInfo().get('features')
    props = [feat.get('properties') for feat in fc_getInfo]

    comid_list = [p.get('COMID') for p in props] # numerical
    # the property key is 'FTYPE'; the misspelt 'FTPE' makes .get() miss it
    ftype_str = [p.get('FTYPE') for p in props]
    gnis_id = [p.get('GNIS_ID') for p in props] # n-digit number

    # area only applies to waterbodies and length only to flowlines;
    # the other list is filled with nan so all lists stay the same length
    wb_area = [p.get('AREASQKM') if feature == "Waterbodies" else np.nan
               for p in props]
    fl_length = [p.get('LENGTHKM') if feature == "Flowlines" else np.nan
                 for p in props]

    return comid_list, ftype_str, gnis_id, wb_area, fl_length
  except Exception as e:
    # best effort: report the failing location and let the batch continue
    print(e)
    print("Issue with {0} at lat={1}, lon={2}".format(feature, lat, lon))
    return np.nan

In [103]:
# read in the NHD addendum file

nhd_stats = pd.read_csv("nhd_stats_AI.csv")

# read in csv file with SSURGO variables; slices of this frame are what
# extract_nhd_variables processes, using its latitude/longitude columns
df_m = pd.read_csv("combined_regular_clean_with_ssurgo_variables.csv")

In [95]:
# def extract_nhd_variables(df_m_, i):

#   df_m_["nhd_vars_wb"] = df_m_.apply(lambda x: nhd_vars(feature="Waterbodies", lat=x.latitude, lon=x.longitude), axis=1)
#   df_m_["nhd_vars_fl"] = df_m_.apply(lambda x: nhd_vars(feature="Flowlines", lat=x.latitude, lon=x.longitude), axis=1)

#   # parse out the COMID's from nhd_vars_wb and nhd_vars_fl columns and find populate them across columns
#   # comid_list, ftype_str, gnis_count, wb_area, fl_length
#   df_m_["wb_comids"] = df_m_.apply(lambda x: " ".join([str(i) for i in x.nhd_vars_wb[0]]), axis=1)
#   df_m_["wb_ftype"] = df_m_.apply(lambda x: x.nhd_vars_wb[1], axis=1)
#   df_m_["wb_gnis_count"] = df_m_.apply(lambda x: x.nhd_vars_wb[2], axis=1)
#   df_m_["wb_area"] = df_m_.apply(lambda x: x.nhd_vars_wb[3], axis=1)

#   df_m_["fl_comids"] = df_m_.apply(lambda x: " ".join([str(i) for i in x.nhd_vars_fl[0]]), axis=1)
#   df_m_["fl_ftype"] = df_m_.apply(lambda x: x.nhd_vars_fl[1], axis=1)
#   df_m_["fl_gnis_count"] = df_m_.apply(lambda x: x.nhd_vars_fl[2], axis=1)
#   df_m_["fl_length"] = df_m_.apply(lambda x: x.nhd_vars_fl[4], axis=1)

#   # read in fl_comid_list, pull out comids and match with nhd_stats to sum 
#   # areasqkm, todasqkm and flow_type

#   df_m_["fl_comid_list"] = df_m_.apply(lambda x: x.nhd_vars_fl[0], axis=1)


#   def fl_areasqkm(comid):
#     return nhd_stats[nhd_stats["comid"] == comid]["areasqkm"]

#   def fl_totdasqkm(comid):
#     return nhd_stats[nhd_stats["comid"] == comid]["totdasqkm"]

#   def fl_flow_type(comid):
#     return nhd_stats[nhd_stats["comid"] == comid]["flow_type"]

#   df_m_["fl_areasqkm"] = df_m_.apply(lambda x: np.sum([fl_areasqkm(comid) for comid in x.fl_comid_list]), axis=1)
#   df_m_["fl_totdasqkm"] = df_m_.apply(lambda x: np.sum([fl_totdasqkm(comid) for comid in x.fl_comid_list]), axis=1)
#   df_m_["fl_flow_type"] = df_m_.apply(lambda x: np.sum([fl_flow_type(comid) for comid in x.fl_comid_list]), axis=1)
  
#   # saving the dataframe 
# #   df_m_.to_csv('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part' + str(i) + '.csv')
#   df_m_.to_pickle('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part' + str(i))

In [104]:
def extract_nhd_variables(df_m_, i):
  """
  Look up the NHD variables for every row of a batch and pickle the result.

  df_m_: DataFrame with latitude and longitude columns; it is modified in
         place by adding the columns nhd_vars_wb and nhd_vars_fl, each
         holding whatever nhd_vars returned for that row
  i: label appended to the output file name to identify the batch

  Writes NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part<i>.
  """
  import os

  out_dir = 'NHD_extracted_vars'
  # create the output folder before the slow lookups, so a missing directory
  # cannot make to_pickle fail after all the work is done
  os.makedirs(out_dir, exist_ok=True)

  df_m_["nhd_vars_wb"] = df_m_.apply(lambda x: nhd_vars(feature="Waterbodies", lat=x.latitude, lon=x.longitude), axis=1)
  df_m_["nhd_vars_fl"] = df_m_.apply(lambda x: nhd_vars(feature="Flowlines", lat=x.latitude, lon=x.longitude), axis=1)

  # pickle the dataframe
  df_m_.to_pickle(out_dir + '/combined_regular_clean_with_ssurgo_nhd_variables_part' + str(i))

In [105]:
# pass in batches of 500
# MADHUKAR: 0 - 5000 
# SHOBHA: 5000 - 10000
# RADHIKA: 10000 - 15000

batch_size = 1 #500
MY_NAME = "MADHUKAR"
# row offset of each person's range; any name not listed starts at row 0
START = {"SHOBHA": 5000, "RADHIKA": 10000}.get(MY_NAME, 0)

for batch in range(10):
  # rows [first_row, last_row) of df_m make up this batch
  first_row = START + batch_size * batch
  last_row = first_row + batch_size

  print("batch {} of 10 started".format(batch + 1))
  batch_df = df_m[first_row:last_row].copy()
  # the output file is labelled with the 1-based index of the batch's first row
  extract_nhd_variables(batch_df, first_row + 1)
  print("batch {} of 10 done".format(batch + 1))


batch 1 of 10 started
name 'fype_str' is not defined
Issue with Waterbodies at lat=42.85821, lon=-76.70773
batch 1 of 10 done
batch 2 of 10 started
name 'fype_str' is not defined
Issue with Waterbodies at lat=43.1523, lon=-75.85523999999998
name 'fype_str' is not defined
Issue with Flowlines at lat=43.1523, lon=-75.85523999999998
batch 2 of 10 done
batch 3 of 10 started


KeyboardInterrupt: 

In [100]:
# read back one of the pickled batches written by extract_nhd_variables
# and list its columns
# df_m_test = pd.read_csv('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part1.csv')
df_m_test = pd.read_pickle('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part2')
df_m_test.columns


Index(['Unnamed: 0', 'Unnamed: 0.1', 'jurisdiction_type', 'da_number',
       'district', 'project_name', 'longitude', 'latitude',
       'date_issued_or_denied', 'rha_determination', 'cwa_determination',
       'rha1', 'rha2', 'cwa1', 'cwa2', 'cwa3', 'cwa4', 'cwa5', 'cwa6', 'cwa7',
       'cwa8', 'cwa9', 'potential_wetland', 'index', 'Index', 'mukey',
       'hydclprs', 'aws025wta', 'drclassdcd', 'nhd_vars_wb', 'nhd_vars_fl'],
      dtype='object')

In [101]:
# print the Waterbodies result of every row; a plain loop avoids building a
# throwaway list of None values just for the side effect
for item in df_m_test.nhd_vars_wb:
  print(item)

([166766871], 'LakePond+', 1, 207.005, nan)


[None]

In [44]:
# inspect the Waterbodies result stored for the first row of the batch
df_m_test.nhd_vars_wb.values[0]

5

In [None]:
# flowlines
# imp properties: COMID, FCODE, FTYPE, GNIS_NAME, LENGTHKM
# total_len = sum of all LENGTHKM
# key parameters in nhd_stats: startflag, intephem, lengthkm (same), gnis_name_ind (OHE), areasqkm, totdasqkm, flow_type
# final key params:
# COMID, FTYPE, gnis_name_ind (0,1), LENGTHKM, areasqkm, totdasqkm, flow_type (1,0)

# How do you combine these?: 
# FTYPE: join the strings for later usage (OHE)
# gnis_name_ind: sum of all
# LENGTHKM: sum of all 
# areasqkm: sum of all
# totdasqkm: sum of all
# flow_type: sum of all?
# 