# Notebook to extract variables out of NHD database


In [19]:
#ee
PATCH_SIZE = 500

Originally 2021.02.23_ExportImagestoGCS_MR1_Copy.ipynb

# Authentications

In [20]:
import pandas as pd
import numpy as np
import time
import ee
import pickle
ee.Initialize()

# Set up bounding box

In [21]:
def square(lat, lon, size):
  crs_proj = "EPSG:4326"  
  return ee.Geometry.Point([lon, lat], proj=crs_proj).buffer(size).bounds()

# NHDPlus

In [22]:
def fc(nhd_folder_num, feature): # fc for feature collection
  """

  Function calls the individual GEE asset correspondiing to shapefiles in 
  subfolder of NHDPlus V2 dataset

  nhd_num: number indicating which subfolder in the NHDPlus dataset
  """

  # start with empty feature collection
  merged = ee.FeatureCollection([None])

  # convert '1' into '01' etc
  if nhd_folder_num < 10:
    num = "0" + str(nhd_folder_num)
  else:
    num = str(nhd_folder_num)

  # add suffix corresponding to how the subfolders were named
  if nhd_folder_num == 3:
    for direction in ['N', 'S', 'W']:
      merged = merged.merge((ee.FeatureCollection("users/madhukarreddy/NHDPlus" 
                                                  + num + direction 
                                                  + "_" + feature)))
  elif nhd_folder_num == 10:
    for direction in ['L', 'U']:
      merged = merged.merge((ee.FeatureCollection("users/madhukarreddy/NHDPlus" 
                                                  + num + direction 
                                                  + "_" + feature)))
  else:
    merged = merged.merge((ee.FeatureCollection("users/madhukarreddy/NHDPlus" 
                                                + num + "_" 
                                                + feature)))
  return merged

In [23]:
def merge_fc(feature):
  """

  Joins all the shapefiles across the US

  feature: "Waterbodies" or "Flowlines"
  """
  merged_fc_ = ee.FeatureCollection([None])
  for i in range(1,19):
    merged_fc_ = merged_fc_.merge(fc(i, feature))
  return merged_fc_

fc_wb = merge_fc("Waterbodies")
fc_fl = merge_fc("Flowlines")

# NHD parameter retrieval

In [24]:
# waterbodies
# key imp properties: COMID, FTYPE, GNIS_NAME, AREASQKM

# How do you combine these?:
# FTYPE: join the strings for later usage (OHE)
# GNIS_NAME: sum all of the OHE name present vs absent
# AREASQKM: sum of all

# Given a lat, lon, find the above parameters

In [25]:
# return multiple fields
def nhd_vars(feature="Waterbodies", lat=41.638, lon=-122.0048, size=1000):
  """

  For a given lat, lon, return multiple GEE variables
  """
  
  comid_list = [] # n-digit number
  ftype_str = [] # str
  fl_length = [] # float
  wb_area = [] # float
  gnis_id = [] # n-digit number
  fcode = [] # n-digit number



  try:
    if feature == "Waterbodies":
      fc = fc_wb.filterBounds(square(lat, lon, size))
    else:
      fc = fc_fl.filterBounds(square(lat, lon, size))
      
    
    fc_getInfo = fc.getInfo().get('features')
    num_of_features = len(fc_getInfo)
    print("number of features =", num_of_features)
    
    for feat in range(num_of_features):
      comid_list.append(fc_getInfo[feat].get('properties').get('COMID'))

      ftype_str.append(fc_getInfo[feat].get('properties').get('FTYPE'))
      
      gnis_id.append(fc_getInfo[feat].get('properties').get('GNIS_ID'))
      
      if feature == "Waterbodies":
        wb_area.append(fc_getInfo[feat].get('properties').get('AREASQKM'))
      else:
        wb_area.append(np.nan)
      
      if feature == "Flowlines":
        fl_length.append(fc_getInfo[feat].get('properties').get('LENGTHKM'))
      else:
        fl_length.append(np.nan)
        
      fcode.append(fc_getInfo[feat].get('properties').get('FCODE'))

    return comid_list, ftype_str, gnis_id, wb_area, fl_length, fcode
  except Exception as e:
    print(e)
    print("Issue with {0} at lat={1}, lon={2}".format(feature, lat, lon))
    return np.nan

In [26]:
# read in the nhd addendum file

nhd_stats = pd.read_csv("nhd_stats_AI.csv")

# read in csv file with SSURGO variables
# df_m = pd.read_csv("combined_regular_clean_with_ssurgo_variables.csv")
df_m = pd.read_pickle("cwr_nwpr_dataset")

In [27]:
def extract_nhd_variables(df_m_, i, size):

  df_m_["nhd_vars_wb"] = df_m_.apply(lambda x: nhd_vars(feature="Waterbodies", lat=x.latitude, lon=x.longitude, size=size), axis=1)
  df_m_["nhd_vars_fl"] = df_m_.apply(lambda x: nhd_vars(feature="Flowlines", lat=x.latitude, lon=x.longitude, size=size), axis=1)

  # pickle the dataframe 
#   df_m_.to_pickle('NHD_extracted_vars_2.5kmX2.5km_with_fcode_ftype/2.5kmX2.5km_nhd_variables_part' + str(i))
  pickle.dump(df_m_, open("NHD_extracted_vars_cwr_nwpr_" + 
                          str(2 * PATCH_SIZE) + "mX" + 
                          str(2 * PATCH_SIZE) + "m_with_fcode_ftype/" + 
                          str(2 * PATCH_SIZE) + "mX" + str(2 * PATCH_SIZE) + 
                          "m_nhd_variables_part" + str(i),"wb"), 
              protocol=3)        

In [28]:
from datetime import datetime
print(datetime.now())

2021-04-02 22:20:11.770805


In [29]:
# pass in batches of 500
# MADHUKAR: 0 - 5000 
# SHOBHA: 5000 - 10000
# RADHIKA: 10000 - 15000

# PATCH_SIZE = 500
batch_size = 50
MY_NAME = "MADHUKAR"
START = 0 + 5000 * (MY_NAME == "SHOBHA") + 10000 * (MY_NAME == "RADHIKA")
# batch = 64
# for batch in range(10, df_m.shape[0]//50):
for batch in [64]:
  print("batch {} of 10 started".format(batch + 1))
  batch_df = df_m[START + batch_size * batch : START + batch_size * (batch + 1)].copy()
  extract_nhd_variables(batch_df, (START + batch_size * batch) + 1, PATCH_SIZE)
  print("batch {} of 10 done".format(batch + 1))


batch 65 of 10 started
number of features = 0
number of features = 1
number of features = 0
number of features = 1
number of features = 0
number of features = 0
number of features = 1
number of features = 0
number of features = 0
number of features = 0
number of features = 0
number of features = 0
number of features = 0
number of features = 2
number of features = 0
number of features = 1
number of features = 0
number of features = 0
number of features = 3
number of features = 1
number of features = 1
number of features = 0
number of features = 1
number of features = 3
number of features = 1
number of features = 1
number of features = 2
number of features = 1
number of features = 1
number of features = 1
number of features = 1
number of features = 3
number of features = 0
number of features = 1
number of features = 1
number of features = 0
batch 65 of 10 done


In [12]:
# # pass in batches of 500
# # MADHUKAR: 0 - 5000 
# # SHOBHA: 5000 - 10000
# # RADHIKA: 10000 - 15000

# batch_size = 500
# MY_NAME = "SHOBHA"
# START = 0 + 5000 * (MY_NAME == "SHOBHA") + 10000 * (MY_NAME == "RADHIKA")

# for batch in range(10):
#   print("batch {} of 10 started".format(batch + 1))
#   batch_df = df_m[START + batch_size * batch : START + batch_size * (batch + 1)].copy()
#   extract_nhd_variables(batch_df, (START + batch_size * batch) + 1, PATCH_SIZE)
#   print("batch {} of 10 done".format(batch + 1))


In [13]:
# # pass in batches of 500
# # MADHUKAR: 0 - 5000 
# # SHOBHA: 5000 - 10000
# # RADHIKA: 10000 - 15000

# batch_size = 500
# MY_NAME = "RADHIKA"
# START = 0 + 5000 * (MY_NAME == "SHOBHA") + 10000 * (MY_NAME == "RADHIKA")

# for batch in range(10):
#   print("batch {} of 10 started".format(batch + 1))
#   batch_df = df_m[START + batch_size * batch : START + batch_size * (batch + 1)].copy()
#   extract_nhd_variables(batch_df, (START + batch_size * batch) + 1, PATCH_SIZE)
#   print("batch {} of 10 done".format(batch + 1))


In [14]:
from datetime import datetime
print(datetime.now())

2021-04-02 20:53:54.042306


In [15]:
#read the file
# df_m_test = pd.read_csv('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part1.csv')
df_m_test = pd.read_pickle("NHD_extracted_vars_cwr_nwpr_" + 
                          str(2 * PATCH_SIZE) + "mX" + 
                          str(2 * PATCH_SIZE) + "m_with_fcode_ftype/" + 
                          str(2 * PATCH_SIZE) + "mX" + str(2 * PATCH_SIZE) + 
                          "m_nhd_variables_part1")
df_m_test.nhd_vars_wb


14324                             ([], [], [], [], [], [])
14339                             ([], [], [], [], [], [])
14346                             ([], [], [], [], [], [])
14348                             ([], [], [], [], [], [])
14378    ([4833766, 4833678, 22324549], [LakePond, Lake...
14395                             ([], [], [], [], [], [])
14540                             ([], [], [], [], [], [])
14604                             ([], [], [], [], [], [])
14609                             ([], [], [], [], [], [])
14659                             ([], [], [], [], [], [])
14668                             ([], [], [], [], [], [])
14680    ([22560220], [LakePond], [], [0.052], [nan], [...
14691                             ([], [], [], [], [], [])
14693    ([4254754, 4254522], [SwampMarsh, SwampMarsh],...
14696    ([1098974], [LakePond], [649938], [1.192], [na...
14723                             ([], [], [], [], [], [])
14726                             ([], [], [], [], [], [

In [16]:
[print(item) for item in df_m_test.nhd_vars_wb]

([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([4833766, 4833678, 22324549], ['LakePond', 'LakePond', 'SwampMarsh'], ['657069', '656704', ''], [0.194, 0.275, 4.705], [nan, nan, nan], [39004, 39004, 46600])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([22560220], ['LakePond'], [''], [0.052], [nan], [39001])
([], [], [], [], [], [])
([4254754, 4254522], ['SwampMarsh', 'SwampMarsh'], ['', ''], [0.741, 0.62], [nan, nan], [46600, 46600])
([1098974], ['LakePond'], ['649938'], [1.192], [nan], [39004])
([], [], [], [], [], [])
([], [], [], [], [], [])
([4254778], ['SwampMarsh'], [''], [0.411], [nan], [46600])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([14785653, 14785661, 14785667, 14785665, 14785659, 14785657], ['LakePond', 'LakePond', 'LakePond', 'LakePond', 'LakePond', 'LakePo

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [17]:
[print(item) for item in df_m_test.nhd_vars_fl][:1]

([20324433, 20324437], ['StreamRiver', 'StreamRiver'], ['', ''], [nan, nan], [2.179, 1.471], [46003, 46003])
([22338541], ['StreamRiver'], ['1469616'], [nan], [3.595], [46006])
([22340313, 22338847], ['StreamRiver', 'StreamRiver'], ['1478084', '1469782'], [nan, nan], [3.173, 3.123], [46006, 46003])
([14785817], ['StreamRiver'], ['422161'], [nan], [1.7], [46003])
([4835432, 4836616, 4837010], ['StreamRiver', 'ArtificialPath', 'ArtificialPath'], ['', '', ''], [nan, nan, nan], [0.407, 1.257, 0.759], [46006, 55800, 55800])
([], [], [], [], [], [])
([3882814], ['StreamRiver'], [''], [nan], [2.975], [46003])
([], [], [], [], [], [])
([18414727], ['StreamRiver'], [''], [nan], [2.146], [46006])
([1100076], ['StreamRiver'], [''], [nan], [2.076], [46003])
([], [], [], [], [], [])
([22563042, 22560452, 22560470, 22560484, 22560486, 22560482, 22560444, 22563360, 22563026, 22560448, 22560456], ['Pipeline', 'CanalDitch', 'CanalDitch', 'CanalDitch', 'CanalDitch', 'CanalDitch', 'StreamRiver', 'Artific

[None]

In [18]:
df_m_test.nhd_vars_fl

14324    ([20324433, 20324437], [StreamRiver, StreamRiv...
14339    ([22338541], [StreamRiver], [1469616], [nan], ...
14346    ([22340313, 22338847], [StreamRiver, StreamRiv...
14348    ([14785817], [StreamRiver], [422161], [nan], [...
14378    ([4835432, 4836616, 4837010], [StreamRiver, Ar...
14395                             ([], [], [], [], [], [])
14540    ([3882814], [StreamRiver], [], [nan], [2.975],...
14604                             ([], [], [], [], [], [])
14609    ([18414727], [StreamRiver], [], [nan], [2.146]...
14659    ([1100076], [StreamRiver], [], [nan], [2.076],...
14668                             ([], [], [], [], [], [])
14680    ([22563042, 22560452, 22560470, 22560484, 2256...
14691    ([4253010], [StreamRiver], [], [nan], [1.545],...
14693    ([2353786], [StreamRiver], [663175], [nan], [3...
14696                             ([], [], [], [], [], [])
14723                             ([], [], [], [], [], [])
14726    ([11689394, 11689398, 11689402], [StreamRiver,.