# Notebook to extract variables out of NHD database


In [19]:
#ee
# Buffer distance handed to square() for every queried point (the batch loop
# passes it as `size`). Output file names label the patch as 2 * PATCH_SIZE
# followed by "m", i.e. the full side length of the buffered square.
PATCH_SIZE = 1250

Originally 2021.02.23_ExportImagestoGCS_MR1_Copy.ipynb

# Authentications

In [20]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import ee
# Start the Earth Engine client session; the ee.* calls in later cells rely on it.
ee.Initialize()

# Set up bounding box

In [21]:
def square(lat, lon, size):
  """
  Bounding box of a point buffered by ``size``.

  lat, lon: coordinates of the centre point, interpreted in EPSG:4326
  size: distance handed to ee.Geometry.buffer

  Returns the ee.Geometry produced by ``.buffer(size).bounds()``.
  """
  # Earth Engine expects coordinates in (x, y) order, i.e. longitude first
  centre = ee.Geometry.Point([lon, lat], proj="EPSG:4326")
  buffered = centre.buffer(size)
  return buffered.bounds()

# NHDPlus

In [22]:
def fc(nhd_folder_num, feature, asset_root="users/madhukarreddy/NHDPlus"): # fc for feature collection
  """
  Load the GEE asset(s) of one NHDPlus V2 subfolder as a single collection.

  Asset ids are built as ``asset_root + NN + suffix + "_" + feature`` where
  NN is the folder number zero-padded to two digits. Folder 3 was uploaded
  as three assets (suffixes N, S, W) and folder 10 as two (suffixes L, U);
  every other folder is a single asset with no suffix.

  nhd_folder_num: number indicating which subfolder in the NHDPlus dataset
  feature: layer name appended to the asset id, e.g. "Waterbodies" or
    "Flowlines"
  asset_root: prefix shared by all asset ids

  Returns an ee.FeatureCollection merging every asset of the subfolder.
  """
  # suffixes of the subfolders that were split into several assets
  split_suffixes = {3: ['N', 'S', 'W'], 10: ['L', 'U']}

  # convert '1' into '01' etc
  num = str(nhd_folder_num).zfill(2)

  # Start from a truly empty collection. ee.FeatureCollection([None]) is not
  # empty: None is wrapped into a feature without geometry, which would then
  # be carried along in every merged result.
  merged = ee.FeatureCollection([])

  for suffix in split_suffixes.get(nhd_folder_num, ['']):
    asset_id = asset_root + num + suffix + "_" + feature
    merged = merged.merge(ee.FeatureCollection(asset_id))
  return merged

In [23]:
def merge_fc(feature, folder_nums=range(1, 19)):
  """
  Joins all the shapefiles across the US into one feature collection.

  feature: "Waterbodies" or "Flowlines"
  folder_nums: iterable of NHDPlus subfolder numbers to include; the
    default covers subfolders 1 through 18

  Returns the ee.FeatureCollection obtained by merging fc(i, feature) for
  every i in folder_nums, in order.
  """
  # Seed with a truly empty collection. ee.FeatureCollection([None]) wraps
  # None into a feature without geometry, so the "empty" seed would add a
  # spurious row to the merged result.
  merged_fc_ = ee.FeatureCollection([])
  for i in folder_nums:
    merged_fc_ = merged_fc_.merge(fc(i, feature))
  return merged_fc_

# Nationwide collections, built once here and reused by nhd_vars for every point
fc_wb, fc_fl = (merge_fc(layer) for layer in ("Waterbodies", "Flowlines"))

# NHD parameter retrieval

In [24]:
# waterbodies
# key imp properties: COMID, FTYPE, GNIS_NAME, AREASQKM

# How do you combine these?:
# FTYPE: join the strings for later usage (OHE)
# GNIS_NAME: sum all of the OHE name present vs absent
# AREASQKM: sum of all

# Given a lat, lon, find the above parameters

In [25]:
# return multiple fields
def nhd_vars(feature="Waterbodies", lat=41.638, lon=-122.0048, size=1000):
  """
  Collect NHD attributes of the features found around one point.

  The merged collection for ``feature`` (fc_wb when it is "Waterbodies",
  fc_fl for any other value) is filtered to square(lat, lon, size) and
  fetched with getInfo().

  Returns a 6-tuple of parallel lists with one entry per feature found:
  (COMID, FTYPE, GNIS_ID, AREASQKM, LENGTHKM, FCODE). The AREASQKM list
  holds NaN unless feature is "Waterbodies"; the LENGTHKM list holds NaN
  unless feature is "Flowlines".

  If anything raises along the way, the exception and the offending
  feature/lat/lon are printed and np.nan is returned instead of the tuple.
  """
  comids, ftypes, gnis_ids = [], [], []  # COMID, FTYPE, GNIS_ID
  areas, lengths, fcodes = [], [], []    # AREASQKM, LENGTHKM, FCODE

  try:
    wants_area = feature == "Waterbodies"
    wants_length = feature == "Flowlines"

    source = fc_wb if wants_area else fc_fl
    records = source.filterBounds(square(lat, lon, size)).getInfo().get('features')

    print("number of features =", len(records))

    for record in records:
      props = record.get('properties')

      comids.append(props.get('COMID'))
      ftypes.append(props.get('FTYPE'))
      gnis_ids.append(props.get('GNIS_ID'))
      # area only exists for waterbodies, length only for flowlines
      areas.append(props.get('AREASQKM') if wants_area else np.nan)
      lengths.append(props.get('LENGTHKM') if wants_length else np.nan)
      fcodes.append(props.get('FCODE'))

    return comids, ftypes, gnis_ids, areas, lengths, fcodes
  except Exception as e:
    print(e)
    print("Issue with {0} at lat={1}, lon={2}".format(feature, lat, lon))
    return np.nan

In [26]:
# read in the nhd addendum file

nhd_stats = pd.read_csv("nhd_stats_AI.csv")

# read in the point dataset as a pickled DataFrame (it replaced the csv file
# with SSURGO variables kept below for reference); later cells read its
# `latitude` and `longitude` columns
# NOTE(review): unpickling executes arbitrary code - only load files you trust
# df_m = pd.read_csv("combined_regular_clean_with_ssurgo_variables.csv")
df_m = pd.read_pickle("cwr_nwpr_dataset")

In [27]:
def extract_nhd_variables(df_m_, i, size):
  """
  Query the NHD attributes for every row of ``df_m_`` and pickle the result.

  Two columns are added to ``df_m_`` in place: "nhd_vars_wb" and
  "nhd_vars_fl" hold whatever nhd_vars returns for the row's
  latitude/longitude with feature "Waterbodies" and "Flowlines"
  respectively. The frame is then written (pickle protocol 3) to

    NHD_extracted_vars_cwr_nwpr_<P>_with_fcode_ftype/<P>_nhd_variables_part<i>

  where <P> is "<2*size>mX<2*size>m". The directory is created if missing.

  df_m_: DataFrame with `latitude` and `longitude` columns (modified in place)
  i: label appended to the output file name after "part"
  size: buffer distance forwarded to nhd_vars; it also names the output, so
    the file label always matches the patch that was actually queried

  Returns None.
  """

  df_m_["nhd_vars_wb"] = df_m_.apply(
      lambda row: nhd_vars(feature="Waterbodies", lat=row.latitude, lon=row.longitude, size=size),
      axis=1)
  df_m_["nhd_vars_fl"] = df_m_.apply(
      lambda row: nhd_vars(feature="Flowlines", lat=row.latitude, lon=row.longitude, size=size),
      axis=1)

  # pickle the dataframe
  patch_label = str(2 * size) + "mX" + str(2 * size) + "m"
  out_dir = "NHD_extracted_vars_cwr_nwpr_" + patch_label + "_with_fcode_ftype"
  # a missing directory would otherwise throw away a whole batch of queries
  os.makedirs(out_dir, exist_ok=True)
  out_path = out_dir + "/" + patch_label + "_nhd_variables_part" + str(i)

  # context manager guarantees the handle is flushed and closed
  with open(out_path, "wb") as out_file:
    pickle.dump(df_m_, out_file, protocol=3)

In [28]:
# Print the current wall-clock time as a marker in the run log
from datetime import datetime
print(datetime.now())

2021-04-02 22:20:41.352153


In [29]:
# The rows are split into three shares of SHARE_SIZE rows, one per person:
# MADHUKAR: 0 - 5000 
# SHOBHA: 5000 - 10000
# RADHIKA: 10000 - 15000
# Each share is passed in batches of `batch_size` rows.

# PATCH_SIZE = 500
batch_size = 50
MY_NAME = "MADHUKAR"
SHARE_SIZE = 5000
# any name other than SHOBHA / RADHIKA starts at row 0, as before
START = SHARE_SIZE * {"SHOBHA": 1, "RADHIKA": 2}.get(MY_NAME, 0)
# total batches in one share, so the progress message stays correct when
# batch_size changes (it used to be hard-coded as 10)
N_BATCHES = SHARE_SIZE // batch_size
# batch = 64
# for batch in range(10, df_m.shape[0]//50):
for batch in [64]:
  first_row = START + batch_size * batch
  print("batch {} of {} started".format(batch + 1, N_BATCHES))
  # .copy() so the columns added by extract_nhd_variables do not touch df_m
  batch_df = df_m.iloc[first_row : first_row + batch_size].copy()
  # output files are labelled with the 1-based position of the first row
  extract_nhd_variables(batch_df, first_row + 1, PATCH_SIZE)
  print("batch {} of {} done".format(batch + 1, N_BATCHES))


batch 65 of 10 started
number of features = 0
number of features = 2
number of features = 0
number of features = 3
number of features = 0
number of features = 0
number of features = 2
number of features = 1
number of features = 0
number of features = 0
number of features = 0
number of features = 0
number of features = 0
number of features = 4
number of features = 1
number of features = 1
number of features = 0
number of features = 0
number of features = 6
number of features = 2
number of features = 3
number of features = 0
number of features = 12
number of features = 8
number of features = 7
number of features = 12
number of features = 5
number of features = 6
number of features = 3
number of features = 5
number of features = 1
number of features = 4
number of features = 3
number of features = 5
number of features = 2
number of features = 1
batch 65 of 10 done


In [12]:
# # pass in batches of 500
# # MADHUKAR: 0 - 5000 
# # SHOBHA: 5000 - 10000
# # RADHIKA: 10000 - 15000

# batch_size = 500
# MY_NAME = "SHOBHA"
# START = 0 + 5000 * (MY_NAME == "SHOBHA") + 10000 * (MY_NAME == "RADHIKA")

# for batch in range(10):
#   print("batch {} of 10 started".format(batch + 1))
#   batch_df = df_m[START + batch_size * batch : START + batch_size * (batch + 1)].copy()
#   extract_nhd_variables(batch_df, (START + batch_size * batch) + 1, PATCH_SIZE)
#   print("batch {} of 10 done".format(batch + 1))


In [13]:
# # pass in batches of 500
# # MADHUKAR: 0 - 5000 
# # SHOBHA: 5000 - 10000
# # RADHIKA: 10000 - 15000

# batch_size = 500
# MY_NAME = "RADHIKA"
# START = 0 + 5000 * (MY_NAME == "SHOBHA") + 10000 * (MY_NAME == "RADHIKA")

# for batch in range(10):
#   print("batch {} of 10 started".format(batch + 1))
#   batch_df = df_m[START + batch_size * batch : START + batch_size * (batch + 1)].copy()
#   extract_nhd_variables(batch_df, (START + batch_size * batch) + 1, PATCH_SIZE)
#   print("batch {} of 10 done".format(batch + 1))


In [14]:
# Print the current wall-clock time as a marker in the run log
from datetime import datetime
print(datetime.now())

2021-04-02 21:00:12.635424


In [15]:
#read the file
# df_m_test = pd.read_csv('NHD_extracted_vars/combined_regular_clean_with_ssurgo_nhd_variables_part1.csv')
# Load the "part1" output back; {0} is the full patch side, 2 * PATCH_SIZE
df_m_test = pd.read_pickle(
    "NHD_extracted_vars_cwr_nwpr_{0}mX{0}m_with_fcode_ftype/{0}mX{0}m_nhd_variables_part1"
    .format(2 * PATCH_SIZE)
)
df_m_test.nhd_vars_wb


14324                             ([], [], [], [], [], [])
14339                             ([], [], [], [], [], [])
14346                             ([], [], [], [], [], [])
14348                             ([], [], [], [], [], [])
14378    ([4833730, 4833766, 4833678, 22324621, 2232597...
14395    ([1098900, 4250936], [LakePond, SwampMarsh], [...
14540                             ([], [], [], [], [], [])
14604    ([8609261, 8609259], [LakePond, LakePond], [, ...
14609    ([18413819], [LakePond], [], [0.154], [nan], [...
14659    ([1099348, 1099410, 1099404], [LakePond, LakeP...
14668    ([2031615, 2031653, 2031661, 2031625, 2031665]...
14680    ([22560220, 22560218, 22560214], [LakePond, La...
14691    ([4251998, 4252058], [LakePond, LakePond], [65...
14693    ([4254754, 4254522], [SwampMarsh, SwampMarsh],...
14696    ([1098968, 1098974, 4250942, 1099008], [LakePo...
14723    ([5861931, 5861949, 6743442, 5861937], [LakePo...
14726                             ([], [], [], [], [], [

In [16]:
# Print every waterbody result in full. A plain loop is used because a list
# comprehension would also emit a useless list of None as the cell output.
for item in df_m_test.nhd_vars_wb:
  print(item)

([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([], [], [], [], [], [])
([4833730, 4833766, 4833678, 22324621, 22325977, 22324549], ['LakePond', 'LakePond', 'LakePond', 'SwampMarsh', 'SwampMarsh', 'SwampMarsh'], ['658594', '657069', '656704', '', '', ''], [0.193, 0.194, 0.275, 0.383, 0.875, 4.705], [nan, nan, nan, nan, nan, nan], [39004, 39004, 39004, 46600, 46600, 46600])
([1098900, 4250936], ['LakePond', 'SwampMarsh'], ['643939', ''], [0.862, 1.101], [nan, nan], [39004, 46600])
([], [], [], [], [], [])
([8609261, 8609259], ['LakePond', 'LakePond'], ['', ''], [0.015, 0.012], [nan, nan], [39004, 39004])
([18413819], ['LakePond'], [''], [0.154], [nan], [39004])
([1099348, 1099410, 1099404], ['LakePond', 'LakePond', 'LakePond'], ['647717', '', ''], [0.042, 0.02, 0.02], [nan, nan, nan], [39004, 39004, 39004])
([2031615, 2031653, 2031661, 2031625, 2031665], ['LakePond', 'LakePond', 'LakePond', 'LakePond', 'LakePond'], ['', '', '', '', ''], [0.169, 0.065, 0.322, 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [17]:
# Print every flowline result in full. A plain loop leaves no list of None
# behind, so no slicing is needed to hide it.
for item in df_m_test.nhd_vars_fl:
  print(item)

([20331722, 20324433, 20324437], ['StreamRiver', 'StreamRiver', 'StreamRiver'], ['', '', ''], [nan, nan, nan], [2.722, 2.179, 1.471], [46003, 46003, 46003])
([22338541, 22338501, 22338455], ['StreamRiver', 'StreamRiver', 'StreamRiver'], ['1469616', '1467593', '1472325'], [nan, nan, nan], [3.595, 2.272, 2.639], [46006, 46006, 46006])
([22338859, 22340313, 22338847], ['StreamRiver', 'StreamRiver', 'StreamRiver'], ['1478084', '1478084', '1469782'], [nan, nan, nan], [2.436, 3.173, 3.123], [46006, 46006, 46003])
([14784645, 14785825, 14784643, 14785817, 14784637, 14785837], ['StreamRiver', 'StreamRiver', 'StreamRiver', 'StreamRiver', 'StreamRiver', 'CanalDitch'], ['422161', '422161', '422161', '422161', '410206', ''], [nan, nan, nan, nan, nan, nan], [2.635, 2.93, 0.918, 1.7, 3.225, 3.899], [46003, 46003, 46003, 46003, 46003, 33600])
([4835432, 4836616, 4837010], ['StreamRiver', 'ArtificialPath', 'ArtificialPath'], ['', '', ''], [nan, nan, nan], [0.407, 1.257, 0.759], [46006, 55800, 55800])


[None]

In [18]:
# Display the flowline results column of the reloaded frame
df_m_test.nhd_vars_fl

14324    ([20331722, 20324433, 20324437], [StreamRiver,...
14339    ([22338541, 22338501, 22338455], [StreamRiver,...
14346    ([22338859, 22340313, 22338847], [StreamRiver,...
14348    ([14784645, 14785825, 14784643, 14785817, 1478...
14378    ([4835432, 4836616, 4837010], [StreamRiver, Ar...
14395    ([1100732, 1100404, 1099884, 1099876], [Artifi...
14540    ([3882814, 3882822, 3883006], [StreamRiver, St...
14604    ([8577703, 8577705, 8610431, 8610329, 8610409,...
14609    ([18414733, 18414699, 18414727, 18414671, 1841...
14659    ([1100060, 1100052, 1100076], [StreamRiver, St...
14668    ([2032517, 2032553, 2033735], [StreamRiver, St...
14680    ([22563042, 22560452, 22560470, 22560484, 2256...
14691    ([4253010], [StreamRiver], [], [nan], [1.545],...
14693    ([937010380, 937010382, 937010381, 2354060, 23...
14696    ([1100578, 1100574, 1100828, 1100576, 1100826,...
14723    ([5862339, 5862555], [StreamRiver, StreamRiver...
14726    ([11689390, 11689394, 11689398, 11689400, 1168.