# River bifurcation in CONUS workflow
This notebook contains the workflow necessary to extract data from a HUC4 and join it to NABD for bifurcation analysis.

## 1. Install packages

In [1]:
# import sys
# !{sys.executable} -m pip install geofeather
# !{sys.executable} -m pip install nhdnet  #see Setup info document 

## 2. Load modules

In [2]:
#basic analysis 
from pathlib import Path
import os
from time import time
import geopandas as gp
import geofeather
import numpy as np
import pandas as pd


#modules from SARP analysis 
from geofeather import to_geofeather
from nhdnet.nhd.extract import extract_flowlines    # this is the original function
# from nhdnet.nhd.extract_test import extract_flowlines_R  # this function was created by Rachel to extract info from other NHD
 

#Getting the other NHD 
# from nhdnet.nhd.download import download_huc4
# nhd_dir = Path("data/nhd/source/huc4")


# #pull data from Google Drive
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive

# gauth = GoogleAuth()
# gauth.LocalWebserverAuth() # client_secrets.json need to be in the same directory as the script
# drive = GoogleDrive(gauth)

## 3. Initial setup and constants

In [3]:
#Select HUC of interest
HUC2 = 10
i = 19
HUC4 = "{0}{1:02d}".format(HUC2, i)  # this formats the HUC4 name how we want it. ':02d' is string formatting
print(HUC4)
print(type(HUC4))
huc_id = int(HUC4) * 1000000   # the full HUC4 ID
print(huc_id)

data_dir = Path("data/nhd/source/huc4")  # point to where GDBs are
# data_dir = Path("/Volumes/GoogleDrive/My Drive/Condon_Research_Group/Research_Projects/Rachel/Research/GIS/Layers/NHDPlusNationalData")  # point to where GDBs are

#Setting projections
CRS = {           # Using USGS CONUS Albers (EPSG:102003): https://epsg.io/102003  WHY?
    "proj": "aea",
    "lat_1": 29.5,
    "lat_2": 45.5,
    "lat_0": 37.5,
    "lon_0": -96,
    "x_0": 0,
    "y_0": 0,
    "datum": "NAD83",
    "units": "m",
    "no_defs": True,}
print(data_dir)

1019
<class 'str'>
1019000000
data/nhd/source/huc4


## 4. Read in the geodatabase

In [4]:
# Read the smaller HR gdb 
gdb = data_dir/HUC4/ "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)
print(gdb)
read_start = time()
flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))


# # Read in the entire gdb from Drive
# gdb = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
# print(gdb)
# read_start = time()
# flowlines = extract_flowlines_R(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))


# Filter the gdb as it is read in
# gdb = data_dir/ "NHDPlusV21_National_Seamless_Flattened_Lower48.gdb".format(HUC4=HUC4)
# print(gdb)
# read_start = time()
# flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

data/nhd/source/huc4/1019/NHDPLUS_H_1019_HU4_GDB.gdb
Reading flowlines
Columns= <bound method NDFrame.head of            NHDPlusID  FlowDir  FType   GNIS_ID             GNIS_Name  \
0       2.300190e+13        1    460      None                  None   
1       2.300190e+13        1    460      None                  None   
2       2.300190e+13        1    460      None                  None   
3       2.300190e+13        1    460      None                  None   
4       2.300190e+13        1    460      None                  None   
...              ...      ...    ...       ...                   ...   
232506  2.300190e+13        1    460      None                  None   
232507  2.300190e+13        1    460  00184959  West Toll Gate Creek   
232508  2.300190e+13        1    334      None                  None   
232509  2.300190e+13        1    558      None                  None   
232510  2.300190e+13        1    336  00203030      Burlington Ditch   

                         

### 4.1 Some checks and cleanup of 'flowlines'

In [5]:
# flowlines.head(3)
# list(flowlines.columns)

# cleaning up columns
imp_cols = ["lineID", "NHDPlusID", "ReachCode", "streamorder", "geometry"]
flowlines = flowlines[imp_cols]
list(flowlines.columns)
# len(flowlines.columns)

['lineID', 'NHDPlusID', 'ReachCode', 'streamorder', 'geometry']

### 4.2 Read in the HUC4 shapefile (1019) I made 

In [36]:
huc_test = (gp.read_file('/Users/rachelspinti/Desktop/HUC_test/Test1029.shp')) # this is actually HUC 1019
huc_test = huc_test.rename(columns={"REACHCODE": "ReachCode"})
huc_test = huc_test.rename(columns={"StreamOrde": "streamorder"})
huc_test.ReachCode = huc_test.ReachCode.astype("uint64")
# print(huc_test)
# list(huc_test.columns)

### 4.3 Checks and cleanup of 'huc_test'

In [23]:
# checking huc_test
# type(huc_test.ReachCode)
# huc_test.head(3)
# list(huc_test.columns)
print(len(huc_test))
# cleaning up columns
imp_cols2 = ['OBJECTID', 'COMID', 'ReachCode', 'streamorder']
huc_test = huc_test[imp_cols2]
# huc_test.drop(huc_test.columns[[3, 4, 5, 6,7 ]], axis=1, inplace=True)
list(huc_test.columns)
# # len(huc_test.columns)


14847


['OBJECTID', 'COMID', 'ReachCode', 'streamorder']

In [32]:
# len(flowlines)
set(huc_test.streamorder)

{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}

## 5. Join the two datasets
Check this link out for help: https://www.earthdatascience.org/courses/use-data-open-source-python/intro-vector-data-python/vector-data-processing/spatial-joins-in-python-geopandas-shapely/

See also: https://geopandas.org/mergingdata.html

https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [9]:
# Attribute join with geopandas
rivers = flowlines.merge(huc_test, how='outer', on='ReachCode')

### 5.1 Checks of 'rivers'

In [10]:
# Checking stuff after merge
print(len(rivers))
print(rivers.head(4))  #check the join
# print(rivers.COMID)
# rivers.iloc[rivers.streamorder].plot()
# flowlines[flowlines.streamorder>4].plot()

# Create a list of unique values by turning the
# pandas column into a set
# len(set(rivers.ReachCode))
# len(set(rivers.COMID))

# type(rivers)
# print(rivers.describe)
# rivers.plot()
# print(rivers.shape)
# print(list(rivers.columns))

250868
   lineID     NHDPlusID       ReachCode  streamorder  \
0     1.0  2.300190e+13  10190001010972          1.0   
1     2.0  2.300190e+13  10190002021576          1.0   
2     3.0  2.300190e+13  10190002022431          1.0   
3     4.0  2.300190e+13  10190002022326          1.0   

                                            geometry  OBJECTID  COMID  
0  LINESTRING (-800081.762 215515.782, -800082.87...       NaN    NaN  
1  LINESTRING (-799823.525 221309.940, -799829.56...       NaN    NaN  
2  LINESTRING (-800259.206 220461.613, -800252.86...       NaN    NaN  
3  LINESTRING (-799873.180 220328.121, -799869.36...       NaN    NaN  


## 6. Join NHD and NABD

In [24]:
# Read in NABD shapefile and join with NHD
nabd = gp.read_file('/Volumes/GoogleDrive/My Drive/Condon_Research_Group/Research_Projects/Rachel/Research/GIS/Layers/nabd_fish_barriers_2012.shp')
imp_cols3 = ['NIDID', 'COMID', 'Dam_name', 'Purposes', 'Year_compl', 'Max_stor', 'Norm_stor','NID_stor', 'geometry']
nabd = nabd[imp_cols3]
print(list(nabd.columns))
print(len(nabd))

['NIDID', 'COMID', 'Dam_name', 'Purposes', 'Year_compl', 'Max_stor', 'Norm_stor', 'NID_stor', 'geometry']
52456


In [37]:
# Merge
nabd = nabd.merge(huc_test, how= 'right', on='COMID')   #attribute join with geopandas

In [40]:
# Checking stuff after merge
print(len(nabd))
nabd.head(10)  #check the join
# nabd[nabd.COMID].plot()
# nabd.plot()
# nabd[nabd.streamorder].plot()
# set(nabd.streamorder)
set(nabd)

14854


{'ArbolateSu',
 'AreaSqKM',
 'COMID',
 'Dam_name',
 'DivDASqKM',
 'Divergence',
 'DnDrainCou',
 'DnHydroseq',
 'DnLevel',
 'DnLevelPat',
 'DnMinorHyd',
 'ELEVFIXED',
 'Enabled',
 'FCODE',
 'FDATE',
 'FLOWDIR',
 'FTYPE',
 'FromMeas',
 'FromNode',
 'GNIS_ID',
 'GNIS_NAME',
 'HWNodeSqKM',
 'HWTYPE',
 'Hydroseq',
 'LENGTHKM',
 'LakeFract',
 'LevelPathI',
 'MAXELEVRAW',
 'MAXELEVSMO',
 'MINELEVRAW',
 'MINELEVSMO',
 'Max_stor',
 'NIDID',
 'NID_stor',
 'Norm_stor',
 'OBJECTID_x',
 'OBJECTID_y',
 'PathTimeMA',
 'Pathlength',
 'Purposes',
 'QA_01',
 'QA_02',
 'QA_03',
 'QA_04',
 'QA_05',
 'QA_06',
 'QA_07',
 'QA_08',
 'QA_09',
 'QA_10',
 'QA_11',
 'QA_12',
 'QA_MA',
 'QC_01',
 'QC_02',
 'QC_03',
 'QC_04',
 'QC_05',
 'QC_06',
 'QC_07',
 'QC_08',
 'QC_09',
 'QC_10',
 'QC_11',
 'QC_12',
 'QC_MA',
 'QE_01',
 'QE_02',
 'QE_03',
 'QE_04',
 'QE_05',
 'QE_06',
 'QE_07',
 'QE_08',
 'QE_09',
 'QE_10',
 'QE_11',
 'QE_12',
 'QE_MA',
 'RAreaHLoad',
 'RESOLUTION',
 'RPUID',
 'ReachCode_x',
 'ReachCode_y',
 '