# River bifurcation in CONUS workflow
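This notebook contains the workflow needed to extract NHDPlus flowline data for a single HUC4 (four-digit Hydrologic Unit Code subregion) and join it to the National Anthropogenic Barrier Dataset (NABD) for river bifurcation analysis.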

## 1. Load modules

In [1]:
# basic analysis 
from pathlib import Path
import os
from time import time
import geopandas as gp


# import extract module for analysis
from extract_nhd import extract_nhdflowlines  # this function was created by Rachel to extract info from other NHD 

## 2. Initial setup and constants

In [2]:
# Select the HUC of interest.
# HUC2 is the two-digit region number; `i` is the two-digit subregion number
# inside that region. Together they form the four-digit HUC4 code.
HUC2 = 10
i = 19
# Zero-pad BOTH parts to two digits so that regions 01-09 still give a
# four-character HUC4 (e.g. HUC2=1, i=5 -> "0105" rather than "105").
HUC4 = "{0:02d}{1:02d}".format(HUC2, i)
# print(HUC4)
# print(type(HUC4))
huc_id = int(HUC4) * 1000000   # HUC4 followed by six zeros, as an integer ID
# print(huc_id)

# Target projection: Albers Equal Area Conic for the contiguous US.
# These parameters match ESRI:102003 ("USA Contiguous Albers Equal Area Conic"),
# which is an ESRI code rather than an EPSG one: https://epsg.io/102003
# TODO: record why this projection was chosen (presumably so that lengths and
# areas are computed in metres on an equal-area grid -- confirm).
CRS = {
    "proj": "aea",      # Albers Equal Area
    "lat_1": 29.5,      # first standard parallel
    "lat_2": 45.5,      # second standard parallel
    "lat_0": 37.5,      # latitude of origin
    "lon_0": -96,       # central meridian
    "x_0": 0,           # false easting
    "y_0": 0,           # false northing
    "datum": "NAD83",
    "units": "m",
    "no_defs": True,
}


## 3. Read in the geodatabase

In [3]:
# data_dir = Path("/Volumes/GoogleDrive/My Drive/Condon_Research_Group/Research_Projects/Rachel/Research/GIS/Layers/NHDPlusNationalData")  # point to where GDBs are

# # Read in the entire gdb from Drive
# gdb = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
# print(gdb)
# read_start = time()
# flowlines = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

# # Filter the gdb as it is read in
# gdb = data_dir/ "NHDPlusV21_National_Seamless_Flattened_Lower48.gdb".format(HUC4=HUC4)
# print(gdb)
# read_start = time()
# flowlines, joins = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))


In [None]:
# Folder that holds the NHDPlus national geodatabase.
# NOTE: this is a machine-specific absolute path; change it to wherever the
# GDBs live locally before running the notebook on another machine.
data_dir = Path("/Users/rachelspinti/Documents/River_bifurcation/data/nhd/NHDPlusNationalData")  # point to where GDBs are

gdb_path = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"

# Read the flowline layer into a GeoDataFrame.
# The previous version indexed the result with `[3]`, which looks up a column
# labelled 3 and raises KeyError, so `flowlines` was never defined for the
# cells below. The layer is now kept whole.
# For a quick test read, geopandas accepts `rows=<n>` to load only the first
# n features instead of the full national layer.
flowlines = gp.read_file(gdb_path, layer="NHDFlowline_Network")

# Alternatives that were tried / are still being considered:
# nhd_gdb = gp.read_file(data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb")
# cats = gp.read_file(gdb_path, layer="Catchment")
# flowline_cols = ["COMID", "NHDPlusID", "FlowDir", "FType", "geometry", "ReachCode"]
# flowlines = gp.read_file(gdb_path, layer="NHDFlowline_Network")[flowline_cols]

# # Read in the entire gdb with the project helper and reproject to CRS
# gdb = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
# print(gdb)
# read_start = time()
# flowlines = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

In [None]:
# Sanity check: display the first three flowline records.
# Other objects that can be inspected here instead:
#   nhd_gdb.head(3)
#   cats.head(3)
#   print(len(flowlines))
flowlines.head(n=3)

## Junk

In [None]:
# Testing: pick out the HUC6 codes that belong to a given HUC4.
import pandas as pd

data = {'HUC':  [101901, 102002, 101902, 101510]}

df = pd.DataFrame(data, columns=['HUC'])

huc4 = 1019

# HUC codes are hierarchical: the leading four digits of a longer code are
# its parent HUC4. Compare as strings and anchor the match at the start, so
# a code that merely contains "1019" somewhere later is not selected.
# NOTE: integer codes drop leading zeros, so this only works as-is for
# regions 10 and above; store codes as zero-padded strings for regions 01-09.
select_index = df['HUC'].astype(str).str.startswith(str(huc4))
print(select_index)

# Boolean-mask the DataFrame (not the raw dict) to keep the matching rows.
select = df[select_index]
select