# River bifurcation in CONUS workflow
This notebook contains the workflow to extract NHD flowline data for a single HUC4 (four-digit hydrologic unit code) and join it to the National Anthropogenic Barrier Dataset (NABD) for bifurcation analysis.

## 1. Load modules

In [10]:
# basic analysis 
from pathlib import Path
import os
from time import time
import geopandas as gp
import pandas as pd


# import extract module for analysis
from extract_nhd import extract_nhdflowlines  # this function was created by Rachel to extract info from other NHD 

## 2. Initial setup and constants

In [11]:
# Select the HUC of interest.
HUC2 = 10  # two-digit region code
i = 19     # subregion number within HUC2

# Build the four-character HUC4 string. Both parts are zero-padded to two
# digits so the result is always four characters (HUC2=1, i=1 -> "0101");
# padding only the subregion produced "101" for single-digit regions.
HUC4 = "{0:02d}{1:02d}".format(HUC2, i)

# HUC4 as an integer scaled by 10**6 (the original author calls this
# "the full HUC4 ID").
huc_id = int(HUC4) * 1000000

# Target projection: Albers Equal Area Conic for CONUS, NAD83 datum, metres.
# NOTE(review): the original comment labels this "USGS CONUS Albers
# (EPSG:102003)": https://epsg.io/102003 -- these parameters look like
# ESRI:102003 (an ESRI code, not an EPSG one); confirm.
# Open question from the original author: WHY this projection?
CRS = {
    "proj": "aea",
    "lat_1": 29.5,
    "lat_2": 45.5,
    "lat_0": 37.5,
    "lon_0": -96,
    "x_0": 0,
    "y_0": 0,
    "datum": "NAD83",
    "units": "m",
    "no_defs": True,
}

# Show the working directory so it is clear where relative paths resolve.
os.getcwd()


'/Users/rachelspinti/Documents/River_bifurcation'

## 3. Read in the geodatabase

In [32]:
# Flowline csv
# Load the flowline table from CSV. This read must be active: with it
# commented out, `flowlines` is undefined on a fresh kernel and the preview
# below raises NameError.
# TODO: replace the absolute path with one relative to a configurable data dir.
flowlines = pd.read_csv('/Users/rachelspinti/Documents/River_bifurcation/data/nhd/test.csv')

# Preview the first three rows.
flowlines.head(3)

Unnamed: 0,WKT,OBJECTID,COMID,FDATE,RESOLUTION,GNIS_ID,GNIS_NAME,LENGTHKM,REACHCODE,FLOWDIR,...,QC_12,VC_12,QE_12,VE_12,LakeFract,SurfArea,RAreaHLoad,RPUID,VPUID,Enabled
0,MULTILINESTRING ZM ((-69.0801710698805 47.0479...,1,721640,2008-08-28T00:00:00.000,Medium,561174,Allagash River,7.634,1010002000001,With Digitized,...,1263.34,1.79598,1230.971,1.66764,,,,01a,1,1
1,MULTILINESTRING ZM ((-69.0452752699346 47.0841...,2,717072,2008-08-28T00:00:00.000,Medium,561174,Allagash River,0.948,1010002000001,With Digitized,...,1263.974,0.95148,1231.605,0.89602,,,,01a,1,1
2,MULTILINESTRING ZM ((-69.075266469888 47.02969...,3,717076,2008-08-28T00:00:00.000,Medium,561174,Allagash River,2.151,1010002000001,With Digitized,...,1253.339,1.69117,1252.634,1.68824,,,,01a,1,1


In [34]:
# Columns of interest: identifiers, reach code, stream order, and the
# geometry stored as WKT text.
imp_cols = ['OBJECTID', 'COMID', 'REACHCODE', 'StreamOrde', 'WKT']

# Keep the column subset under its own name. The bare `flowlines[imp_cols]`
# expression built the subset and immediately discarded it.
flowlines_imp = flowlines[imp_cols]

# Display the WKT column (pandas truncates the rendering of long Series).
flowlines['WKT']

0          MULTILINESTRING ZM ((-69.0801710698805 47.0479...
1          MULTILINESTRING ZM ((-69.0452752699346 47.0841...
2          MULTILINESTRING ZM ((-69.075266469888 47.02969...
3          MULTILINESTRING ZM ((-69.0893988031995 47.0002...
4          MULTILINESTRING ZM ((-69.0892390031996 47.0000...
                                 ...                        
2691334    MULTILINESTRING ZM ((-115.747330180199 33.1009...
2691335    MULTILINESTRING ZM ((-116.066931396948 33.5158...
2691336    MULTILINESTRING ZM ((-115.320418998106 33.4260...
2691337    MULTILINESTRING ZM ((-115.313808575171 33.4240...
2691338    MULTILINESTRING ZM ((-114.85966327436 32.92522...
Name: WKT, Length: 2691339, dtype: object

In [3]:
# DISABLED CELL: every line below is commented out, so running this cell does nothing.
# It holds two earlier variants of loading flowlines through extract_nhdflowlines
# from a Google Drive copy of the geodatabase.
# NOTE(review) before re-enabling:
#   - the first variant assigns the result to `flowlines`, while the second unpacks
#     `flowlines, joins` -- confirm what extract_nhdflowlines actually returns;
#   - the `.format(HUC4=HUC4)` call in the second variant has no `{HUC4}` placeholder
#     in its string, so it leaves the file name unchanged.
# data_dir = Path("/Volumes/GoogleDrive/My Drive/Condon_Research_Group/Research_Projects/Rachel/Research/GIS/Layers/NHDPlusNationalData")  # point to where GDBs are

# # Read in the entire gdb from Drive
# gdb = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
# print(gdb)
# read_start = time()
# flowlines = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

# # Filter the gdb as it is read in
# gdb = data_dir/ "NHDPlusV21_National_Seamless_Flattened_Lower48.gdb".format(HUC4=HUC4)
# print(gdb)
# read_start = time()
# flowlines, joins = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))


In [5]:
# Folder on this machine that holds the NHDPlus national geodatabases.
data_dir = Path("/Users/rachelspinti/Documents/River_bifurcation/data/nhd/NHDPlusNationalData")
gdb_path = data_dir / "NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"

# Read the geodatabase into a GeoDataFrame. No `layer=` argument is passed,
# so the layer that is read is whatever geopandas selects by default --
# TODO confirm that this is the intended layer.
nhd_gdb = gp.read_file(gdb_path)

# Disabled experiments, kept for reference:
# # cats = gp.read_file(gdb_path, layer="Catchment")
# # flowline_cols = ["COMID", "NHDPlusID", "FlowDir", "FType", "geometry", "ReachCode"]
# flowlines = gp.read_file(gdb_path, layer="NHDFlowline_Network")[3]
# flowlines = gp.read_file(gdb_path, layer="NHDFlowline_Network")[flowline_cols]
# print(nhd_gdb.describe)

# # Read in the entire gdb
# gdb = data_dir/"NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
# print(gdb)
# read_start = time()
# flowlines = extract_nhdflowlines(gdb, target_crs=CRS)
# print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

<bound method NDFrame.describe of             REACHCODE REACHRESOL                              SOURCE_ORI  \
0      01100005001603     Medium          USGS, Water Resources Division   
1      01100005001605     Medium          USGS, Water Resources Division   
2      01090002000178     Medium          USGS, Water Resources Division   
3      01100005000020     Medium          USGS, Water Resources Division   
4      01090004000089     Medium          USGS, Water Resources Division   
...               ...        ...                                     ...   
28159  21010005001524       High  United States Geological Survey (USGS)   
28160  21010005001445       High  United States Geological Survey (USGS)   
28161  21010005001440       High  United States Geological Survey (USGS)   
28162  04040003003036     Medium          USGS, Water Resources Division   
28163  02020001009344     Medium          USGS, Water Resources Division   

            SOURCE_FEA                               

In [None]:
# Preview the first three rows of the flowline table.
# `flowlines` must already have been loaded by an earlier cell.
flowlines.head(n=3)

## Junk

In [None]:
# Testing: select the HUC codes that belong to one HUC4 by comparing the
# leading four digits of each code with the HUC4 value.
# (pandas is imported as `pd` in the imports cell at the top of the notebook.)
data = {'HUC': [101901, 102002, 101902, 101510]}
df = pd.DataFrame(data, columns=['HUC'])

huc4 = 1019

# Boolean mask: True where the code's first four characters equal the HUC4.
# The codes are integers, so they are cast to strings before slicing.
# The earlier attempts compared a list slice to an int (always False) and
# called `.str` on a plain Python list, which has no such accessor.
select_index = df['HUC'].astype(str).str[:4] == str(huc4)
print(select_index)

# Keep only the matching rows.
select = df[select_index]
select

# DISABLED SCRATCH: everything below is commented out and does not run.
# It sketches (a) dropping the extra dimensions from flowline geometries with
# nhdnet's `to2D`, tested on the first 10 rows via a shapefile round-trip, and
# (b) a snippet copied from extract.py that does the same inside the loader.
# NOTE(review) before re-enabling:
#   - `imp_cols2` uses 'ReachCode', 'streamorder' and 'geometry', which differ from
#     the CSV column names used earlier in this notebook ('REACHCODE', 'StreamOrde',
#     'WKT') -- confirm which table this was meant for;
#   - the extract.py snippet keeps only the first part of each MultiLineString via
#     `g[0]`; direct indexing of multi-part geometries may not be supported by newer
#     Shapely releases (`g.geoms[0]` is the alternative) -- confirm installed version.
# Converting line strings to 2D
# from nhdnet.geometry.lines import to2D

# # Testing change of linestring
# imp_cols2 = ['OBJECTID', 'COMID', 'ReachCode', 'streamorder', 'geometry']
# flowlines = flowlines[imp_cols2]

# line_test = flowlines[:10]
# line_test.to_file("line_test.shp")
# line_test = (gp.read_file('/Users/rachelspinti/Documents/River_bifurcation/line_test.shp'))
# # print(line_test)

# line_test.geometry = line_test.geometry.apply(to2D)
# print(line_test)

# From extract.py
#     # convert to LineString from MultiLineString
#     if df.iloc[0].geometry.geom_type == "MultiLineString":
#         print("Converting MultiLineString => LineString")
#         df.geometry = df.geometry.apply(
#             lambda g: g[0] if isinstance(g, MultiLineString) else g
#         )

#     # Convert incoming data from XYZM to XY
#     print("Converting geometry to 2D")
#     df.geometry = df.geometry.apply(to2D)