## Optional first step installing packages

In [1]:
#import sys
#!{sys.executable} -m pip install geofeather
#!{sys.executable} -m pip install nhdnet

### Loading modules

In [2]:
from pathlib import Path
import os
from time import time
import geopandas as gp


from geofeather import to_geofeather
from nhdnet.nhd.extract import extract_flowlines
#from nhdnet.nhd.extract, extract_waterbodies
from nhdnet.io import serialize_df, serialize_sindex, to_shp

## Intial setup and constanst

In [3]:
HUC2 = 10
i = 19
HUC4 = "{0}{1:02d}".format(HUC2, i)
print(HUC4)
print(type(HUC4))
huc_id = int(HUC4) * 1000000
print(huc_id)

#Setting projections
CRS = {
    "proj": "aea",
    "lat_1": 29.5,
    "lat_2": 45.5,
    "lat_0": 37.5,
    "lon_0": -96,
    "x_0": 0,
    "y_0": 0,
    "datum": "NAD83",
    "units": "m",
    "no_defs": True,
}

1019
<class 'str'>
1019000000


## Read in the geodatabase

In [4]:
gdb ="1019/NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)
print(gdb)
read_start = time()
flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

1019/NHDPLUS_H_1019_HU4_GDB.gdb
Reading flowlines


DriverError: 1019/NHDPLUS_H_1019_HU4_GDB.gdb: No such file or directory

## Getting information about what came out of this

First for the flowlines -- this is a geodataframe with the flowline geometry

In [None]:
type(flowlines)
#flowlines.head(3)
#print(flowlines.describe)
#flowlines.plot()
#print(flowlines.shape)
print(list(flowlines.columns))
#flowlines[flowlines.streamorder>6]
flowlines[flowlines.streamorder>6].plot()

Then for the joins - this is a dataframe with the linkage information

In [None]:
type(joins)
joins.head(3)
#print(joins.describe)

## Reorganizing the columns (not really sure why they do this)

In [None]:
flowlines= flowlines[["geometry",
                 "lineID",
                 "NHDPlusID",
                "FType",
                "length",
                "sinuosity",
                "sizeclass",
                "streamorder"]]
print(flowlines.shape)
#print(max(flowlines['NHDPlusID']), min(flowlines['NHDPlusID']))
#print(max(flowlines['lineID']), min(flowlines['lineID']))



In [None]:
# Calculate lineIDs to be unique across the regions
#LC - .loc Accesses a group of rows and columns by label(s) or a boolean array
flowlines["lineID"] += huc_id
# Set updated lineIDs with the HUC4 prefix
joins.loc[joins.upstream_id != 0, "upstream_id"] += huc_id
joins.loc[joins.downstream_id != 0, "downstream_id"] += huc_id
joins.head(3)

## Need to figure out what the read water bodies part is doing--- that function doesnt work in the sourced library but exists in the git repo
Check if we need to have the water bodies in order to have a fully connected drainage network or not

## Getting rid of dead ends
Note in this example there are none so nothing changes
~ means take the compliment

In [None]:
print(joins.shape)
joins=joins.loc[~((joins.downstream == 0) & (joins.upstream == 0))].copy()
print(joins.shape)

# Serializing the flowlines --- not sure exactly what this means
Blog post on to_geofeather: https://medium.com/@brendan_ward/introducing-geofeather-a-python-library-for-faster-geospatial-i-o-with-geopandas-341120d45ee5 
reset_index explanation: https://www.geeksforgeeks.org/reset-index-in-pandas-dataframe/

In [None]:
print("serializing {:,} flowlines to feather".format(len(flowlines)))
region_dir=Path(HUC4)
flowlines = flowlines.reset_index(drop=True)
to_geofeather(flowlines, region_dir /"flowlines.feather")
#Serializes a pandas DataFrame to a feather file on disk --- just writing it efficientely
serialize_df(joins,  "flowline_joins.feather", index=False)


###  Not part of the workflow just testing out  joins to see how they made that table 
This is copied from extract.py

In [None]:
print("Reading flowline joins")

#this line reads the flowlines and grabs out just the columns 'FromNHDPID' and 'ToNHDPID' then it renames them as upstream and downstream
join_df = gp.read_file(gdb, layer="NHDPlusFlow")[["FromNHDPID", "ToNHDPID"]].rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
join_df.upstream = join_df.upstream.astype("uint64")
join_df.downstream = join_df.downstream.astype("uint64")



#join_df = (join_df.join(df.lineID.rename("upstream_id"), on="upstream").
#           join(df.lineID.rename("downstream_id"), on="downstream")
#           .fillna(0))

#test=flowlines[0:3]
#print(test)
#print(test.FType)
#test.FType.rename("testing")
print(test)



In [None]:
#grabbing two columns out
print(join_df.shape)
test=join_df[["FromNHDPID", "ToNHDPID"]]
print(test.shape)
test.head(3)

In [None]:
#Grabbing two coulmns out and modifying 

test2=join_df[["FromNHDPID", "ToNHDPID"]].rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
test2.head(3)

## Reading in the NABD shape file
Usefull tips on working with shape files: https://www.earthdatascience.org/workshops/gis-open-source-python/intro-vector-data-python/

In [None]:
NABD = gp.read_file('./nabd_fish_barriers_2012/nabd_fish_barriers_2012.shp')


In [None]:
#look at the properties

In [None]:
print(NABD.shape)
print(list(NABD.columns))
NABD.head(3)