## Optional first step installing packages

In [20]:
import sys
!{sys.executable} -m pip install geofeather
!{sys.executable} -m pip install nhdnet  #see Setup info document 



### Loading modules

In [1]:
from pathlib import Path
import os
from time import time
import geopandas as gp
import geofeather

from geofeather import to_geofeather
from nhdnet.nhd.extract import extract_flowlines
# from nhdnet.nhd.extract import extract_waterbodies
from nhdnet.io import serialize_df, serialize_sindex, to_shp  

## Initial setup and constants

In [2]:
#Select HUC of interest
HUC2 = 10
i = 19
HUC4 = "{0}{1:02d}".format(HUC2, i)  # this formats the HUC4 name how we want it. ':02d' is string formatting
print(HUC4)
print(type(HUC4))
huc_id = int(HUC4) * 1000000   # the full HUC4 ID
print(huc_id)

data_dir = Path("data/nhd/source/huc4")  # point to where GDBs are

#Setting projections
CRS = {           # Using USGS CONUS Albers (EPSG:102003): https://epsg.io/102003  WHY?
    "proj": "aea",
    "lat_1": 29.5,
    "lat_2": 45.5,
    "lat_0": 37.5,
    "lon_0": -96,
    "x_0": 0,
    "y_0": 0,
    "datum": "NAD83",
    "units": "m",
    "no_defs": True,
}

1019
<class 'str'>
1019000000


## Read in the geodatabase

In [3]:
# sys.path.append('/Users/rachelspinti/Documents/River_bifurcation/data/nhd/source/huc4/1019') #call where these scripts are located

gdb = data_dir/HUC4/ "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)
print(gdb)
read_start = time()
flowlines, joins = extract_flowlines(gdb, target_crs=CRS)
print("Read {:,} flowlines in  {:.0f} seconds".format(len(flowlines), time() - read_start))

data/nhd/source/huc4/1019/NHDPLUS_H_1019_HU4_GDB.gdb
Reading flowlines
Columns= <bound method NDFrame.head of            NHDPlusID  FlowDir  FType   GNIS_ID             GNIS_Name  \
0       2.300190e+13        1    460      None                  None   
1       2.300190e+13        1    460      None                  None   
2       2.300190e+13        1    460      None                  None   
3       2.300190e+13        1    460      None                  None   
4       2.300190e+13        1    460      None                  None   
...              ...      ...    ...       ...                   ...   
232506  2.300190e+13        1    460      None                  None   
232507  2.300190e+13        1    460  00184959  West Toll Gate Creek   
232508  2.300190e+13        1    334      None                  None   
232509  2.300190e+13        1    558      None                  None   
232510  2.300190e+13        1    336  00203030      Burlington Ditch   

                         

## Getting information about what came out of this

First for the flowlines -- this is a geodataframe with the flowline geometry. Comes from *flowlines.py*

In [4]:
type(flowlines)
flowlines.head(3)
# print(flowlines.describe)
# flowlines.plot()
# print(flowlines.shape)
# print(list(flowlines.columns))
# flowlines[flowlines.streamorder>6]
# flowlines[flowlines.streamorder>6].plot()

Unnamed: 0_level_0,NHDPlusID,FType,GNIS_ID,GNIS_Name,geometry,ReachCode,lineID,streamorder,TotDASqKm,sizeclass,length,sinuosity
NHDPlusID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
23001900008851,23001900008851,460,,,"LINESTRING (-800081.762 215515.782, -800082.87...",10190001010972,1,1,0.1164,1a,486.965546,1.082492
23001900008863,23001900008863,460,,,"LINESTRING (-799823.525 221309.940, -799829.56...",10190002021576,2,1,0.0245,1a,81.604347,1.00827
23001900008864,23001900008864,460,,,"LINESTRING (-800259.206 220461.613, -800252.86...",10190002022431,3,1,0.103,1a,480.92453,1.016657


Then for the joins - this is a dataframe with the linkage information. Comes from *flowlines.py*

In [111]:
type(joins)
joins.head(3)

# joins.plot()
# joins[joins.downstream_id>0].plot()  # plotting test
# print(joins.describe)

Unnamed: 0,upstream,downstream,upstream_id,downstream_id,type
0,0,23001900145002,0,1019106322,origin
1,0,23001900145006,0,1019106326,origin
2,0,23001900145007,0,1019106327,origin


## Reorganizing the columns (not really sure why they do this)

In [5]:
flowlines= flowlines[["geometry",
                 "lineID",
                 "NHDPlusID",
                "ReachCode",
                "FType",
                "length",
                "sinuosity",
                "sizeclass",
                "streamorder"]]
print(flowlines.shape)
#print(max(flowlines['NHDPlusID']), min(flowlines['NHDPlusID']))
print(max(flowlines['lineID']), min(flowlines['lineID']))



(221415, 9)
232510 1


### Compare flowlines and joins
The lineIDs are created in *flowlines.py* 

In [6]:
from IPython.core.display import HTML

def multi_table(table_list):
    ''' Acceps a list of IpyTable objects and returns a table which contains each IpyTable in a cell
    '''
    return HTML(
        '<table><tr style="background-color:white;">' + 
        ''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list]) +
        '</tr></table>'
    )

# Calculate lineIDs to be unique across the regions
#LC - .loc Accesses a group of rows and columns by label(s) or a boolean array
flowlines["lineID"] += huc_id
# flowlines.head(3)

# Set updated lineIDs with the HUC4 prefix
joins.loc[joins.upstream_id != 0, "upstream_id"] += huc_id
joins.loc[joins.downstream_id != 0, "downstream_id"] += huc_id
# joins.head(3)

multi_table([flowlines.head(3), joins.head(3)])

Unnamed: 0_level_0,geometry,lineID,NHDPlusID,ReachCode,FType,length,sinuosity,sizeclass,streamorder
NHDPlusID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Unnamed: 0_level_2,upstream,downstream,upstream_id,downstream_id,type,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
23001900008851,"LINESTRING (-800081.762 215515.782, -800082.87...",1019000001.0,23001900008851.0,10190001010972.0,460,486.965546,1.082492,1a,1.0
23001900008863,"LINESTRING (-799823.525 221309.940, -799829.56...",1019000002.0,23001900008863.0,10190002021576.0,460,81.604347,1.00827,1a,1.0
23001900008864,"LINESTRING (-800259.206 220461.613, -800252.86...",1019000003.0,23001900008864.0,10190002022431.0,460,480.92453,1.016657,1a,1.0
0,0,23001900145002.0,0.0,1019106322.0,origin,,,,
1,0,23001900145006.0,0.0,1019106326.0,origin,,,,
2,0,23001900145007.0,0.0,1019106327.0,origin,,,,
"geometry  lineID  NHDPlusID  ReachCode  FType  length  sinuosity  sizeclass  streamorder  NHDPlusID  23001900008851  LINESTRING (-800081.762 215515.782, -800082.87...  1019000001  23001900008851  10190001010972  460  486.965546  1.082492  1a  1  23001900008863  LINESTRING (-799823.525 221309.940, -799829.56...  1019000002  23001900008863  10190002021576  460  81.604347  1.008270  1a  1  23001900008864  LINESTRING (-800259.206 220461.613, -800252.86...  1019000003  23001900008864  10190002022431  460  480.924530  1.016657  1a  1",upstream  downstream  upstream_id  downstream_id  type  0  0  23001900145002  0  1019106322  origin  1  0  23001900145006  0  1019106326  origin  2  0  23001900145007  0  1019106327  origin,,,,,,,,

Unnamed: 0_level_0,geometry,lineID,NHDPlusID,ReachCode,FType,length,sinuosity,sizeclass,streamorder
NHDPlusID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
23001900008851,"LINESTRING (-800081.762 215515.782, -800082.87...",1019000001,23001900008851,10190001010972,460,486.965546,1.082492,1a,1
23001900008863,"LINESTRING (-799823.525 221309.940, -799829.56...",1019000002,23001900008863,10190002021576,460,81.604347,1.00827,1a,1
23001900008864,"LINESTRING (-800259.206 220461.613, -800252.86...",1019000003,23001900008864,10190002022431,460,480.92453,1.016657,1a,1

Unnamed: 0,upstream,downstream,upstream_id,downstream_id,type
0,0,23001900145002,0,1019106322,origin
1,0,23001900145006,0,1019106326,origin
2,0,23001900145007,0,1019106327,origin


## Need to figure out what the read water bodies part is doing--- that function doesn't work in the sourced library but exists in the git repo
Check if we need to have the water bodies in order to have a fully connected drainage network or not.

*The joins are the connections between the flowlines, so do not need waterbodies. See extract.py*

In [11]:
### Read waterbodies
read_start = time()
waterbodies = extract_waterbodies(
                gdb,
                target_crs=CRS,
                exclude_ftypes=WATERBODY_EXCLUDE_FTYPES,
                min_area=WATERBODY_MIN_SIZE)

print("Read {:,} waterbodies in  {:.0f} seconds".format(
                    len(waterbodies), time() - read_start))

# calculate ids to be unique across region
waterbodies["wbID"] += huc_id

### Only retain waterbodies that intersect flowlines
print("Intersecting waterbodies and flowlines")
wb_joins = gp.sjoin(waterbodies, flowlines, how="inner", op="intersects")[["wbID", "lineID"]]

waterbodies = waterbodies.loc[waterbodies.wbID.isin(wb_joins.wbID)].copy()
# print("Retained {:,} waterbodies that intersect flowlines".format(
#                     len(waterbodies))

NameError: name 'extract_waterbodies' is not defined

## Getting rid of dead ends
Note in this example there are none so nothing changes
~ means take the compliment

In [12]:
print(joins.shape)
joins=joins.loc[~((joins.downstream == 0) & (joins.upstream == 0))].copy()
print(joins.shape)

(311186, 5)
(311186, 5)


# Serializing the flowlines 
I think this means that the data structure is changed. So go from dataframe to feather file because it is easier to work with.

Blog post on to_geofeather: https://medium.com/@brendan_ward/introducing-geofeather-a-python-library-for-faster-geospatial-i-o-with-geopandas-341120d45ee5 
reset_index explanation: https://www.geeksforgeeks.org/reset-index-in-pandas-dataframe/

In [35]:
print("serializing {:,} flowlines to feather".format(len(flowlines)))
region_dir=data_dir/HUC4/ "NHDPLUS_H_{HUC4}_HU4_GDB.gdb".format(HUC4=HUC4)
# region_dir=Path(HUC4)
flowlines = flowlines.reset_index(drop=True)
to_geofeather(flowlines, region_dir /"flowlines.feather")
# #Serializes a pandas DataFrame to a feather file on disk --- just writing it efficiently
serialize_df(joins,  "flowline_joins.feather", index=False)


serializing 221,415 flowlines to feather


## Not part of the workflow just testing out joins to see how they made that table¶
This is copied from extract.py. I think the reson we don't get the same downstream/upstream_ids is the filtering they do with coastlines and the removed_idx

In [40]:
#     print("Filtering out loops and coastlines")
#     coastline_idx = flowlines.loc[(flowlines.FType == 566)].index
#     removed_idx = flowlines.loc[
#         (flowlines.streamorder != flowlines.StreamCalc) | (flowlines.FlowDir.isnull()) | (flowlines.FType == 566)
#     ].index
#     flowlines = flowlines.loc[~flowlines.index.isin(removed_idx)].copy()
#     print("{:,} features after removing loops and coastlines".format(len(flowlines)))

In [44]:
print("Reading flowline joins")

#this line reads the flowlines and grabs out just the columns 'FromNHDPID' and 'ToNHDPID' then it renames them as upstream and downstream
join_df = gp.read_file(gdb, layer="NHDPlusFlow")[["FromNHDPID", "ToNHDPID"]].rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
join_df.upstream = join_df.upstream.astype("uint64")
join_df.downstream = join_df.downstream.astype("uint64")

join_df = join_df.drop_duplicates()
join_df = (join_df.join(flowlines.lineID.rename("upstream_id"), on="upstream").
          join(flowlines.lineID.rename("downstream_id"), on="downstream")
          .fillna(0))

for col in ("upstream", "downstream"):
        join_df[col] = join_df[col].astype("uint64")

for col in ("upstream_id", "downstream_id"):
        join_df[col] = join_df[col].astype("uint32")

# test=flowlines[0:3]
#print(test)
#print(test.FType)
#test.FType.rename("testing")
# print(test)
print(join_df)


Reading flowline joins
              upstream      downstream  upstream_id  downstream_id
0                    0  23001900145002   1019000001              0
1                    0  23001900145006   1019000001              0
2                    0  23001900145007   1019000001              0
3                    0  23001900145008   1019000001              0
4                    0  23001900145009   1019000001              0
...                ...             ...          ...            ...
324348               0  23001900005062   1019000001              0
324349  23001900230763  23001900085073            0              0
324350               0  23001900010186   1019000001              0
324351  23001900226146  23001900226147            0              0
324352               0  23001900008015   1019000001              0

[324353 rows x 4 columns]


In [46]:
joins.head(3)
joins.describe

<bound method NDFrame.describe of               upstream      downstream  upstream_id  downstream_id      type
0                    0  23001900145002            0     1019106322    origin
1                    0  23001900145006            0     1019106326    origin
2                    0  23001900145007            0     1019106327    origin
3                    0  23001900145008            0     1019107021    origin
4                    0  23001900145009            0     1019109914    origin
...                ...             ...          ...            ...       ...
324348               0  23001900005062            0     1019007072    origin
324349  23001900230763  23001900085073   1019232233     1019057328  internal
324350               0  23001900010186            0     1019033645    origin
324351  23001900226146  23001900226147   1019226246     1019226247  internal
324352               0  23001900008015            0     1019007281    origin

[311186 rows x 5 columns]>

In [15]:
#grabbing two columns out
print(join_df.shape)
# test=join_df[["FromNHDPID", "ToNHDPID"]]
test=join_df[["upstream_id", "downstream_id"]]
print(test.shape)
test.head(3)

(324353, 4)
(324353, 2)


Unnamed: 0,upstream_id,downstream_id
0,1019000001,0
1,1019000001,0
2,1019000001,0


In [24]:
# #Grabbing two coulmns out and modifying 

# test2=join_df[["FromNHDPID", "ToNHDPID"]].rename(columns={"FromNHDPID": "upstream", "ToNHDPID": "downstream"})
# test2.head(3)

## Reading in the NABD shape file
Usefull tips on working with shape files: https://www.earthdatascience.org/workshops/gis-open-source-python/intro-vector-data-python/

In [17]:
NABD = gp.read_file('/Users/rachelspinti/Documents/River_bifurcation/data/nabd/nabd_fish_barriers_2012.shp')


In [18]:
#look at the properties

In [19]:
print(NABD.shape)
print(list(NABD.columns))
NABD.head(3)

(52456, 56)
['COMMENT', 'NIDID', 'COMID', 'UNIQUE_STR', 'newX', 'newY', 'RecordID', 'Dam_name', 'Dam_former', 'STATEID', 'Section', 'County', 'River', 'City', 'Distance', 'Owner_name', 'Owner_type', 'Dam_type', 'Core', 'Foundation', 'Purposes', 'Year_compl', 'Year_modif', 'Dam_length', 'Dam_height', 'NID_height', 'Hazard', 'EAP', 'Inspection', 'Outlet_gat', 'Volume', 'State', 'Dam_Name2', 'Designer', 'Private', 'Str_Height', 'Hyd_Height', 'Max_Disch', 'Max_stor', 'Norm_stor', 'NID_stor', 'Surf_area', 'Drain_area', 'Insp_Freq', 'St_reg', 'St_reg_ag', 'Spill_type', 'Num_locks', 'Len_locks', 'Wid_locks', 'Source', 'Condition', 'Cond_Date', 'Cond_desc', 'Spill_wid', 'geometry']


Unnamed: 0,COMMENT,NIDID,COMID,UNIQUE_STR,newX,newY,RecordID,Dam_name,Dam_former,STATEID,...,Spill_type,Num_locks,Len_locks,Wid_locks,Source,Condition,Cond_Date,Cond_desc,Spill_wid,geometry
0,1,AL00288,893441.0,1,-86.196373,31.424403,326.0,DONALDSON LAKE DAM,,,...,,0.0,0.0,0.0,AL,,,,0.0,POINT (-86.19637 31.42440)
1,1,AL01925,894119.0,2,-86.400374,31.170262,1679.0,CHARLES WOODHAM LAKE,,,...,,0.0,0.0,0.0,AL,,,,0.0,POINT (-86.40037 31.17026)
2,1,AL00648,895019.0,3,-86.299755,31.223052,641.0,JERRY ADAMS,,,...,,0.0,0.0,0.0,AL,,,,0.0,POINT (-86.29975 31.22305)


## Attempt at spatial join of NHD_HUC4 and NHD I have
#### 1. Filter NHD

In [20]:
# Stuck here...
# Need a script like the extract.py but that is separate so we can extract COMID, so copy the code, but save elsewhere?

#### 2. Join NHD_HUC4 and NHD
Check this link out for help: https://www.earthdatascience.org/courses/use-data-open-source-python/intro-vector-data-python/vector-data-processing/spatial-joins-in-python-geopandas-shapely/