# Extract the FIM HF reaches which have corresponding SWORD reaches, and join them with the attributes of corresponding SWORD and IRIS reaches 

## Setting the output paths and local variables [Modify based on your case. The lines you will most likely need to modify are marked with a comment: [Replace/Redefine...]]

In [1]:
import arcpy
import os
import pandas as pd
import numpy as np

# Allow geoprocessing tools to overwrite existing outputs, so the notebook can be re-run
arcpy.env.overwriteOutput = True

# Geodatabase paths that receive the outputs. The functions below read these
# module-level names directly, so keep the variable names unchanged.
# Paths 1-4 hold the intermediate reach and point feature classes; you can use separate geodatabases or a single shared one
# Paths 5-6 hold all extracted FIM HF flowlines and those with joined SWORD/IRIS attributes, respectively
# You can rename the geodatabases (.gdb)
# [Replace with your own paths]
output_gdb_sword = r"...\Intermediate_SWORD.gdb"
output_gdb_point = r"...\Intermediate_Points.gdb"
output_gdb_reach = r"...\Intermediate_Reaches.gdb"
output_gdb_reach_point_perp = r"...\Intermediate_Reaches_Points_Perp.gdb" # Intermediate outputs produced when the perpendicular lines are used to extract the target flowlines (Step 4)
output_gdb_final_reach = r"...\Extracted_Final_Reaches.gdb"
output_gdb_final_reach_iris = r"...\Extracted_Final_Reaches_Join_IRIS_SWORD.gdb"

# Use the final (joined) geodatabase as the default arcpy workspace
arcpy.env.workspace = output_gdb_final_reach_iris

## Step 1: First extract FIM HF flowlines within SWORD flowline buffer [Normally no need to adjust]

In [4]:
def extract_stream_network_within_sword_buffer(huc_id, original_fc, sword_fc, buffer_distance, unique_id):
    """Extract the reaches of ``original_fc`` whose midpoints lie within a buffer of ``sword_fc``.

    Intermediate feature classes are written to the module-level geodatabases
    ``output_gdb_point``, ``output_gdb_sword`` and ``output_gdb_reach``.

    Args:
        huc_id: Identifier appended to every output / layer name.
        original_fc: Flowline feature class to extract reaches from.
        sword_fc: SWORD flowline feature class that is buffered.
        buffer_distance: Linear-unit string passed to the Buffer tool,
            e.g. ``"100 Meters"``; its first token is used in the buffer name.
        unique_id: Name of the unique reach identifier field in ``original_fc``.

    Returns:
        Path of the feature class holding the extracted reaches. It is empty
        when no midpoint falls inside the buffer.

    Raises:
        ValueError: If ``unique_id`` is not a field of ``original_fc``.
    """
    # Resolve the ID field first so a wrong field name fails fast with a clear
    # message instead of an unbound `where_clause` further down.
    id_field = next((f for f in arcpy.ListFields(original_fc) if f.name == unique_id), None)
    if id_field is None:
        raise ValueError(f"Field '{unique_id}' was not found in {original_fc}")

    # Step 1: Create a midpoint for each reach
    midpoints_fc = f"{output_gdb_point}/midpoints_{huc_id}"
    arcpy.management.FeatureVerticesToPoints(original_fc, midpoints_fc, "MID")

    # Step 2: Buffer the SWORD stream network.
    # Only the numeric token of the distance (e.g. '100' from '100 Meters') goes into the name.
    # To reuse an existing buffer instead, skip the Buffer call and point
    # `sword_buffer_fc` at that feature class.
    buffer_distance_value = buffer_distance.split()[0]
    sword_buffer_fc = f"{output_gdb_sword}/sword_buffer_{buffer_distance_value}m"
    arcpy.analysis.Buffer(sword_fc, sword_buffer_fc, buffer_distance)

    # Step 3: Keep only the midpoints that fall within the SWORD buffer
    selected_midpoints_fc = f"{output_gdb_point}/selected_midpoints_{huc_id}"
    arcpy.analysis.SpatialJoin(midpoints_fc, sword_buffer_fc, selected_midpoints_fc, join_type="KEEP_COMMON", match_option="WITHIN")

    # Step 4: Collect the distinct reach IDs of the selected midpoints
    # (the `with` block releases the cursor; NULL IDs cannot be matched and are skipped)
    with arcpy.da.SearchCursor(selected_midpoints_fc, unique_id) as cursor:
        matched_ids = sorted({row[0] for row in cursor if row[0] is not None})

    if not matched_ids:
        # "IN ()" is not valid SQL; an always-false clause yields an empty
        # layer so the caller can detect the "no intersection" situation.
        where_clause = "1 = 0"
    elif id_field.type == "String":
        # Text IDs must be quoted; embedded single quotes are doubled.
        quoted_ids = ",".join("'{}'".format(str(fid).replace("'", "''")) for fid in matched_ids)
        where_clause = f"{unique_id} IN ({quoted_ids})"
    else:
        where_clause = f"{unique_id} IN ({','.join(map(str, matched_ids))})"

    # Select the original stream reaches that correspond to the selected midpoints
    arcpy.management.MakeFeatureLayer(original_fc, f"original_lyr_{huc_id}", where_clause)

    # Copy the selected reaches to the output feature class
    output_fc = f"{output_gdb_reach}/extracted_reaches_{huc_id}"
    arcpy.management.CopyFeatures(f"original_lyr_{huc_id}", output_fc)

    print(f"Step 1: First extracted flowlines saved to {output_fc}")

    return output_fc

## Step 2: Trace downstream from each first-extracted flowline to recover the flowlines that were not initially extracted [Normally no need to adjust]

In [5]:
def reconstruct_disconnected_segments(extract_stream_fc, unique_ids_current_nextDown, unique_id, unique_id_nextDown, original_fc, huc_id):
    """Trace downstream from the extracted reaches and add the reaches missing in between.

    A "disconnected end" is a next-downstream ID referenced by an extracted
    reach that is not itself among the extracted reaches. From each such ID the
    function follows ``unique_id_nextDown`` through ``original_fc`` until it
    reaches an already extracted reach, a NULL ID, an ID absent from
    ``original_fc``, or an ID already visited in this trace.

    Outputs are written to the module-level geodatabase ``output_gdb_reach``.

    Args:
        extract_stream_fc: Feature class of the first extracted reaches.
        unique_ids_current_nextDown: Two field names, [reach ID, next-downstream ID],
            read from ``extract_stream_fc``.
        unique_id: Name of the reach ID field in ``original_fc``.
        unique_id_nextDown: Name of the next-downstream ID field in ``original_fc``.
        original_fc: Full flowline feature class of this HUC.
        huc_id: Identifier appended to every output / layer name.

    Returns:
        Path of the merged feature class (extracted reaches + missing reaches).

    Raises:
        ValueError: If ``unique_id`` is not a field of ``original_fc``.
    """
    # The field type is loop-invariant: look it up once instead of on every trace step.
    id_field = next((f for f in arcpy.ListFields(original_fc) if f.name == unique_id), None)
    if id_field is None:
        raise ValueError(f"Field '{unique_id}' was not found in {original_fc}")
    unique_id_field_type = id_field.type

    def to_sql_literal(value):
        # Text IDs are quoted (single quotes doubled); other types are written as-is.
        if unique_id_field_type == "String":
            return "'{}'".format(str(value).replace("'", "''"))
        return str(value)

    # Step 1: Identify the ends of disconnected segments.
    # Sets give O(1) membership tests; NULL downstream IDs cannot be traced and are skipped.
    extracted_ids = set()
    downstream_ids = set()
    with arcpy.da.SearchCursor(extract_stream_fc, unique_ids_current_nextDown) as cursor:
        for row in cursor:
            extracted_ids.add(row[0])
            if row[1] is not None:
                downstream_ids.add(row[1])

    # Downstream IDs that are referenced but were not extracted themselves
    disconnected_ends = sorted(downstream_ids - extracted_ids)

    # Step 2: Walk downstream from every disconnected end and collect the missing reaches
    disconnected_segments = []
    for end_id in disconnected_ends:
        current_id = end_id
        missing_reaches = []
        visited = set()  # guards against cycles / self-referencing downstream IDs

        while current_id is not None and current_id not in extracted_ids and current_id not in visited:
            visited.add(current_id)
            where_clause = f"{unique_id} = {to_sql_literal(current_id)}"

            found_row = False
            next_id = None
            with arcpy.da.SearchCursor(original_fc, [unique_id, unique_id_nextDown], where_clause) as cursor:
                for row in cursor:
                    # Only the first matching record is used
                    found_row = True
                    next_id = row[1]
                    break

            if not found_row:
                break  # the ID does not exist in the original network (e.g. an outlet marker)

            missing_reaches.append(current_id)
            current_id = next_id  # move to the next downstream reach

        if missing_reaches:
            disconnected_segments.append(missing_reaches)

    # Step 3: Add missing reaches to the extracted network
    missing_reaches_fc = f"{output_gdb_reach}/missing_reaches_{huc_id}"
    arcpy.management.MakeFeatureLayer(original_fc, f"original_lyr_{huc_id}")

    for segment in disconnected_segments:
        where_clause = f"{unique_id} IN ({','.join(to_sql_literal(fid) for fid in segment)})"
        arcpy.management.SelectLayerByAttribute(f"original_lyr_{huc_id}", "ADD_TO_SELECTION", where_clause)

    # NOTE: when no missing reach was found, nothing is selected and CopyFeatures
    # copies every feature of original_fc. process_flowlines (Step 3) relies on
    # the resulting feature count to detect its "Situation 2", so this behaviour
    # is kept on purpose.
    arcpy.management.CopyFeatures(f"original_lyr_{huc_id}", missing_reaches_fc)

    # Merge the missing reaches back into the extracted stream network
    final_fc = f"{output_gdb_reach}/traced_extracted_reaches_{huc_id}"
    arcpy.management.Merge([extract_stream_fc, missing_reaches_fc], final_fc)

    print(f"Step 2: Traced and first extracted flowlines saved to {final_fc}")

    return final_fc

## Step 3: Check for abnormal situations [Modify based on your case]

### Situation 1: If count of extracted_reaches == 0 --> no SWORD flowline intersects with FIM HF flowlines in this HUC --> do not take any FIM HF flowlines
### Situation 2: If count of traced_extracted_reaches = count of extracted_reaches + count of all original FIM HF flowlines in this HUC --> the extracted_reaches already contain all the FIM HF flowlines corresponding to SWORD flowlines in this HUC --> just use the extracted_reaches (this count arises from a limitation of our downstream tracing algorithm in Step 2: when it finds no missing reach, it copies every original flowline)
### Situation 3: If not the previous two cases, suggesting there still are possibly upstream reaches and others to be extracted.

In [6]:
def process_flowlines(extract_stream_fc, traced_extracted_reaches, original_fc, huc_id, duplicate_fields=("HydroID", "From_Node", "To_Node")):
    """Classify the result of Steps 1-2 and, when already complete, save the final reaches.

    Situations (decided from feature counts only):
        * ``"no_intersection"``: ``extract_stream_fc`` is empty.
        * ``"all_extracted"``: the traced count equals extracted + original
          counts. The extracted reaches are copied to
          ``all_extracted_reaches_{huc_id}`` in ``output_gdb_final_reach`` and
          de-duplicated on ``duplicate_fields``.
        * ``"extract_upstream_others"``: anything else; Step 4 is still needed.

    Args:
        extract_stream_fc: Feature class of the first extracted reaches (Step 1).
        traced_extracted_reaches: Feature class produced by Step 2.
        original_fc: Full flowline feature class of this HUC.
        huc_id: Identifier appended to the output name.
        duplicate_fields: Fields compared by Delete Identical
            [Replace based on your case].

    Returns:
        One of the three situation strings listed above.
    """
    count_extract = int(arcpy.management.GetCount(extract_stream_fc)[0])

    # Situation 1: nothing was extracted, so there is nothing further to process
    if count_extract == 0:
        print("Step 3: There may be no intersection of FIM HF and SWORD flowlines in this HUC.")
        return "no_intersection"

    count_trace_extract = int(arcpy.management.GetCount(traced_extracted_reaches)[0])
    count_original = int(arcpy.management.GetCount(original_fc)[0])

    # Situation 3: more flowlines still need to be extracted
    if count_trace_extract != count_extract + count_original:
        print("Step 3: Maybe the most upstream flowlines (and other) still need to be extracted using Step 4.")
        return "extract_upstream_others"

    # Situation 2: the extracted reaches are already complete; copy and de-duplicate them
    output_fc = f"{output_gdb_final_reach}\\all_extracted_reaches_{huc_id}"
    arcpy.management.CopyFeatures(extract_stream_fc, output_fc)

    # Outputs are added to the map only while the final result is de-duplicated.
    # The reset used to sit after `return` and never ran; `finally` guarantees it.
    arcpy.env.addOutputsToMap = True
    try:
        arcpy.management.DeleteIdentical(output_fc, list(duplicate_fields))
    finally:
        arcpy.env.addOutputsToMap = False

    print(f"Step 3: All corresponding flowlines extracted have been removed duplicate records and saved to {output_fc}")

    return "all_extracted"

## Step 4: Extract the (possibly) upstream (and other) flowlines that have not yet been extracted

### Step 4.1: Remove the perpendicular lines intersected with traced_extracted_reaches [Normally no need to adjust]

In [7]:
def remove_touching_perpendiculars(traced_extracted_reaches, perpendicular_lines, huc_id):
    """Copy the perpendicular lines and drop those intersecting the traced reaches.

    The input ``perpendicular_lines`` is left untouched; the work is done on a
    copy named ``perp_lines_notcross_{huc_id}`` in ``output_gdb_reach_point_perp``.

    Args:
        traced_extracted_reaches: Flowlines already extracted by Steps 1-2.
        perpendicular_lines: Feature class of perpendicular lines.
        huc_id: Identifier appended to the output name.

    Returns:
        Path of the copy holding the remaining perpendicular lines.
    """
    layer_name = "perpendicular_lines_layer"
    remaining_perp_fc = f"{output_gdb_reach_point_perp}\\perp_lines_notcross_{huc_id}"

    # Work on a copy exposed through a feature layer, so features can be selected and deleted
    arcpy.management.CopyFeatures(perpendicular_lines, remaining_perp_fc)
    arcpy.management.MakeFeatureLayer(remaining_perp_fc, layer_name)

    # Select every perpendicular line that intersects an already extracted flowline
    arcpy.management.SelectLayerByLocation(
        layer_name,
        overlap_type="INTERSECT",
        select_features=traced_extracted_reaches,
        selection_type="NEW_SELECTION",
    )

    # Delete only when something is selected
    # NOTE(review): presumably this guard prevents deleting every line when the selection is empty; confirm
    selected_count = int(arcpy.management.GetCount(layer_name)[0])
    if selected_count > 0:
        arcpy.management.DeleteFeatures(layer_name)

    return remaining_perp_fc

### Step 4.2: Do the intersect join of all original FIM HF flowlines of this HUC and the remaining perpendicular lines [Normally no need to adjust]

In [8]:
# Function to perform the spatial join for a given HUC ID
def perform_spatial_join_with_perp(huc_id, original_fc, perp_lines_notcross):
    """Spatially join the original flowlines with the remaining perpendicular lines.

    Every flowline is kept (``KEEP_ALL``) and matched one-to-one on
    ``INTERSECT``. The result is written to ``output_gdb_reach_point_perp`` as
    ``original_reaches_{huc_id}_join_perp``.

    Args:
        huc_id: Identifier inserted into the output name.
        original_fc: Full flowline feature class of this HUC (join target).
        perp_lines_notcross: Perpendicular lines left after Step 4.1 (join features).

    Returns:
        Path of the spatial-join output feature class.
    """
    joined_name = f"original_reaches_{huc_id}_join_perp"
    joined_fc = os.path.join(output_gdb_reach_point_perp, joined_name)

    join_options = {
        "join_operation": "JOIN_ONE_TO_ONE",
        "join_type": "KEEP_ALL",      # keep flowlines without any match too
        "match_option": "INTERSECT",  # match on intersection
    }
    arcpy.analysis.SpatialJoin(original_fc, perp_lines_notcross, joined_fc, **join_options)

    return joined_fc

### Step 4.3: Set filter to find only the really needed reaches (e.g. most upstream, the main stream reaches corresponding to SWORD reaches) [Modify based on your case]

In [9]:
def filter_intersected_reaches(huc_id, ori_reaches_join_perp, where_clause=None):
    """Select the joined reaches that satisfy the filter rule and save them.

    Args:
        huc_id: Identifier appended to the layer and output names.
        ori_reaches_join_perp: Spatial-join output of Step 4.2; the default
            rule reads its ``Join_Count`` and ``order_`` fields.
        where_clause: Optional SQL expression overriding the default rule
            ``Join_Count >= 3 OR (Join_Count >= 1 AND order_ > 2)``.
            [Redefine the filter rule based on your case, after some local
            inspection; e.g. ``'("Join_Count" >= 3)'`` is a stricter rule.]

    Returns:
        Path of the feature class holding the selected reaches
        (in ``output_gdb_reach_point_perp``).
    """
    if where_clause is None:
        where_clause = """("Join_Count" >= 3) OR ("Join_Count" >= 1 AND "order_" > 2)"""

    # Output path for the selected features
    output_fc = f"{output_gdb_reach_point_perp}/original_reaches_{huc_id}_join_perp_Seled_count3"

    # Create a feature layer from the spatial-join output so it can be queried
    layer_name = f"joined_layer_{huc_id}"
    arcpy.management.MakeFeatureLayer(ori_reaches_join_perp, layer_name)

    # Apply the filter rule, then save the selected features to a new feature class
    arcpy.management.SelectLayerByAttribute(layer_name, "NEW_SELECTION", where_clause)
    arcpy.management.CopyFeatures(layer_name, output_fc)

    return output_fc

### Step 4.4: Combine all the extracted reaches, including traced_extracted_reaches (Steps 1-2) and join_perp_Seled (Steps 4.1-4.3) [Modify based on your case]

In [10]:
def merge_and_remove_duplicates(traced_extracted_reaches, huc_id, join_perp_Seled_reaches=None, original_fc=None, duplicate_fields=("HydroID", "From_Node", "To_Node")):
    """Merge the reaches from Steps 1-2 and Step 4.3, de-duplicate, and drop extra fields.

    The result is written to ``all_extracted_reaches_{huc_id}`` in
    ``output_gdb_final_reach``. Fields absent from ``original_fc`` are deleted
    from the merged output, except geometry/ID bookkeeping fields.

    Args:
        traced_extracted_reaches: Feature class produced by Step 2.
        huc_id: Identifier appended to the output name.
        join_perp_Seled_reaches: Feature class produced by Step 4.3. When
            omitted, the module-level variable of the same name is used.
        original_fc: Full flowline feature class of this HUC. When omitted,
            the module-level variable of the same name is used.
        duplicate_fields: Fields compared by Delete Identical
            [Replace based on your case].

    Returns:
        Path of the merged, cleaned feature class.

    Raises:
        ValueError: If an input is neither passed in nor defined at module level.
    """
    # Earlier versions read these two inputs from notebook globals; keep that
    # as a fallback so existing two-argument calls still work.
    if join_perp_Seled_reaches is None:
        join_perp_Seled_reaches = globals().get("join_perp_Seled_reaches")
    if original_fc is None:
        original_fc = globals().get("original_fc")
    if join_perp_Seled_reaches is None or original_fc is None:
        raise ValueError(
            "join_perp_Seled_reaches and original_fc must be passed in "
            "or defined at module level before calling merge_and_remove_duplicates"
        )

    # Define output feature class
    all_extracted_reaches = f"{output_gdb_final_reach}\\all_extracted_reaches_{huc_id}"

    # Merge the two feature classes
    arcpy.management.Merge([join_perp_Seled_reaches, traced_extracted_reaches], all_extracted_reaches)
    print(f"Step 4: Merged all features saved to {all_extracted_reaches}")

    # Enable automatic addition of outputs to the map while the final result is
    # cleaned, then restore the previous setting even if a tool fails.
    previous_add_outputs = arcpy.env.addOutputsToMap
    arcpy.env.addOutputsToMap = True
    try:
        # Remove duplicate records based on the specified fields
        arcpy.management.DeleteIdentical(all_extracted_reaches, list(duplicate_fields))

        # Fields present in the merged output but not in the original flowlines
        # are removed; bookkeeping and required fields are always kept.
        # [Replace the protected field names based on your case, if needed]
        protected_fields = {'Shape_Length', 'Shape', 'geom_Length', 'OBJECTID'}
        original_fields = {field.name for field in arcpy.ListFields(original_fc)}
        fields_to_remove = [
            field.name
            for field in arcpy.ListFields(all_extracted_reaches)
            if field.name not in original_fields
            and field.name not in protected_fields
            and not field.required
        ]

        if fields_to_remove:
            arcpy.management.DeleteField(all_extracted_reaches, fields_to_remove)
    finally:
        arcpy.env.addOutputsToMap = previous_add_outputs

    return all_extracted_reaches

## Step 5: Join reach attributes of IRIS, SWORD to all extracted reaches [Modify based on your case]

In [11]:
# [You may still need to replace some specific field names based on your case in the following lines]

def join_sword_iris_to_reaches(huc_id, all_extract_reaches, sword_fc, iris_fc, iris_reach_id_field, sword_reach_id_field="reach_id"):
    """Join SWORD and IRIS attributes to the extracted reaches and export the result.

    Each reach is linked to its nearest SWORD reach (measured from the reach
    midpoint with the Near tool); IRIS is then joined through the SWORD reach
    ID. The joined layer is exported to ``output_gdb_final_reach_iris`` as a
    feature class and as a table.

    Note: this adds a ``NEAR_FID`` field to ``all_extract_reaches``.

    Args:
        huc_id: Identifier appended to every output name.
        all_extract_reaches: Feature class of all extracted reaches.
        sword_fc: SWORD flowline feature class.
        iris_fc: IRIS feature class.
        iris_reach_id_field: Reach ID field in ``iris_fc`` matching the SWORD reach ID.
        sword_reach_id_field: Reach ID field in ``sword_fc``
            [Replace with that in your data].

    Returns:
        Path of the exported feature class with joined attributes.
    """
    # Disable automatic addition of outputs to the map for the intermediate steps
    arcpy.env.addOutputsToMap = False

    # Step 5.1: Generate midpoints for all extracted reaches
    midpoints_name = f"all_extracted_reaches_midp_{huc_id}"
    all_reach_midpoints_fc = f"{output_gdb_point}/{midpoints_name}"
    arcpy.management.FeatureVerticesToPoints(all_extract_reaches, all_reach_midpoints_fc, "MID")

    # Step 5.2: Find the closest SWORD reach for each midpoint (writes NEAR_FID on the midpoints)
    arcpy.analysis.Near(all_reach_midpoints_fc, sword_fc, location="NO_LOCATION", angle="NO_ANGLE", method="PLANAR")

    # Step 5.2.1: Add a field on the reaches to receive NEAR_FID
    arcpy.management.AddField(all_extract_reaches, "NEAR_FID", "BIGINTEGER")

    # Step 5.2.2: Join midpoints to reaches (reach OBJECTID = midpoint ORIG_FID)
    joined_layer = arcpy.management.AddJoin(all_extract_reaches, "OBJECTID", all_reach_midpoints_fc, "ORIG_FID")

    # Copy NEAR_FID from the midpoints to the reaches, then drop the midpoint join
    arcpy.management.CalculateField(joined_layer, "NEAR_FID", f"!{midpoints_name}.NEAR_FID!", "PYTHON3")
    arcpy.management.RemoveJoin(joined_layer, midpoints_name)

    # Step 5.3: Join SWORD attributes to the extracted reaches using NEAR_FID.
    # The object ID field and table name are read from the data instead of
    # assuming 'OBJECTID' / 'main_reaches', so any SWORD feature class works.
    sword_desc = arcpy.Describe(sword_fc)
    arcpy.management.AddJoin(joined_layer, "NEAR_FID", sword_fc, sword_desc.OIDFieldName)

    # Step 5.4: Join IRIS through the SWORD reach ID (qualified by the SWORD table name)
    sword_reach_id_qualified = f"{sword_desc.baseName}.{sword_reach_id_field}"
    arcpy.management.AddJoin(joined_layer, sword_reach_id_qualified, iris_fc, iris_reach_id_field)

    # Step 5.5: Export the joined layer to a feature class in the final geodatabase
    output_filepath = os.path.join(output_gdb_final_reach_iris, f"all_extracted_reaches_iris_{huc_id}")

    # Add only the final output to the map; always switch the setting back off
    arcpy.env.addOutputsToMap = True
    try:
        arcpy.conversion.ExportFeatures(joined_layer, output_filepath)
    finally:
        arcpy.env.addOutputsToMap = False
    print(f"Step 5: All extracted reaches joined with IRIS saved to {output_filepath}")

    # Enable overwriting of output files
    arcpy.env.overwriteOutput = True

    # Export the attribute table. The path is inside the geodatabase and has no
    # .csv extension, so despite its name the output is stored there as a table.
    output_csv = os.path.join(output_gdb_final_reach_iris, f"csv_all_extracted_reaches_iris_{huc_id}")
    arcpy.conversion.ExportTable(output_filepath, output_csv)

    return output_filepath

# Running all the functions [Modify based on your case]

In [14]:
# Driver cell: runs Steps 1-5 for one HUC. The variables defined here are module-level;
# merge_and_remove_duplicates reads `join_perp_Seled_reaches` and `original_fc` by name,
# so do not rename them.

# Disable automatic addition of outputs to the map
arcpy.env.addOutputsToMap = False

## Step 1: First extracted reaches using SWORD buffer
huc_id = "03020201"   # Unique HUC8 id, appended to every output name [Useful when you iteratively process multiple datasets; replace with your own (data) id]
# FIM hydrofabric flowline (reach) feature class of this HUC [Replace with your path]
original_fc = rf"...\InputGeoData.gdb/main_demDerived_reaches_split_filtered_addedAttributes_crosswalked_0"
unique_id = "HydroID"   # Unique identifier field of the FIM flowlines [Replace with that in your data]

# SWORD flowlines path, i.e. the flowlines you want to join to your target flowlines (FIM HF flowlines here) [Replace with yours]
sword_fc = r"...\InputGeoData.gdb/main_reaches"
buffer_distance = "100 Meters"   # Buffer distance around the SWORD flowlines, as "<value> <unit>" [Replace with the one you want to specify]

extract_stream_fc = extract_stream_network_within_sword_buffer(huc_id, original_fc, sword_fc, buffer_distance, unique_id)

## Step 2: Trace downstream to find the reaches that were not initially extracted
unique_id_nextDown = "NextDownID"  # Field holding the unique identifier of the next downstream FIM flowline [Replace with that in your data]
unique_ids_current_nextDown = [unique_id, unique_id_nextDown] # Field names: [flowline ID, next-downstream flowline ID]
traced_extracted_reaches = reconstruct_disconnected_segments(extract_stream_fc, unique_ids_current_nextDown, unique_id, unique_id_nextDown, original_fc, huc_id)

## Step 3: Deal with the abnormal situations
situation = process_flowlines(extract_stream_fc, traced_extracted_reaches, original_fc, huc_id)

## Step 4: Find upstream and other reaches using perpendicular lines of SWORD reaches
if situation == "no_intersection":
    
    # Nothing was extracted in this HUC, so Steps 4 and 5 are skipped
    pass

elif situation == "all_extracted" or situation == "extract_upstream_others":
    
    # Step 4 runs only when more reaches are needed; for "all_extracted",
    # process_flowlines has already written all_extracted_reaches_{huc_id}
    if situation == "extract_upstream_others":

        # Step 4.1: Remove the perpendicular lines that intersect already extracted FIM flowlines
        # Perpendicular lines of SWORD reaches are difficult to generate in ArcGIS, so they were generated separately in Python and the resulting feature class is used here
        # The Python code that generates the perpendicular lines is provided as 'Step4.0_PerpendicularLines_Generation.py' in this GitHub repository
        perpendicular_lines = r'...\InputGeoData.gdb\perpendicular_lines_len0006_space_sword_nodes_cleaned'  # [Replace with your actual path]
        perp_lines_notcross = remove_touching_perpendiculars(traced_extracted_reaches, perpendicular_lines, huc_id)
    
        # Step 4.2: Join the remaining perpendicular lines with the original FIM flowlines
        ori_reaches_join_perp = perform_spatial_join_with_perp(huc_id, original_fc, perp_lines_notcross)
    
        # Step 4.3: Filter the joined FIM flowlines to get the wanted ones
        join_perp_Seled_reaches = filter_intersected_reaches(huc_id, ori_reaches_join_perp)
    
        # Step 4.4: Merge all the flowlines extracted in the different steps
        merge_and_remove_duplicates(traced_extracted_reaches, huc_id)

    ## Step 5: Join IRIS and SWORD to all extracted reaches
    all_extract_reaches = f"{output_gdb_final_reach}\\all_extracted_reaches_{huc_id}"   # Final reaches written by Step 3 or Step 4.4 [Modify this if necessary]
    iris_fc = r"...\InputGeoData.gdb/IRIS_v26"   # The flowlines you want to link via the common flowlines, i.e. IRIS here [Replace with yours if necessary]
    iris_reach_id_field = "reach_id"  # The common field between SWORD and IRIS [Replace with that in your data, if necessary]
    join_sword_iris_to_reaches(huc_id, all_extract_reaches, sword_fc, iris_fc, iris_reach_id_field)

Step 1: First extracted flowlines saved to Z:\Documents\RiverSlope\Paper_NewRiverSlopeDataset\Codes_Uploaded\Demonstration_SpatialJoin\Intermediate_Reaches.gdb/extracted_reaches_01020004
Step 2: Traced and first extracted flowlines saved to Z:\Documents\RiverSlope\Paper_NewRiverSlopeDataset\Codes_Uploaded\Demonstration_SpatialJoin\Intermediate_Reaches.gdb/traced_extracted_reaches_01020004
Step 3: Maybe the most upstream flowlines (and other) still need to be extracted using Step 4.
Step 4: Merged all features saved to Z:\Documents\RiverSlope\Paper_NewRiverSlopeDataset\Codes_Uploaded\Demonstration_SpatialJoin\Extracted_Final_Reaches.gdb\all_extracted_reaches_01020004
Step 5: All extracted reaches joined with IRIS saved to Z:\Documents\RiverSlope\Paper_NewRiverSlopeDataset\Codes_Uploaded\Demonstration_SpatialJoin\Extracted_Final_Reaches_Join_IRIS_SWORD.gdb\all_extracted_reaches_iris_01020004
