# Setup

In [1]:
# Manage paths
import pathlib
import os

# Dataframes
import pandas as pd
from datetime import datetime
import geopandas as gpd
from itertools import combinations

# Maths
import numpy as np

# Progress
from tqdm import tqdm

# Plotting
import matplotlib.pyplot as plt

# Warnings
import warnings
# NOTE(review): with no category/module argument this silences *every* warning for the rest
# of the kernel session (including pandas deprecation and chained-assignment notices).
# Narrow the filter if only one specific warning is the target.
warnings.filterwarnings("ignore")

# Project data directory: two levels above the current working directory, then rn_home/data.
# The path is derived from the CWD, so the notebook must be launched from its own folder.
data_path = pathlib.Path().resolve().parent.parent / "rn_home" / "data"
# Sub-folder holding the route-level files.
route_data_path = data_path / "route_data"

# Import data

In [91]:
# Load the raw route table from route_data_path.
# NOTE(review): presumably the ArcGIS Pro "locate along routes" export that
# prep_raw_route_data expects (RID, OBJECTID2, FMEAS, TMEAS among the first six columns) -- confirm.
route_data = pd.read_csv(route_data_path / "locate_along_routes_test.csv")

# Clean data

In [199]:
def prep_raw_route_data(data):
    """
    Clean a DataFrame of route intersection data exported from ArcGIS Pro.

    Within each route (``RID``), all records on the same road (``OBJECTID2``) are collapsed
    onto the first record of that road (lowest ``FMEAS``): the first record's ``TMEAS`` is
    replaced by the ``TMEAS`` of the last record of the run, and every other record of the
    run is flagged with ``keep = 0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Only the first six columns (by position) are used; they must include
        ``RID``, ``OBJECTID2``, ``FMEAS`` and ``TMEAS``. ``RID`` is a string of the form
        ``"<from id> - <to id>"``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The first six columns plus ``from_od``, ``to_od`` and ``keep``, sorted by ``FMEAS``.
        Duplicate rows are *flagged*, not removed: filter on ``keep == 1`` to drop them.
        The index is the position in the (RID, OBJECTID2, FMEAS)-sorted table.
    """
    # Work on a copy restricted to the relevant columns so the caller's frame is untouched
    data = data.iloc[:, 0:6].copy()

    # Nothing to merge: return an empty frame with the same columns as the normal path
    if data.empty:
        data["from_od"] = pd.Series(dtype="object")
        data["to_od"] = pd.Series(dtype="object")
        data["keep"] = pd.Series(dtype="int64")
        return data

    # Split the route ID into origin / destination. Splitting on the separator (instead of
    # fixed character offsets) and stripping avoids stray spaces around the two IDs.
    od_parts = data["RID"].str.split("-", n=1)
    data["from_od"] = od_parts.str[0].str.strip()
    data["to_od"] = od_parts.str[1].str.strip()
    data["keep"] = 1

    # Create an organized DF sorted by the Route ID, then the road, then the from measure.
    # NOTE: because the road is sorted before the measure, separate visits to the same road
    # within one route end up adjacent and are merged into a single run.
    data = data.sort_values(["RID", "OBJECTID2", "FMEAS"]).reset_index(drop=True)

    # A row starts a new run when its (RID, OBJECTID2) differs from the previous row's.
    # The first row always starts a run (it is compared against the NaN produced by shift).
    run_keys = data[["RID", "OBJECTID2"]]
    starts_run = run_keys.ne(run_keys.shift()).any(axis=1)
    # A row ends a run when the next row starts one; the last row always ends a run.
    ends_run = starts_run.shift(-1, fill_value=True)

    # Runs have exactly one start and one end, in the same order, so the two selections
    # line up position by position: stretch each first row to the end of its run.
    data.loc[starts_run, "TMEAS"] = data.loc[ends_run, "TMEAS"].to_numpy()

    # Every non-first row of a run is a repeat of the same road
    data.loc[~starts_run, "keep"] = 0

    return data.sort_values("FMEAS")

In [201]:
# Pull out a single route (one origin/destination pair) as a small fixture for checking the cleaning step.
test = route_data[route_data["RID"] == "090093614012 - 090091847002"].sort_values("FMEAS")

In [202]:
# Run the cleaning step on the single-route fixture.
test_out = prep_raw_route_data(test)

69it [00:00, 7767.86it/s]


In [203]:
# Row count of the result. prep_raw_route_data only flags repeats through the `keep` column
# and returns every row, so this equals the input row count; count `keep == 1` rows to see
# how many records remain after merging.
len(test_out)

69