In [2]:
import pandas as pd

In [3]:
original_df = pd.read_csv("../datasets/raw/Scats_Data_October_2006.csv", skiprows=[0])
original_df.head

<bound method NDFrame.head of       SCATS Number                         Location CD_MELWAY  NB_LATITUDE  \
0              970  WARRIGAL_RD N of HIGH STREET_RD   060 G10    -37.86703   
1              970  WARRIGAL_RD N of HIGH STREET_RD   060 G10    -37.86703   
2              970  WARRIGAL_RD N of HIGH STREET_RD   060 G10    -37.86703   
3              970  WARRIGAL_RD N of HIGH STREET_RD   060 G10    -37.86703   
4              970  WARRIGAL_RD N of HIGH STREET_RD   060 G10    -37.86703   
...            ...                              ...       ...          ...   
4187          4821      VICTORIA_ST W OF BURNLEY_ST   002HF02    -37.81296   
4188          4821      VICTORIA_ST W OF BURNLEY_ST   002HF02    -37.81296   
4189          4821      VICTORIA_ST W OF BURNLEY_ST   002HF02    -37.81296   
4190          4821      VICTORIA_ST W OF BURNLEY_ST   002HF02    -37.81296   
4191          4821      VICTORIA_ST W OF BURNLEY_ST   002HF02    -37.81296   

      NB_LONGITUDE  HF VicRoads I

In [4]:
df_unique = original_df[["SCATS Number", "Location"]].drop_duplicates()

df_unique.head(15)

Unnamed: 0,SCATS Number,Location
0,970,WARRIGAL_RD N of HIGH STREET_RD
31,970,HIGH STREET_RD E of WARRIGAL_RD
62,970,WARRIGAL_RD S of HIGH STREET_RD
93,970,HIGH STREET_RD W of WARRIGAL_RD
123,2000,WARRIGAL_RD N of TOORAK_RD
151,2000,BURWOOD_HWY E of WARRIGAL_RD
180,2000,WARRIGAL_RD S of BURWOOD_HWY
208,2000,TOORAK_RD W of WARRIGAL_RD
237,2200,UNION_RD N of MAROONDAH_HWY
268,2200,MAROONDAH_HWY E of UNION_RD


In [5]:
import re

def parse_location(location):
    location = location.strip().upper()

    # Try more precise directions first
    directions = [" NW OF ", " NE OF ", " SW OF ", " SE OF ",
                  " N OF ", " S OF ", " E OF ", " W OF "]

    for dir_phrase in directions:
        if dir_phrase in location:
            parts = location.split(dir_phrase)
            if len(parts) == 2:
                main_road = parts[0].strip()
                cross_road = parts[1].strip()
                direction = dir_phrase.strip().split()[0]  # e.g., "NW"
                return main_road, direction, cross_road

    return None, None, None



In [6]:
df_unique[["Main Road", "Direction", "Cross Road"]] = df_unique["Location"].apply(
    lambda loc: pd.Series(parse_location(loc))
)


df_unique.head(15)

Unnamed: 0,SCATS Number,Location,Main Road,Direction,Cross Road
0,970,WARRIGAL_RD N of HIGH STREET_RD,WARRIGAL_RD,N,HIGH STREET_RD
31,970,HIGH STREET_RD E of WARRIGAL_RD,HIGH STREET_RD,E,WARRIGAL_RD
62,970,WARRIGAL_RD S of HIGH STREET_RD,WARRIGAL_RD,S,HIGH STREET_RD
93,970,HIGH STREET_RD W of WARRIGAL_RD,HIGH STREET_RD,W,WARRIGAL_RD
123,2000,WARRIGAL_RD N of TOORAK_RD,WARRIGAL_RD,N,TOORAK_RD
151,2000,BURWOOD_HWY E of WARRIGAL_RD,BURWOOD_HWY,E,WARRIGAL_RD
180,2000,WARRIGAL_RD S of BURWOOD_HWY,WARRIGAL_RD,S,BURWOOD_HWY
208,2000,TOORAK_RD W of WARRIGAL_RD,TOORAK_RD,W,WARRIGAL_RD
237,2200,UNION_RD N of MAROONDAH_HWY,UNION_RD,N,MAROONDAH_HWY
268,2200,MAROONDAH_HWY E of UNION_RD,MAROONDAH_HWY,E,UNION_RD


In [7]:
df_cleaned = df_unique.dropna(subset=["Main Road", "Cross Road"])

df_cleaned

Unnamed: 0,SCATS Number,Location,Main Road,Direction,Cross Road
0,970,WARRIGAL_RD N of HIGH STREET_RD,WARRIGAL_RD,N,HIGH STREET_RD
31,970,HIGH STREET_RD E of WARRIGAL_RD,HIGH STREET_RD,E,WARRIGAL_RD
62,970,WARRIGAL_RD S of HIGH STREET_RD,WARRIGAL_RD,S,HIGH STREET_RD
93,970,HIGH STREET_RD W of WARRIGAL_RD,HIGH STREET_RD,W,WARRIGAL_RD
123,2000,WARRIGAL_RD N of TOORAK_RD,WARRIGAL_RD,N,TOORAK_RD
...,...,...,...,...,...
4042,4812,SWAN_ST SW of MADDEN_GV,SWAN_ST,SW,MADDEN_GV
4068,4821,WALMER_ST N OF VICTORIA_ST,WALMER_ST,N,VICTORIA_ST
4099,4821,VICTORIA_ST E OF BURNLEY_ST,VICTORIA_ST,E,BURNLEY_ST
4130,4821,BURNLEY_ST S OF VICTORIA_ST,BURNLEY_ST,S,VICTORIA_ST


In [8]:
grouped = df_cleaned.groupby("Main Road")


In [9]:
edges = []

for road, group in grouped:
    sorted_group = group.sort_values("Cross Road")  # Lexical sort for adjacency
    scats_list = sorted_group["SCATS Number"].tolist()

    for i in range(len(scats_list) - 1):
        edge = (scats_list[i], scats_list[i + 1])
        edges.append(edge)

In [10]:
print("Sample edges:", edges[:10])
print("Total edges:", len(edges))

Sample edges: [(4266, 4266), (4057, 4057), (4057, 3127), (3127, 3180), (3180, 4063), (4063, 4063), (4035, 3001), (3001, 3002), (3002, 3002), (3002, 3001)]
Total edges: 92


In [12]:
edges = [tuple(edge) for edge in edges if edge[0] != edge[1]]
# stops same SCAT site on both ends


In [13]:
import json

with open("../datasets/processed/edges.json", "w") as f:
    json.dump(edges, f)
