In [99]:
import math
import pandas as pd
import geopandas as gpd
from shapely.ops import nearest_points
from itertools import combinations
from pathlib import Path
import json

In [97]:
# Load park coordinates

import pandas as pd
import geopandas as gpd


def load_parks(csv_path: str) -> gpd.GeoDataFrame:
    """
    Load park coordinates from CSV and return a GeoDataFrame with WGS84 CRS.
    """
    df = pd.read_csv(csv_path)
    df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
    df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
    df = df.dropna(subset=['latitude', 'longitude'])
    parks_df = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
        crs='EPSG:4326'
    )
    return parks_df

parks_df=load_parks('../dataset/parks_coordinates.csv')
print(parks_df[['name', 'geometry','states']].head())



                              name                geometry          states
0             Acadia National Park    POINT (-68.21 44.35)           Maine
1  National Park of American Samoa  POINT (-170.68 -14.25)  American Samoa
2             Arches National Park   POINT (-109.57 38.68)            Utah
3           Badlands National Park    POINT (-102.5 43.75)    South Dakota
4           Big Bend National Park   POINT (-103.25 29.25)           Texas


In [14]:
def load_states(states_path: str) -> gpd.GeoDataFrame:
    """
    Load US states from shapefile, filter exclusions, return GeoDataFrame.
    """
    EXCLUDED_STATES = {
        "American Samoa", "Guam", "Commonwealth of the Northern Mariana Islands",
        "Puerto Rico", "United States Virgin Islands"}
    states = gpd.read_file(states_path)
    states = states[~states['name'].isin(EXCLUDED_STATES)]
    return states
states_df=load_states('../dataset/ne_110m_admin_1_states_provinces/ne_110m_admin_1_states_provinces.shp')
print(states_df[['name', 'geometry']].head())

           name                                           geometry
0     Minnesota  POLYGON ((-89.95766 47.28691, -90.13175 47.292...
1       Montana  POLYGON ((-116.04823 49.00037, -113.0595 49.00...
2  North Dakota  POLYGON ((-97.22894 49.00089, -97.21414 48.902...
3        Hawaii  MULTIPOLYGON (((-155.93665 19.05939, -155.9080...
4         Idaho  POLYGON ((-116.04823 49.00037, -115.9678 47.95...


In [15]:
def load_timezones(timezones_path: str) :
    """
    Load time zones from shapefile, filter exclusions, return GeoDataFrame.
    """
    EXCLUDED_ZONES = {"Chamorro", "Samoa", "Atlantic"}
    tz = gpd.read_file(timezones_path)
    tz = tz[~tz['zone'].isin(EXCLUDED_ZONES)]
    return tz

tz_df=load_timezones('../dataset/NTAD_Time_Zones_467650596632424595/Time_Zones.shp')
print(states_df[['name', 'geometry']].head())

           name                                           geometry
0     Minnesota  POLYGON ((-89.95766 47.28691, -90.13175 47.292...
1       Montana  POLYGON ((-116.04823 49.00037, -113.0595 49.00...
2  North Dakota  POLYGON ((-97.22894 49.00089, -97.21414 48.902...
3        Hawaii  MULTIPOLYGON (((-155.93665 19.05939, -155.9080...
4         Idaho  POLYGON ((-116.04823 49.00037, -115.9678 47.95...


In [56]:
# input: staet row and timezone row, dirction('state' or 'zone', meaning finding relation from state to timezone or vice versa)
# output: list of relations between state and timezone

def calc_topology(state, timezone, direction):
    upper = 0.98
    lower = 0.02
    geom_state = state.geometry
    geom_other = timezone.geometry
    inter = geom_state.intersection(geom_other)
    ia = inter.area
    area_state = geom_state.area
    upper_area = upper * area_state
    lower_area = lower * area_state

    rels = []
    if ia == 0:
        if state['name'] == "Utah" and timezone.zone == "Pacific" :
            rels.append("touch")
        elif state['name'] == "Montana" and timezone.zone == "Central":
            rels.append("touch")
        else: rels.append("disjoint")
    if 0 < ia <= lower_area:
        rels.append("touch")
    if lower_area <ia <= upper_area:
        rels.append("overlaps") 
    if ia >= upper_area:
        special = {"Iowa", "Missouri", "Arkansas", "West Virginia"}
        if state['name'] in special:
            if(direction == "state"):
                rels.append("within")
            else:   
                rels.append("contains")
        elif state['name'] == "Alaska":
            if(direction == "zone"):
                rels.append("covered by")
            else:   
                rels.append("covers")
        else:
            if(direction == "state"):
                rels.append("covered by")
            else:   
                rels.append("covers")
    
    return sorted(set(rels))

In [57]:
def generate_topo_queries(df_states, df_tz):
    results = []

    # 1) state → tz
    for _, st in df_states.iterrows():
        for _, tz in df_tz.iterrows():
            rels = calc_topology(st, tz, "state")
            results.append({
                'state':     st['name'],
                'time_zone': tz['zone'],
                'query':     f"How is {st['name']} state spatially related to {tz['zone']} Time Zone area?",
                'answer':    ", ".join(rels) if rels else "none"
            })

    # 2) tz → state
    for _, tz in df_tz.iterrows():
        for _, st in df_states.iterrows():
            rels = calc_topology(st, tz, "zone")
            results.append({
                'state':     st['name'],
                'time_zone': tz['zone'],
                'query':     f"How is {tz['zone']} Time Zone Area spatially related to {st['name']} state?",
                'answer':    ", ".join(rels) if rels else "none"
            })

    for tz1, tz2 in combinations(df_tz.itertuples(), 2):
        for _, st in df_states.iterrows():
            # Compute relations against each individual TZ
            rels_tz1 = calc_topology(st, tz1, "state")
            rels_tz2 = calc_topology(st, tz2, "state")

            # If either relation list includes "within, covered by", consider it within the combined area
            if "within" in rels_tz1 or "within" in rels_tz2:
                results.append({
                    'state':     st['name'],
                    'time_zone': f"{tz1.zone} and {tz2.zone}",
                    'query':     (
                        f"How is {st['name']} state spatially related to the combined area of "
                        f"{tz1.zone} and {tz2.zone} Time Zones?"
                    ),
                    'answer':    "within"
                })
                results.append({
                    'state':     st['name'],
                    'time_zone': f"{tz1.zone} and {tz2.zone}",
                    'query':     (
                        f"How is the combined area of {tz1.zone} and {tz2.zone} Time Zones"
                        f"spatially related to {st['name']} state?"
                    ),
                    'answer':    "contains"
                })
                results.append({
                    'state':     st['name'],
                    'time_zone': f"{tz1.zone} and {tz2.zone}",
                    'query':     (
                        f"How is {st['name']} state spatially related to its state area that falls into "
                        f"the combined area of {tz1.zone} and {tz2.zone} Time Zones?"
                    ),
                    'answer':    "equals"
                })

    topo_df = pd.DataFrame(results)
    print(topo_df.head())

    topo_df[['query', 'answer']].to_json(
        '../GeoReason_T1_queries/topo_queries.json',
        orient='records',
        lines=True,
        indent=2
                )
    return results

pairs = generate_topo_queries(
    df_states=states_df,
    df_tz=tz_df
)


       state time_zone                                              query  \
0  Minnesota   Eastern  How is Minnesota state spatially related to Ea...   
1  Minnesota   Central  How is Minnesota state spatially related to Ce...   
2  Minnesota  Mountain  How is Minnesota state spatially related to Mo...   
3  Minnesota   Pacific  How is Minnesota state spatially related to Pa...   
4  Minnesota    Alaska  How is Minnesota state spatially related to Al...   

       answer  
0       touch  
1  covered by  
2    disjoint  
3    disjoint  
4    disjoint  


In [58]:
def calc_direction(parks_gdf, row) -> list:
    """
    Given row with 'park1' and 'park2', compute compass direction neighbors.
    """
    # Lookup coordinates
    p1 = parks_gdf.loc[parks_gdf['name'] == row['park1']].iloc[0]
    p2 = parks_gdf.loc[parks_gdf['name'] == row['park2']].iloc[0]
    lat1, lon1 = p1.latitude, p1.longitude
    lat2, lon2 = p2.latitude, p2.longitude

    # Calculate bearing
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    d_lambda = math.radians(lon2 - lon1)
    x = math.sin(d_lambda) * math.cos(phi2)
    y = math.cos(phi1) * math.sin(phi2) - \
        math.sin(phi1) * math.cos(phi2) * math.cos(d_lambda)
    bearing = (math.degrees(math.atan2(x, y)) + 360) % 360

    # Map to compass directions including neighbors
    directions = ['North', 'Northeast', 'East', 'Southeast',
                  'South', 'Southwest', 'West', 'Northwest']
    idx = int(((bearing + 22.5) % 360) / 45)
    return directions[idx]




In [59]:
def generate_direction_queries(parks_df):
    rows = []
    for i in range(len(parks_df)):
        park1 = parks_df.iloc[i]
        
        # Get indices of all other parks
        other_indices = list(range(len(parks_df)))
        other_indices.remove(i)
        
        # Randomly select 5 other parks   
        for j in other_indices:
            park2 = parks_df.iloc[j]
            query = (
            f"What's the closest 8-point compass direction from {park1['name']} to {park2['name']}?"
            )
            rows.append({
                'park1': park1['name'],
                'park2': park2['name'],
                'query': query
            })
    dir_df = pd.DataFrame(rows)
    dir_df['answer'] = dir_df.apply(lambda row: calc_direction(parks_df, row), axis=1)
    dir_df[['query', 'answer']].to_json(
        '../GeoReason_T1_queries/dir_queries.json',
        orient='records',
        lines=True,
        indent=2,
        force_ascii=False
    )

generate_direction_queries(parks_df)


In [60]:
# calculate distance given lats and longs of two parks
def calc_park_to_park_distance(lat1, lon1, lat2, lon2):
    R = 6371.0  # Radius of Earth in kilometers

    # Convert degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Differences
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c
    return distance

In [61]:
def generate_distance_queries(parks_df):
    nearest_parks = []
    farthest_parks = []
    for i in range(len(parks_df)):
        park1 = parks_df.iloc[i]
        
        # Get indices of all other parks
        other_indices = list(range(len(parks_df)))
        other_indices.remove(i)
        
        # Calculate distances to all other parks
        distances = []
        for j in other_indices:
            park2 = parks_df.iloc[j]
            distance = calc_park_to_park_distance(park1['latitude'], park1['longitude'], park2['latitude'], park2['longitude'])
            distances.append((park2['name'], distance))
        
        # Sort by distance
        distances.sort(key=lambda x: x[1])
        
        # Nearest and farthest parks
        nearest_parks.append(distances[0])
        farthest_parks.append(distances[-1])
    # Create DataFrame for nearest and farthest parks
    nearest_df = pd.DataFrame(nearest_parks, columns=['answer', 'nearest_distance'])
    farthest_df = pd.DataFrame(farthest_parks, columns=['answer', 'farthest_distance'])
    # Combine with original DataFrame
    # new_gdf = pd.concat([gdf, nearest_df, farthest_df], axis=1)
    nearest_df = pd.concat([parks_df['name'], nearest_df], axis=1)
    farthest_df = pd.concat([parks_df['name'], farthest_df], axis=1)

    # add query to nearest_df and farthest_df
    nearest_df['query'] = nearest_df.apply(lambda row: f"Which officially designated national park is the closest to {row['name']} in straight line distance?", axis=1)
    farthest_df['query'] = farthest_df.apply(lambda row: f"Which officially designated national park is the farthest from {row['name']} in straight line distance?", axis=1)


    # Export queries + answers as JSON
    nearest_df[['query', 'answer']].to_json(
        '../GeoReason_T1_queries/dis_nearest_queries.json',
        orient='records',
        lines=True, indent=2, force_ascii=False
    )

    farthest_df[['query', 'answer']].to_json(
        '../GeoReason_T1_queries/dis_farthest_queries.json',
        orient='records',
        lines=True, indent=2, force_ascii=False
    )
generate_distance_queries(parks_df)

In [62]:
import shapely.ops as ops

# calculate distance from park to state boundary
def calc_park_to_state_distance(park_name,parks_gdf, state_name, states_gdf) -> float:
    # Select park geometry
    park_row = parks_gdf[parks_gdf['name'] == park_name]
    if park_row.empty:
        raise ValueError(f"Park '{park_name}' not found in parks_gdf.")
    park_point = park_row.geometry.iloc[0]

    # Select state geometry
    state_row = states_gdf[states_gdf['name'] == state_name]
    if state_row.empty:
        raise ValueError(f"State '{state_name}' not found in states_gdf.")
    state_poly = state_row.geometry.iloc[0]

    # If park is inside state, distance is zero
    if state_poly.contains(park_point):
        return 0.0

    # Find nearest point on state boundary
    boundary = state_poly.boundary
    nearest_geom = ops.nearest_points(park_point, boundary)[1]

    # Compute haversine distance
    return calc_park_to_park_distance(
        park_point.y, park_point.x,
        nearest_geom.y, nearest_geom.x
    )

In [63]:
# example usage
calc_park_to_state_distance(
    'Yosemite National Park', parks_df, 'Alabama', states_df
)

2808.1730884567332

In [None]:
def add_topo_queries(df_states, df_tz, template, answer):
    results = []
    for tz1, tz2 in combinations(df_tz.itertuples(), 2):
        for _, st in df_states.iterrows():
            # Compute relations against each individual TZ
            rels_tz1 = calc_topology(st, tz1, "state")
            rels_tz2 = calc_topology(st, tz2, "state")
            # If either relation list includes "within, covered by", consider it within the combined area
            if "within" in rels_tz1 or "within" in rels_tz2:
                results.append({
                    'query':     template.format(tz1=tz1.zone, tz2=tz2.zone, state=st['name']),
                    'answer':    answer
                })
    return results


In [None]:
import pandas as pd
import random
# Which state that topologically contains Eastern Time Zones area is nearest to Voyageurs National Park in straight-line distance?

def find_states(tz, topo, states_gdf):
    states = []
    for _,state in states_gdf.iterrows():
        rels = calc_topology(state, tz, "state")
        if topo in rels:
            states.append(state['name'])
    return states

# some_states = find_states(tz_df[tz_df['zone']=='Central'].iloc[0], 'covered by', states_df)
# print(some_states)

def find_state_by_distance(park_name, parks_gdf, states_list, states_gdf, distance_type):
    distances = []
    for state_name in states_list:
        dist = calc_park_to_state_distance(park_name, parks_gdf, state_name, states_gdf)
        distances.append((state_name, dist))
    distances.sort(key=lambda x: x[1])
    if not distances:
        return "None"
    if distance_type == "nearest":
        return distances[0][0]
    else:
        return distances[-1][0]
    
# find_state_by_distance("Yellowstone National Park", parks_df, some_states, states_df, "nearest")

def generate_topo_dis_queries(parks_gdf, states_gdf, tz_gdf):
    DISTANCE_TYPES = ['nearest', 'farthest']
    TOPOLOGIES = ['covered by', 'covers', 'within', 'contains', 'equals', 'overlaps', 'touch', 'disjoint'] 
    out_fp = Path('../GeoReason_T2_queries/topology_and_distance_queries.jsonl')
    out_fp.parent.mkdir(parents=True, exist_ok=True)
    with out_fp.open('w', encoding='utf-8') as f:
        for _, park in parks_gdf.iterrows():
            for _, tz in tz_gdf.iterrows():
                for distance_type in DISTANCE_TYPES:
                    for topo in TOPOLOGIES:
                        cand_states = find_states(tz, topo, states_gdf)
                        if not cand_states:
                            answer = "None"
                        record = []
                        answer = find_state_by_distance(park['name'], parks_gdf, cand_states, states_gdf, distance_type)
                        query = (
                            f"Which state that topologically {topo} {tz['zone']} Time Zone area is the {distance_type} to "
                            f"{park['name']} in straight-line distance?"
                        )
                        record.append({'query': query, 'answer': answer})
                        query = (
                            f"Which time zone that topologically {answer} is the {distance_type} to "
                            f"{park['name']} in straight-line distance?"
                        )
                        json.dump(record, f, ensure_ascii=False)
                        f.write('\n')



        for _, park in parks_gdf.iterrows():
            for distance_type in DISTANCE_TYPES:
                query = (
                            f"Which state that topologically {topo} combined area of {{tz1}} and {{tz2}} Time Zone area is the {distance_type} to "
                            f"{park['name']} in straight-line distance?"
                        )
                records = add_topo_queries(states_df, tz_df, query, topo)
                json.dump(records, f, ensure_ascii=False)
                f.write('\n')


generate_topo_dis_queries(parks_df, states_df, tz_df)


# add_topo_queries(states_df, tz_df, "How is {state} state spatially related to the combined area of {tz1} and {tz2} Time Zones?", "within")               
                    
                    

KeyboardInterrupt: 

In [None]:
(
                        f"How is {st['name']} state spatially related to the combined area of "
                        f"{tz1.zone} and {tz2.zone} Time Zones?"
                    )