In [1]:
import pandas as pd
import os
from haversine import haversine_vector, Unit, haversine
import random
import warnings
from math import radians
warnings.filterwarnings("ignore")

In [5]:
def downsample_coordinates(df, distance_threshold):
    """Greedily thin the points in ``df`` so that kept points are spaced apart.

    Rows are visited in order. The first row is always kept; every later row
    is kept only if it lies strictly more than ``distance_threshold`` metres
    from all points kept so far.

    Args:
        df: DataFrame with ``address_lat`` and ``address_lon`` columns holding
            latitude and longitude in decimal degrees.
        distance_threshold: Minimum great-circle spacing, in metres, that a
            point must have to every already-kept point in order to be kept.

    Returns:
        A ``(selected_coordinates, selected_indices)`` tuple.
        ``selected_coordinates`` is the list of kept ``(lat, lon)`` pairs in
        decimal degrees; ``selected_indices`` is the list of their zero-based
        row positions in ``df``. Both lists are empty when ``df`` has no rows.
    """
    # haversine() expects decimal degrees and converts to radians itself.
    # Converting here as well would make every computed distance far too
    # small, so the coordinates are passed through unchanged.
    coordinates = list(zip(df['address_lat'].tolist(), df['address_lon'].tolist()))
    if not coordinates:
        return [], []

    # Seed the selection with the first point, then test the remaining ones.
    selected_coordinates = [coordinates[0]]
    selected_indices = [0]
    for i, coord in enumerate(coordinates[1:], start=1):
        # The generator lets all() stop at the first kept point that is too close.
        if all(
            haversine(coord, kept, unit=Unit.METERS) > distance_threshold
            for kept in selected_coordinates
        ):
            selected_coordinates.append(coord)
            selected_indices.append(i)

    return selected_coordinates, selected_indices

# Make sure the output directory for the downsampled tables exists.
os.makedirs("closest_pt_downsampled", exist_ok=True)

# District numbers and pt_type codes that together name the input files
# closest_pt/closest_pt_{pt_type}_{district}.csv.
districts = range(1, 24)
pt_types = range(4)

# Minimum spacing, in metres, passed to downsample_coordinates.
distance_threshold = 1

# Columns removed from the per-district joined tables before they are written.
dropped_columns = ['pt_name', 'pt_lon', 'pt_lat', 'shortest_distance_pt']

for district in districts:
    print(district)
    full_frames = []
    downsampled_frames = []
    for pt_type in pt_types:
        source_path = f"closest_pt/closest_pt_{pt_type}_{district}.csv"
        downsampled_path = f"closest_pt_downsampled/closest_pt_{pt_type}_{district}.csv"

        # Read the source table once; it feeds both the full join and,
        # if no cached result exists, the downsampling step.
        df = pd.read_csv(source_path)
        full_frames.append(df)

        # Downsample only when no cached result is on disk yet.
        if not os.path.exists(downsampled_path):
            selected_coordinates, selected_indices = downsample_coordinates(df, distance_threshold)
            # The returned indices are row positions, hence iloc rather than loc.
            df_downsampled = df.iloc[selected_indices, :]
            df_downsampled.to_csv(downsampled_path)

        # Always load the downsampled table from disk, so a freshly computed
        # table joins the per-district result in exactly the same form as a
        # cached one (previously a fresh table was written but never joined).
        downsampled_frames.append(pd.read_csv(downsampled_path))

    # Concatenate once per district instead of growing a frame in the loop.
    df_join = pd.concat(full_frames).drop(columns=dropped_columns)
    df_downsampled_join = pd.concat(downsampled_frames).drop(columns=dropped_columns)

    df_join.to_csv(f"closest_pt/closest_pt_{district}.csv")
    df_downsampled_join.to_csv(f"closest_pt_downsampled/closest_pt_{district}.csv")
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
