In [88]:
import sys
import os

import numpy as np
import pandas as pd

from gbs import compute_presence_nonpresence_ssi, compute_relative_performance_ssi

In [89]:
# convert km to radians
def km_to_radians(km):
    """Convert a distance in kilometres to an angle in radians.

    The distance is divided by Earth's average radius (6371.0 km).

    Args:
        km: distance in kilometres.

    Returns:
        ``km / 6371.0``, the corresponding angle in radians.
    """
    mean_earth_radius_km = 6371.0  # Earth's average radius in kilometres
    return km / mean_earth_radius_km


# Quick sanity check of km_to_radians: convert a sample distance and print it.
km = 100
radians = km_to_radians(km)
# Report the sample distance next to its converted value.
print(f"{km} km is equivalent to {radians} radians")

100 km is equivalent to 0.01569612305760477 radians


In [90]:
# preprocess_data: replace 0 with -1, drop rows with NaN coords, convert lon/lat to radians
def preprocess_data(data_df):
    """Return a cleaned copy of ``data_df`` ready for angular-distance work.

    The caller's DataFrame is left untouched, so calling this function again
    on the same input yields the same result (the previous in-place version
    converted lon/lat to radians a second time on a repeated call).

    Steps applied to the new frame:
      1. Drop rows whose 'lon' or 'lat' is NaN.
      2. In the binary columns 'hit@1' and 'hit@3' (when present), recode 0 as -1.
      3. Convert 'lon' and 'lat' from degrees to radians.

    Args:
        data_df: pandas DataFrame with 'lon' and 'lat' columns in degrees;
            'hit@1' / 'hit@3' are optional.

    Returns:
        A new pandas DataFrame holding the surviving rows (original index
        labels are kept).

    Raises:
        KeyError: if 'lon' or 'lat' is not a column of ``data_df``.
    """
    # dropna without inplace=True returns a new frame, which keeps every
    # later assignment away from the caller's object.
    cleaned = data_df.dropna(subset=['lon', 'lat'])

    # Recode 0 as -1 in the binary prediction columns (per the original note,
    # so that the binary metric is centred on 0).
    for column in ['hit@1', 'hit@3']:
        if column in cleaned.columns:
            cleaned[column] = cleaned[column].replace(0, -1)

    # Degrees -> radians, the unit the haversine helper works in.
    cleaned[['lon', 'lat']] = np.radians(cleaned[['lon', 'lat']].to_numpy())

    return cleaned

In [91]:
# Function to calculate the Haversine distance between two points in radians
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle angular distance between two points, in radians.

    All four arguments are angles in radians; they may be scalars or numpy
    arrays (normal numpy broadcasting applies).

    Args:
        lat1, lon1: latitude / longitude of the first point(s).
        lat2, lon2: latitude / longitude of the second point(s).

    Returns:
        The central angle in radians, in the range [0, pi].
    """
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1
    a = (np.sin(delta_lat / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2) ** 2)
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it
    # marginally past 1 for (near-)antipodal points, which would make
    # arcsin return NaN. Clamp before taking the root.
    a = np.clip(a, 0.0, 1.0)
    return 2 * np.arcsin(np.sqrt(a))  # central angle in radians

In [92]:
def find_neighbors_within_radius(data, low_point, radius):
    '''
    Collect every point of `data` lying within `radius` of `low_point`.

    [Input]
        data: full data points, a 2-D array-like whose first three columns
              are lat, lon, metric (lat/lon in radians)
        low_point: low performance point (lat, lon, metric), the metric (e.g., hit@1, hit@3)
        radius: distance in radian
    [Output]
        neighbors of the given low_point, as a numpy array of shape (n, 3),
        [lat, lon, metric]; shape (0, 3) when nothing is within the radius,
        so column slicing such as neighbors[:, :2] stays valid
    '''
    points = np.asarray(data)
    if points.size == 0:
        # No candidates at all: still return a 2-D array.
        return np.empty((0, 3))

    low_lat, low_lon = low_point[0], low_point[1]

    # haversine is built from numpy ufuncs, so a single call yields the
    # angular distance (radians) from low_point to every row at once.
    distances = haversine(low_lat, low_lon, points[:, 0], points[:, 1])

    # Rows with NaN coordinates compare False and are excluded.
    within_radius = distances <= radius

    # Boolean-mask indexing returns a fresh array; keep lat, lon, metric.
    return points[within_radius, :3]

In [93]:
# Analysis configuration.
input_folder = "result_tables"
radius = 0.01  # neighbourhood radius as an angular distance in radians
metric = 'hit@1'

# os.listdir returns names in arbitrary order; sort so that the order of the
# printed results is reproducible across runs and machines.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith(".csv"):  # only CSV files
        input_file_path = os.path.join(input_folder, filename)

        data_df = pd.read_csv(input_file_path)
        data_df = preprocess_data(data_df)
        data = data_df[['lat', 'lon', metric]].to_numpy()

        # Select points with low performance. `data` holds exactly the three
        # columns selected above, so the metric is always column 2. Using the
        # metric's position inside `data_df` would read the wrong column (or
        # raise IndexError) whenever it is not the CSV's third column.
        low_performance = data[:, 2] == -1
        low_data = data[low_performance]

        if len(low_data) == 0:
            # Nothing to average; avoids a ZeroDivisionError further down.
            print(f"Data from {filename} has no low-performance points, skipped")
            continue

        # NOTE(review): both density values are identical, so the same
        # computation runs twice; confirm whether two distinct densities
        # were intended.
        for density in np.array([2000000, 2000000]):
            count = 0
            total_pnp_ssi = 0
            total_rp_ssi = 0
            for low_point in low_data:
                # Neighbours include low_point itself, since it is part of `data`.
                neighbors = find_neighbors_within_radius(
                    data, low_point, radius)
                pnp_ssi = compute_presence_nonpresence_ssi(
                    neighbors[:, :2], low_point, radius, density, k=4)
                rp_ssi = compute_relative_performance_ssi(
                    neighbors[:, :2], neighbors[:, 2], low_point, radius, density, k=4)

                count += 1
                total_pnp_ssi += pnp_ssi
                total_rp_ssi += rp_ssi
                print(f"pnp_ssi: {pnp_ssi}, rp_ssi: {rp_ssi}")

            # Mean scores over all low-performance points of this file.
            avg_pnp_ssi = total_pnp_ssi / count
            avg_rp_ssi = total_rp_ssi / count
            print(
                f"Density:{density}, Average pnp_ssi: {avg_pnp_ssi}, Average rp_ssi: {avg_rp_ssi}")

        print(f"Data from {filename} Finished")

pnp_ssi: 1.0499530138898319, rp_ssi: 2.4157919150895255
pnp_ssi: 85.9914533439124, rp_ssi: 110.16381387349584
pnp_ssi: 15.675536077498206, rp_ssi: 13.268840992363875
pnp_ssi: 352.97083825045917, rp_ssi: 275.8155804353226
pnp_ssi: 70.09131442722784, rp_ssi: 60.71206006637026
pnp_ssi: 128.6321491332602, rp_ssi: 141.6230232111469
pnp_ssi: 80.22378670760214, rp_ssi: 105.7547087059064
pnp_ssi: 190.8907373988883, rp_ssi: 152.45939802505106
pnp_ssi: 135.60161064964598, rp_ssi: 129.7896121433702
pnp_ssi: 189.37386370400284, rp_ssi: 212.2118863117921
pnp_ssi: 86.36299640731922, rp_ssi: 112.35503943395656
pnp_ssi: 40.748085226178105, rp_ssi: 12.66992183118756
pnp_ssi: 310.01299892215263, rp_ssi: 244.83583610216496
pnp_ssi: 293.29640065374645, rp_ssi: 194.4377802767862
pnp_ssi: 118.22296053106662, rp_ssi: 43.00982602192252
pnp_ssi: 305.2372094127608, rp_ssi: 242.57809385426535
pnp_ssi: 11.855054073636502, rp_ssi: 2.7211154092666243
pnp_ssi: 88.60081448403533, rp_ssi: 115.34217057407169
pnp_ssi: 2