In [None]:
#This file goes through historical data and removes all data that is not within X km of a wind turbine

import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import os
import ast

In [2]:
def filter_nearby_sensors(reference_coords, sensor_dict, max_distance_km):
    """
    Returns sensor_ids whose coordinates are farther than max_distance_km from all reference points.
    
    Parameters:
        reference_coords (list of tuples): List of (lat, lon) reference coordinates.
        sensor_dict (dict): Dictionary {sensor_id: (lat, lon)} for target sensors.
        max_distance_km (float): Maximum allowed distance in kilometers.
    
    Returns:
        list of sensor_ids: IDs of sensors outside the distance threshold.
    """
    # Convert lat/lon to radians
    ref_radians = np.radians(reference_coords)
    sensor_ids = list(sensor_dict.keys())
    sensor_coords = list(sensor_dict.values())
    tgt_radians = np.radians(sensor_coords)

    # Build KDTree
    tree = cKDTree(ref_radians)
    EARTH_RADIUS_KM = 6371.0

    # Query distances
    distances, _ = tree.query(tgt_radians, distance_upper_bound=max_distance_km / EARTH_RADIUS_KM)

    # Return sensor IDs whose distance is inf (i.e., no close reference point)
    filtered_ids = [sensor_id for sensor_id, d in zip(sensor_ids, distances) if d == np.inf]

    return filtered_ids

In [16]:
def produce_filtered_dataset(unfiltered_data_path,output_folder, max_distance_km):
    df = pd.read_csv("data/uswtdb_v7_2_20241120.csv")
        
    df_ny = df[df["t_state"] == "NY"] # Filter for New York State (NY)
        
    # Select only coordinates
    ny_turbine_coords = list(zip(df_ny["ylat"], df_ny["xlong"]))
    
    df_sensor_coords = pd.read_csv('data/coordinate_columns.csv')
    df_sensor_coords = df_sensor_coords.drop_duplicates(subset='sensor_id')
 
    sensor_dict = dict(zip(df_sensor_coords['sensor_id'], zip(df_sensor_coords['latitude'], df_sensor_coords['longitude'])))    
        
    distant_sensor_ids = filter_nearby_sensors(ny_turbine_coords, sensor_dict, max_distance_km)
    
    print('Filtered Sensors Found')    

    columns_to_drop = [f"{prefix}_{id}" for id in distant_sensor_ids for prefix in ("u80", "v80")]
    
                
    print('Loading: ', unfiltered_data_path)
    df_unfiltered = pd.read_csv(unfiltered_data_path)
    
    
    no_matches = [col for col in columns_to_drop if col not in df_unfiltered.columns]
    if no_matches:
        print('Error: Attempting to drop sensors that do not appear in the unfiltered dataframe')
    
    df_unfiltered = df_unfiltered.drop(columns=[col for col in columns_to_drop if col in df_unfiltered.columns])
    print(f'Successfully Dropped {len(columns_to_drop)} Columns ')
    
    output_path = output_folder + f'/{max_distance_km}km_historicalForecast2024.csv'
    
    df_unfiltered.to_csv(output_path, index=False)

    print(f"Saved filtered CSV to: {output_path}")

max_distance_km = 5
produce_filtered_dataset('data/unfiltered_historicalForecast2024.csv','data/filtered_historicalForecasts',max_distance_km)


Filtered Coordinates Found
Loading:  data/unfiltered_historicalForecast2024.csv
Successfully Dropped 33746 Columns 
Saved filtered CSV to: data/filtered_historicalForecasts/5km_historicalForecast2024.csv
