# Processing Tools

Hopefully we won't need to run any of these again, but I wrote them as tools to help clean up the data.

## Merge CSV data

In [None]:
import pandas as pd
import os

# Directory that holds one forecast CSV per calendar month
target_folder = 'historicalForecasts2024'

# Where the merged, still-unfiltered table is written
output_csv_path = 'unfiltered_historicalForecast2024.csv'


def _load_month(month):
    """Read one month's CSV and tag every row with the month it came from."""
    month_path = os.path.join(target_folder, f'historicalForecast{month:02}.csv')
    print('Loading:', month_path)
    return pd.read_csv(month_path).assign(Month=month)


# Months 1..12 are read in order and stacked under a fresh 0..n-1 index
combined_df = pd.concat(
    [_load_month(month) for month in range(1, 13)],
    ignore_index=True,
)

# Persist the merged table without the index column
combined_df.to_csv(output_csv_path, index=False)
print(f"Saved unfiltered combined CSV to: {output_csv_path}")

## Drop 10m forecasts from unfiltered data 

In [None]:
import pandas as pd

# Forecast table to clean. NOTE: it is overwritten in place at the end of
# this cell, so dropped columns cannot be recovered from this file.
path = 'data/unfiltered_historicalForecast2024.csv'

df = pd.read_csv(path)

# Compare the whole label in front of the first '_' instead of testing a
# prefix: startswith('u10') would also match a label such as 'u100' and
# silently delete those columns. Partitioning on '_' handles names like
# 'u10_(<lat>, <lon>)', 'u10_<id>' and a bare 'u10' alike.
ten_metre_labels = {'u10', 'v10'}
ten_metre_cols = [
    col for col in df.columns
    if col.partition('_')[0] in ten_metre_labels
]

df = df.drop(columns=ten_metre_cols)

df.to_csv(path, index=False)

## Cache coordinates of unfiltered sensors to `coordinate_columns.csv`

In [26]:
import pandas as pd

df = pd.read_csv('../data/unfiltered_historicalForecast2024.csv')

# Pick the sensor columns by their '<direction>_(<lat>, <lon>)' name rather
# than by position (columns[1:-1]), so the result does not depend on there
# being exactly one leading and one trailing non-sensor column.
coordinates = [col for col in df.columns if '_(' in col and col.endswith(')')]
if not coordinates:
    # Without this guard an already-renamed table would replace the cache
    # with an empty file.
    raise ValueError(
        "No '<direction>_(<lat>, <lon>)' columns found; refusing to overwrite "
        "the coordinate cache with an empty table."
    )

parsed = []
for col in coordinates:
    # Split '<direction>_(<lat>, <lon>)' into the direction label and the
    # coordinate text that follows '_('.
    direction, coord_str = col.split('_(')
    lat, lon = coord_str[:-1].split(', ')  # drop the closing ')' and split
    parsed.append((direction, float(lat), float(lon)))

coords_df = pd.DataFrame(parsed, columns=['direction', 'latitude', 'longitude'])

# Round so that coordinates differing only beyond 10 decimals group together
coords_df['latitude'] = coords_df['latitude'].round(10)
coords_df['longitude'] = coords_df['longitude'].round(10)

# ngroup() numbers each distinct (latitude, longitude) pair from 0; the +1
# makes sensor ids start at 1. Every direction at one location shares an id.
coords_df['sensor_id'] = coords_df.groupby(['latitude', 'longitude']).ngroup() + 1
coords_df = coords_df[['sensor_id', 'direction', 'latitude', 'longitude']]

coords_df = coords_df.sort_values(by='sensor_id').reset_index(drop=True)

coords_df.to_csv('../data/coordinate_columns.csv', index=False)

## Replace column names with direction and sensor_ids

In [None]:
# Imported here so the cell also runs on a fresh kernel
import pandas as pd

# Forecast table whose coordinate-named columns are replaced by
# '<direction>_<sensor_id>' names. It is overwritten in place at the end.
path = '../data/unfiltered_historicalForecast2024.csv'

df = pd.read_csv(path)
df_coords = pd.read_csv('../data/coordinate_columns.csv')


# Step 1: normalise coordinate columns to 10 decimals so their names can be
# matched against the (already rounded) values in the lookup table.
new_columns = []

for col in df.columns:
    if '_(' in col and col.endswith(')'):
        direction, coord_str = col.split('_(')
        lat_str, lon_str = coord_str[:-1].split(', ')  # drop closing ')' and split
        lat = round(float(lat_str), 10)
        lon = round(float(lon_str), 10)
        new_columns.append(f"{direction}_({lat}, {lon})")
    else:
        new_columns.append(col)  # leave untouched if it doesn't match pattern

df.columns = new_columns


# Step 2: map every normalised coordinate name to '<direction>_<sensor_id>'
rename_dict = {
    f"{row.direction}_({row.latitude}, {row.longitude})": f"{row.direction}_{row.sensor_id}"
    for row in df_coords.itertuples(index=False)
}

df = df.rename(columns=rename_dict)

# Any column still carrying coordinates had no entry in the lookup table.
# Fail here with the offending names instead of letting int() raise an
# opaque ValueError further down.
unmatched = [col for col in df.columns if '_(' in col and col.endswith(')')]
if unmatched:
    raise ValueError(
        f"{len(unmatched)} coordinate column(s) have no match in "
        f"coordinate_columns.csv, e.g. {unmatched[:5]}"
    )

display(df.head(1))

# Step 3: reorder so non-sensor columns come first, then sensors by id
non_sensor_cols = [col for col in df.columns if "_" not in col]

# Get sensor columns (e.g., u80_1, v80_1, ...)
sensor_cols = [col for col in df.columns if "_" in col]

print(sensor_cols)

# Parse sensor columns into (sensor_id, direction, column_name)
parsed_cols = []
for col in sensor_cols:
    direction, sensor_id_str = col.rsplit("_", 1)
    parsed_cols.append((int(sensor_id_str), direction, col))

# Sort by sensor_id, then direction (u80 before v80)
sorted_sensor_cols = [col for _, _, col in sorted(parsed_cols, key=lambda x: (x[0], x[1]))]

# Reorder df columns
df = df[non_sensor_cols + sorted_sensor_cols]

print(df.columns[:60])
display(df.head(1))

df.to_csv(path,index=False)