### Process missing weather station data

#### Fill missing ECCC station data with interpolation

In [None]:
import pandas as pd
from pathlib import Path

def eccc_fill_wind_dir_gaps(input_csv):
    """
    Fill hourly gaps in ECCC wind-direction data and save the result as a new CSV.

    Rows are grouped by ('climate id', 'year', 'month'). Any group in which two
    consecutive 'date/time (lst)' values are more than one hour apart is
    re-indexed onto a continuous hourly grid. For such a group the key columns,
    'day' and 'time (lst)' are rebuilt from the group key / timestamp, and the
    station metadata (longitude, latitude, station name) is copied from the
    neighbouring rows. Missing 'winddirsde' values are then filled by
    nearest-neighbour interpolation on the row index within each
    ('climate id', 'month') group, so the same month of different years shares
    one interpolation group.

    NOTE(review): earlier documentation mentioned a 7-day gap threshold, but no
    such limit is applied -- every gap inside a group is filled regardless of
    its length. Confirm whether a cap is wanted.

    The output is written next to the input as '<stem>_filled.csv', with all
    floats formatted to two decimals.

    Parameters:
    input_csv (str or Path): Path to the input CSV file containing required columns.

    Raises:
    ValueError: If a required column is missing or the file holds no rows to process.
    """
    timestamp_col = 'date/time (lst)'
    group_keys = ['climate id', 'year', 'month']
    station_cols = ['longitude (x)', 'latitude (y)', 'station name']

    data = pd.read_csv(input_csv, low_memory=False, parse_dates=[timestamp_col])
    required_columns = ['climate id', 'year', 'month', 'day', 'time (lst)', 
                        'longitude (x)', 'latitude (y)', 'station name', 'winddirsde']
    if any(col not in data.columns for col in required_columns):
        raise ValueError(f"Input CSV must contain {required_columns}")

    # Sort once so that diff() inside each group measures consecutive timestamps.
    ordered = data.sort_values(by=group_keys + [timestamp_col])

    filled_data = []
    for (climate_id, year, month), group in ordered.groupby(group_keys):
        gap_hours = group[timestamp_col].diff().dt.total_seconds() / 3600
        if gap_hours.max() > 1:
            # Lowercase 'h' is the supported hourly alias; uppercase 'H' is deprecated.
            group = group.set_index(timestamp_col).resample('h').asfreq().reset_index()
            # Inserted rows are all-NaN: restore the group key and date parts.
            group['climate id'] = climate_id
            group['year'] = year
            group['month'] = month
            group['day'] = group[timestamp_col].dt.day
            group['time (lst)'] = group[timestamp_col].dt.time
            group[station_cols] = group[station_cols].ffill().bfill()
        filled_data.append(group)

    if not filled_data:
        # pd.concat([]) would raise a far less helpful "No objects to concatenate".
        raise ValueError("Input CSV contains no rows to process")

    # Concatenate and interpolate 'winddirsde'; the fresh 0..n-1 index is what
    # the nearest-neighbour interpolation measures distance on.
    data_filled = pd.concat(filled_data).reset_index(drop=True)
    data_filled['winddirsde'] = data_filled.groupby(['climate id', 'month'])['winddirsde'].transform(
        lambda x: x.interpolate(method='nearest', limit_direction='both')
    )

    # Save output
    output_file = Path(input_csv).with_name(f"{Path(input_csv).stem}_filled.csv")
    data_filled.to_csv(output_file, index=False, float_format='%.2f')
    print(f"Filled data saved to {output_file}")

# Example usage: run the ECCC gap filler on a station export.
# The path is an absolute, machine-specific location -- adjust it before running.
csv_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\ECCC_station_data_BBox_small_12_1_2.csv"
eccc_fill_wind_dir_gaps(csv_file)


#### Fill missing ACIS station data with interpolation

In [None]:
import pandas as pd
from pathlib import Path

def acis_fill_wind_dir_gaps(input_csv):
    """
    Fill hourly gaps in ACIS wind-direction data and save the result as a new CSV.

    Rows are grouped by ('station name', calendar year of 'datetime'). Any group
    in which two consecutive 'datetime' values are more than one hour apart is
    re-indexed onto a continuous hourly grid. Inserted rows get the group's
    station name and, when the columns exist, 'latitude' / 'longitude' copied
    from neighbouring rows, so that later steps grouping by coordinates keep
    the filled rows. Missing 'winddir' values are then filled by
    nearest-neighbour interpolation on the row index within each
    ('station name', year) group.

    NOTE(review): earlier documentation mentioned a 7-day gap threshold, but no
    such limit is applied -- every gap inside a group is filled regardless of
    its length. Confirm whether a cap is wanted.

    The output is written next to the input as '<stem>_filled.csv'; the helper
    'year' column is not written.

    Parameters:
    input_csv (str or Path): Path to the input CSV file containing 'station name', 'datetime', and 'winddir' columns.

    Raises:
    ValueError: If a required column is missing or the file holds no rows to process.
    """
    data = pd.read_csv(input_csv, low_memory=False, parse_dates=['datetime'])
    if any(col not in data.columns for col in ['station name', 'datetime', 'winddir']):
        raise ValueError("Input CSV must contain 'station name', 'datetime', and 'winddir' columns.")

    # Coordinates are optional in the input; fill them only when present.
    station_cols = [col for col in ('latitude', 'longitude') if col in data.columns]

    data['year'] = data['datetime'].dt.year
    data = data.sort_values(['station name', 'year', 'datetime'])

    # Process groups and fill missing hourly rows in 'datetime'
    filled_data = []
    for (station_name, year), group in data.groupby(['station name', 'year']):
        gap_hours = group['datetime'].diff().dt.total_seconds() / 3600
        if gap_hours.max() > 1:
            # Lowercase 'h' is the supported hourly alias; uppercase 'H' is deprecated.
            group = group.set_index('datetime').resample('h').asfreq().reset_index()
            # Inserted rows are all-NaN: restore the group key and station metadata.
            group['station name'] = station_name
            group['year'] = year
            if station_cols:
                group[station_cols] = group[station_cols].ffill().bfill()
        filled_data.append(group)

    if not filled_data:
        # pd.concat([]) would raise a far less helpful "No objects to concatenate".
        raise ValueError("Input CSV contains no rows to process")

    data_filled = pd.concat(filled_data).reset_index(drop=True)

    # Interpolate 'winddir' for each station and year
    data_filled['winddir'] = data_filled.groupby(['station name', 'year'])['winddir'].transform(
        lambda x: x.interpolate(method='nearest', limit_direction='both')
    )

    # Save output
    output_file = Path(input_csv).with_name(f"{Path(input_csv).stem}_filled.csv")
    data_filled.drop(columns=['year']).to_csv(output_file, index=False)
    print(f"Filled data saved to {output_file}")

# Example usage: run the ACIS gap filler on a station export.
# The path is an absolute, machine-specific location -- adjust it before running.
csv_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\ACIS_Data\ACIS_station_data.csv"
acis_fill_wind_dir_gaps(csv_file)


### Validate the ERA5 NC data with station data

#### Validate using ECCC filled CSV file

In [None]:
import pandas as pd
import numpy as np
import xarray as xr
from pathlib import Path

def validate_era5_station_data_by_eccc_station_data(nc_file, station_csv, station_names):
    """
    Compare daily mean wind direction from an ERA5 NetCDF file with ECCC station data.

    Station rows are restricted to `station_names` and averaged per station and
    calendar day ('winddirsde' is multiplied by 10 before averaging). For each
    station the nearest ERA5 grid point is selected, its 'u' and 'v' values are
    reduced to one speed-weighted mean direction per day, and both series are
    inner-joined on (year, month, day). Only station days whose year-month
    occurs in the NetCDF 'valid_time' axis are kept.

    The comparison is written next to `nc_file` as
    '<nc stem>_<station names joined by "_">_ECCC_comparison.csv'.

    Parameters:
    - nc_file (str): Path to the ERA5 NetCDF file.
    - station_csv (str): Path to the station CSV file.
    - station_names (list): List of station names to process.

    Raises:
    - ValueError: If the station CSV holds no rows for `station_names`.
    """
    station_data = pd.read_csv(station_csv, low_memory=False)
    station_data['date/time (lst)'] = pd.to_datetime(station_data['date/time (lst)'], errors='coerce')
    station_data = station_data[station_data['station name'].isin(station_names)]

    # The factor 10 presumably converts tens of degrees to degrees -- TODO confirm.
    # NOTE(review): this is an arithmetic mean of angles, so directions either
    # side of north (e.g. 350 and 10) average to 180 -- confirm this is intended.
    station_daily = station_data.groupby(
        ['station name', 'climate id', 'latitude (y)', 'longitude (x)', 'year', 'month', 'day']
    ).agg({'winddirsde': lambda x: np.mean(x.dropna() * 10) if len(x.dropna()) > 0 else np.nan}).reset_index()
    station_daily.rename(columns={'winddirsde': 'Y (mean direction of weath st)'}, inplace=True)

    if station_daily.empty:
        # Without this guard the final pd.concat([]) fails with an opaque message.
        raise ValueError(f"No station data found in {station_csv} for stations {station_names}")

    # 'YYYY-MM' key per station day, built once with column labels instead of
    # once per station with positional row indexing.
    station_year_month = (
        station_daily['year'].astype(int).astype(str)
        + '-'
        + station_daily['month'].astype(int).astype(str).str.zfill(2)
    )

    def calculate_mean_wind_direction(u, v):
        # Speed-weighted vector mean of the per-step bearing arctan2(u, v),
        # returned in degrees within [0, 360).
        # NOTE(review): arctan2(u, v) is the bearing the (u, v) vector points
        # toward. If the station column reports the direction the wind blows
        # from, the two series differ by 180 degrees -- confirm the convention.
        speed = np.sqrt(u ** 2 + v ** 2)
        rad = (np.arctan2(u, v) + 2 * np.pi) % (2 * np.pi)
        mean_sin = np.mean(np.sin(rad) * speed)
        mean_cos = np.mean(np.cos(rad) * speed)
        return np.rad2deg((np.arctan2(mean_sin, mean_cos) + 2 * np.pi) % (2 * np.pi))

    stations = station_daily[['latitude (y)', 'longitude (x)', 'climate id']].drop_duplicates()

    output = []
    # The context manager closes the NetCDF file even if a station fails.
    with xr.open_dataset(nc_file, engine='netcdf4') as era5_data:
        nc_years_months = set(pd.to_datetime(era5_data['valid_time'].values).strftime('%Y-%m'))
        in_nc_period = station_year_month.isin(nc_years_months)

        for _, variant in stations.iterrows():
            u = era5_data['u'].sel(latitude=variant['latitude (y)'], longitude=variant['longitude (x)'], method='nearest')
            v = era5_data['v'].sel(latitude=variant['latitude (y)'], longitude=variant['longitude (x)'], method='nearest')

            # Materialise the values and the per-step calendar day once per station.
            u_values, v_values = u.values, v.values
            day_of_step = pd.to_datetime(u['valid_time'].values).floor('D')

            daily_rows = []
            for day in day_of_step.unique():
                on_day = day_of_step == day
                daily_rows.append((day, calculate_mean_wind_direction(u_values[on_day], v_values[on_day])))
            daily = pd.DataFrame(daily_rows, columns=['time', 'X (mean direction of NC)'])

            daily['year'] = daily['time'].dt.year
            daily['month'] = daily['time'].dt.month
            daily['day'] = daily['time'].dt.day

            filtered = station_daily[(station_daily['climate id'] == variant['climate id']) & in_nc_period]
            output.append(pd.merge(daily, filtered, on=['year', 'month', 'day'], how='inner'))

    # Build the output path once; nesting the same quote inside an f-string is a
    # SyntaxError before Python 3.12.
    output_file = Path(nc_file).with_name(f"{Path(nc_file).stem}_{'_'.join(station_names)}_ECCC_comparison.csv")
    pd.concat(output, ignore_index=True)[[
        'station name', 'climate id', 'latitude (y)', 'longitude (x)', 'year', 'month', 'day',
        'X (mean direction of NC)', 'Y (mean direction of weath st)'
    ]].to_csv(output_file, index=False)

    print(f"Output saved to {output_file}")

# Two alternative input sets for the ECCC validation; exactly one should be active.
# Paths are absolute, machine-specific locations -- adjust them before running.

# Core (currently disabled -- uncomment these three lines and comment out the
# "Outer core" assignments below to run it instead)
# nc_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\Daily_multilevel\Windir_1950_2024_Feb\data_stream-oper_BBox_small_masked.nc"
# station_csv = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\ECCC_station_data_BBox_small_12_1_2_filled.csv"
# station_names = ["BROOKS", "MEDICINE HAT A", "SUFFIELD A"]

# Outer core (active)
nc_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\Daily_multilevel\Windir_1950_2024_Dec\data_stream-oper_BBox_large_masked.nc"
station_csv = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\ECCC_station_data_BBox_large_12_1_2_filled.csv"
station_names = ["CALGARY INT'L A", "LETHBRIDGE A"]

validate_era5_station_data_by_eccc_station_data(nc_file, station_csv, station_names)


#### Validate using ACIS filled CSV

In [None]:
import pandas as pd
import numpy as np
import xarray as xr
from pathlib import Path

def validate_era5_station_data_by_acis_station_data(nc_file, station_csv, station_names):
    """
    Compare daily mean wind direction from an ERA5 NetCDF file with ACIS station data.

    Station rows are restricted to `station_names` and averaged per station,
    coordinate pair and calendar day (both 'winddir' and 'atmostemp'). For each
    distinct (latitude, longitude) the nearest ERA5 grid point is selected, its
    'u' and 'v' values are reduced to one speed-weighted mean direction per day,
    and both series are inner-joined on (year, month, day). Only station days
    whose year-month occurs in the NetCDF 'valid_time' axis are kept.

    The comparison is written next to `nc_file` as
    '<nc stem>_<station names joined by "_">_ACIS_comparison.csv'.

    Parameters:
    - nc_file (str): Path to the ERA5 NetCDF file.
    - station_csv (str): Path to the ACIS station CSV file.
    - station_names (list): List of station names to process.

    Raises:
    - ValueError: If the station CSV holds no usable rows for `station_names`.
    """
    station_data = pd.read_csv(station_csv, low_memory=False)
    station_data['datetime'] = pd.to_datetime(station_data['datetime'], errors='coerce')
    station_data = station_data[station_data['station name'].isin(station_names)]

    # Rows with a missing latitude/longitude/datetime are dropped by groupby.
    # NOTE(review): 'winddir' is averaged arithmetically, so directions either
    # side of north (e.g. 350 and 10) average to 180 -- confirm this is intended.
    station_daily = station_data.groupby(
        ['station name', 'latitude', 'longitude', station_data['datetime'].dt.year.rename('year'), station_data['datetime'].dt.month.rename('month'), station_data['datetime'].dt.day.rename('day')]
    ).agg({'winddir': lambda x: np.mean(x.dropna()), 'atmostemp': lambda x: np.mean(x.dropna())}).reset_index()
    station_daily.rename(columns={
        'winddir': 'Y (mean direction of station)',
        'atmostemp': 'Y (mean temp of station)'
    }, inplace=True)

    if station_daily.empty:
        # Without this guard the final pd.concat([]) fails with an opaque message.
        raise ValueError(f"No station data found in {station_csv} for stations {station_names}")

    # 'YYYY-MM' key per station day, built once with column labels instead of
    # once per station with positional row indexing.
    station_year_month = (
        station_daily['year'].astype(int).astype(str)
        + '-'
        + station_daily['month'].astype(int).astype(str).str.zfill(2)
    )

    def calculate_mean_wind_direction(u, v):
        # Speed-weighted vector mean of the per-step bearing arctan2(u, v),
        # returned in degrees within [0, 360).
        # NOTE(review): arctan2(u, v) is the bearing the (u, v) vector points
        # toward. If the station column reports the direction the wind blows
        # from, the two series differ by 180 degrees -- confirm the convention.
        speed = np.sqrt(u ** 2 + v ** 2)
        rad = (np.arctan2(u, v) + 2 * np.pi) % (2 * np.pi)
        mean_sin = np.mean(np.sin(rad) * speed)
        mean_cos = np.mean(np.cos(rad) * speed)
        return np.rad2deg((np.arctan2(mean_sin, mean_cos) + 2 * np.pi) % (2 * np.pi))

    locations = station_daily[['latitude', 'longitude']].drop_duplicates()

    output = []
    # The context manager closes the NetCDF file even if a station fails.
    with xr.open_dataset(nc_file, engine='netcdf4') as era5_data:
        nc_time = pd.to_datetime(era5_data['valid_time'].values)
        nc_years_months = set(nc_time.strftime('%Y-%m'))
        in_nc_period = station_year_month.isin(nc_years_months)

        for _, variant in locations.iterrows():
            u = era5_data['u'].sel(latitude=variant['latitude'], longitude=variant['longitude'], method='nearest')
            v = era5_data['v'].sel(latitude=variant['latitude'], longitude=variant['longitude'], method='nearest')

            # Materialise the values and the per-step calendar day once per location.
            u_values, v_values = u.values, v.values
            day_of_step = pd.to_datetime(u['valid_time'].values).floor('D')

            daily_rows = []
            for day in day_of_step.unique():
                on_day = day_of_step == day
                daily_rows.append((day, calculate_mean_wind_direction(u_values[on_day], v_values[on_day])))
            daily = pd.DataFrame(daily_rows, columns=['time', 'X (mean direction of NC)'])

            daily['year'] = daily['time'].dt.year
            daily['month'] = daily['time'].dt.month
            daily['day'] = daily['time'].dt.day

            filtered = station_daily[
                (station_daily['latitude'] == variant['latitude']) &
                (station_daily['longitude'] == variant['longitude']) &
                in_nc_period
            ]
            output.append(pd.merge(daily, filtered, on=['year', 'month', 'day'], how='inner'))

    # Build the output path once; nesting the same quote inside an f-string is a
    # SyntaxError before Python 3.12.
    output_file = Path(nc_file).with_name(f"{Path(nc_file).stem}_{'_'.join(station_names)}_ACIS_comparison.csv")
    pd.concat(output, ignore_index=True)[[
        'station name', 'latitude', 'longitude', 'year', 'month', 'day',
        'X (mean direction of NC)', 'Y (mean direction of station)', 'Y (mean temp of station)'
    ]].to_csv(output_file, index=False)

    print(f"Output saved to {output_file}")

# Example usage: validate ERA5 against the gap-filled ACIS station CSV.
# Paths are absolute, machine-specific locations -- adjust them before running.
# Alternative NetCDF input (disabled); uncomment to use it instead of the active line below:
# nc_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\Daily_multilevel\Windir_1950_2024_Dec\data_stream-oper_BBox_small_masked.nc"
nc_file = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\Daily_multilevel\Windir_1950_2024_Feb\data_stream-oper_BBox_large_masked.nc"
station_csv = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\ACIS_Data\ACIS_station_data_filled.csv"
station_names = ["Hussar AGDM", "Acadia Valley AGCM"]

validate_era5_station_data_by_acis_station_data(nc_file, station_csv, station_names)


#### Get the number of significant values per station name using the Shapiro-Wilk test on residuals

In [None]:
import pandas as pd
from scipy.stats import shapiro

def analyze_station_pvalues(file_path, sheet_name='Sheet1', residual_column='Residual (y-x)', 
                            year_column='year', station_column='station name'):
    """
    Analyzes the normality of residuals for each station and year using the Shapiro-Wilk test.

    For every station the residuals are split by year; each year with at least
    three non-missing residuals is tested, and the station row records how many
    tested years had p > 0.05 and how many had p <= 0.05. Years with fewer than
    three usable residuals are not tested and are counted in neither column, so
    the two counts can sum to less than 'Total Years'.

    Parameters:
    file_path (str): Path to the data file. Excel files are read with
        `pd.read_excel`; a path ending in '.csv' is read with `pd.read_csv`.
    sheet_name (str): Sheet name in the Excel file to load. Default is 'Sheet1'.
        Ignored for CSV input.
    residual_column (str): Name of the column containing residual values. Default is 'Residual (y-x)'.
    year_column (str): Name of the column containing year values. Default is 'year'.
    station_column (str): Name of the column containing station names. Default is 'station name'.

    Returns:
    pd.DataFrame: A DataFrame summarizing the results for each station, with
        columns 'Station', 'P-Value > 0.05', 'P-Value <= 0.05', 'Total Years'
        and 'Total Data Points'. None is returned (after printing a message)
        if the file is missing or any other error occurs.
    """
    result_columns = ['Station', 'P-Value > 0.05', 'P-Value <= 0.05', 'Total Years', 'Total Data Points']
    min_samples = 3  # Minimum data points for Shapiro test

    try:
        # Load the data
        if str(file_path).lower().endswith('.csv'):
            data = pd.read_csv(file_path)
        else:
            data = pd.read_excel(file_path, sheet_name=sheet_name)

        # Group by station and analyze
        station_results = []
        for station, station_data in data.groupby(station_column):
            p_value_above_05 = 0
            p_value_at_or_below_05 = 0

            for _, year_data in station_data.groupby(year_column):
                # Missing residuals would turn the test result into NaN.
                residuals = year_data[residual_column].dropna()
                if len(residuals) < min_samples:
                    continue
                _, p_value = shapiro(residuals)
                # Count both outcomes explicitly so that untested years (and a
                # NaN p-value) are never reported as significant.
                if p_value > 0.05:
                    p_value_above_05 += 1
                elif p_value <= 0.05:
                    p_value_at_or_below_05 += 1

            # Append results for the station
            station_results.append({
                'Station': station,
                'P-Value > 0.05': p_value_above_05,
                'P-Value <= 0.05': p_value_at_or_below_05,
                'Total Years': station_data[year_column].nunique(),
                'Total Data Points': station_data.shape[0]
            })

        # Convert results to a DataFrame; explicit columns keep the header even
        # when there are no stations.
        return pd.DataFrame(station_results, columns=result_columns)

    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        # Best-effort contract kept for existing callers: report and return None.
        print(f"An error occurred: {e}")
        return None

# Example usage: summarise the Shapiro-Wilk results per station for a comparison workbook.
# The path is an absolute, machine-specific location -- adjust it before running.
file_path = r'D:\UCalgary_Lectures\GEOG_683\Data_workspace\Daily_multilevel\data_stream-oper_BBox_small_masked_ACIS_DJF_comparison.xlsx'
analyze_station_pvalues(file_path)


Station-Level Summary with P-Values, Year Counts, and Data Points:
              Station  P-Value > 0.05  P-Value <= 0.05  Total Years  \
0  Acadia Valley AGCM               0               17           17   
1              Brooks               0               20           20   
2     Fincastle IMCIN               0               20           20   
3         Hussar AGDM               0               20           20   
4         Onefour CDA               0               20           20   

   Total Data Points  
0               1444  
1               1713  
2               1713  
3               1713  
4               1713  
