#### Download ECCC weather station data

In [None]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path

def download_csv_files(url, folder_path):
    """
    Downloads every CSV file linked from a web page into a local folder.

    Parameters:
    url (str): Address of the page whose <a> links are scanned for CSV files.
    folder_path (str): Local directory for the downloads; created if it does not exist.

    Returns:
    None. Progress and failures are reported with print().
    """
    # Imported here so the function is self-contained within its notebook cell
    from urllib.parse import urljoin, urlsplit

    # Seconds to wait on the server; without a timeout requests can block forever
    request_timeout = 60

    # Local directory where files will be saved using pathlib
    save_dir = Path(folder_path)
    save_dir.mkdir(parents=True, exist_ok=True)

    # Get the webpage content
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Failed to connect to {url}: {exc}")
        return
    if response.status_code != 200:
        print(f"Failed to connect to {url}")
        return

    # Parse the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')

    downloaded, failed = 0, 0

    # Find all the CSV file links
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None
        if not href:
            continue

        # urljoin resolves relative, root-relative and absolute hrefs alike
        file_url = urljoin(url, href)
        # Keep only the final path component so a link can never write outside save_dir
        file_name = Path(urlsplit(file_url).path).name
        if not file_name.endswith('.csv'):
            continue

        # Download the CSV file
        print(f"Downloading {file_name}...")
        try:
            csv_response = requests.get(file_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to download {file_url}: {exc}")
            failed += 1
            continue
        if csv_response.status_code != 200:
            # Do not save an error page under a .csv name
            print(f"Failed to download {file_url} (HTTP {csv_response.status_code})")
            failed += 1
            continue

        # Save the CSV file using pathlib
        file_path = save_dir / file_name
        file_path.write_bytes(csv_response.content)
        downloaded += 1

    if failed:
        print(f"Finished: {downloaded} file(s) downloaded, {failed} failed.")
    else:
        print("All files downloaded!")

# Example usage: fetch every CSV linked from the directory page below
# (per the URL, the hourly-observation listing for "AB") into a local folder.
# NOTE: running this cell performs network requests and writes files to disk.
url = "https://dd.weather.gc.ca/climate/observations/hourly/csv/AB/"
# Machine-specific destination folder; adjust before running elsewhere
download_folder = r"C:\Users\Sunbeam\Downloads\csv_files"
download_csv_files(url, download_folder)

#### Filter and analyze station data and create a merged CSV

In [None]:
import arcpy
import re
import pandas as pd
from pathlib import Path

arcpy.env.overwriteOutput = True

def filter_weather_data(csv_dir, months, shp_polygon, weather_param=None):

    """
    Filters weather data from CSV files based on specified months and spatial polygon, and saves the filtered data as a combined CSV.

    Besides the merged CSV, a point shapefile of the stations located inside the polygon is written
    next to the polygon shapefile, and the number of records per year is printed for each station.
    CSV files that lack any of the required columns are skipped with a message.

    Parameters:
    csv_dir (str): Directory containing the CSV files with weather data.
    months (list): List of months (as integers) to filter the data.
    shp_polygon (str): Path to the shapefile containing the polygon for spatial filtering. Only the first feature is used.
    weather_param (list, optional): List of weather parameters to include in the filtered data. Defaults to None.

    Returns:
    None. Results are written to disk and progress is reported with print().

    Raises:
    ValueError: If the polygon shapefile contains no features.
    """

    default_columns = ['longitude (x)', 'latitude (y)', 'station name', 'climate id', 'date/time (lst)', 'year', 'month', 'day', 'time (lst)']
    weather_param = weather_param or []
    # CSV headers are stripped and lower-cased below, so normalise the requested names the same way
    original_weather_param = [param.strip().lower() for param in weather_param]
    # Letters only, at most 10 characters - presumably to satisfy the shapefile field-name limit
    cleaned_weather_param = [re.sub(r'[^a-zA-Z]', '', param.lower())[:10] for param in weather_param]
    # dict.fromkeys drops duplicates (e.g. a parameter that is already a default column) but keeps order
    selected_columns = list(dict.fromkeys(default_columns + original_weather_param))

    with arcpy.da.SearchCursor(shp_polygon, ["SHAPE@"]) as cursor:
        first_feature = next(cursor, None)
    if first_feature is None:
        raise ValueError(f"No polygon features found in {shp_polygon}")
    bbox_polygon = first_feature[0]

    # Station coordinates are treated as WGS84 longitude/latitude; built once and reused for every point
    wgs84 = arcpy.SpatialReference(4326)

    station_dict, lat_long_id_dict, all_data = {}, {}, []

    for csv_file in Path(csv_dir).glob("*.csv"):
        try:
            weather_data = pd.read_csv(csv_file, encoding='utf-8', on_bad_lines='skip', engine='python')
        except UnicodeDecodeError:
            weather_data = pd.read_csv(csv_file, encoding='ISO-8859-1', on_bad_lines='skip', engine='python')

        weather_data.columns = weather_data.columns.str.strip().str.lower()

        # Every selected column is needed further down; skip unusable files instead of failing mid-run
        missing_columns = [col for col in selected_columns if col not in weather_data.columns]
        if missing_columns:
            print(f"Skipping {csv_file.name}: missing column(s) {missing_columns}")
            continue

        # Unparseable timestamps become NaT and are dropped by the month filter
        weather_data['date/time (lst)'] = pd.to_datetime(weather_data['date/time (lst)'], errors='coerce')
        weather_data = weather_data[weather_data['date/time (lst)'].dt.month.isin(months)]
        if weather_data.empty:
            continue

        longitude_col = next(col for col in weather_data.columns if 'longitude' in col)
        latitude_col = next(col for col in weather_data.columns if 'latitude' in col)

        # Test each distinct coordinate once instead of building a geometry for every record
        inside_points = {}
        unique_coords = weather_data[[longitude_col, latitude_col]].dropna().drop_duplicates()
        for lon, lat in unique_coords.itertuples(index=False, name=None):
            point = arcpy.PointGeometry(arcpy.Point(lon, lat), wgs84)
            if point.within(bbox_polygon):
                inside_points[(lon, lat)] = point

        coord_pairs = zip(weather_data[longitude_col], weather_data[latitude_col])
        is_inside = pd.Series([pair in inside_points for pair in coord_pairs], index=weather_data.index)
        weather_data = weather_data[is_inside]
        if weather_data.empty:
            continue

        # Register each (longitude, latitude, climate id) combination the first time it is seen
        station_rows = weather_data[[longitude_col, latitude_col, 'climate id', 'station name']].drop_duplicates()
        for lon, lat, climate_id, station_name in station_rows.itertuples(index=False, name=None):
            lat_long_id_key = (lon, lat, climate_id)
            if lat_long_id_key not in lat_long_id_dict:
                location = inside_points[(lon, lat)]
                lat_long_id_dict[lat_long_id_key] = {
                    'station_name': station_name,
                    'climate_id': climate_id,
                    'location': location
                }
                station_dict.setdefault((station_name, climate_id), location)

        all_data.append(weather_data[selected_columns])

    print(f"Total unique stations processed from all CSVs: {len(station_dict)}")

    if not all_data:
        # pd.concat raises on an empty list, so stop here with an explicit message
        print("No records matched the requested months and polygon; nothing was written.")
        return

    merged_data = pd.concat(all_data, ignore_index=True)

    # Per-station record counts by year, restricted to the rows of that exact station
    record_years = merged_data['date/time (lst)'].dt.year
    for station_name, climate_id in station_dict:
        belongs_to_station = (merged_data['station name'] == station_name) & (merged_data['climate id'] == climate_id)
        year_counts = record_years[belongs_to_station].value_counts().sort_index()
        year_info = ', '.join(f"{year} ({count})" for year, count in year_counts.items())
        print(f"{station_name} (ID: {climate_id}) - {year_info}")

    output_dir = Path(shp_polygon).parent
    name_suffix = f"{Path(shp_polygon).stem}_{'_'.join(map(str, months))}"

    shapefile_name = output_dir / f"filtered_stations_{name_suffix}.shp"
    stations = list(lat_long_id_dict.values())
    arcpy.CopyFeatures_management([station['location'] for station in stations], str(shapefile_name))

    arcpy.management.AddFields(str(shapefile_name), [["station_n", "TEXT"], ["climate_id", "TEXT"]] + [[field, "TEXT"] for field in cleaned_weather_param])

    # Relies on the features being stored in the same order as the geometries passed to CopyFeatures
    with arcpy.da.UpdateCursor(str(shapefile_name), ["station_n", "climate_id"] + cleaned_weather_param) as cursor:
        for row, station in zip(cursor, stations):
            row[0], row[1] = station['station_name'], station['climate_id']
            cursor.updateRow(row)

    print(f"Filtered shapefile saved at: {shapefile_name}")

    merged_csv_path = output_dir / f"merged_weather_data_{name_suffix}.csv"
    # Weather-parameter columns are exported under their cleaned (letters-only, truncated) names
    merged_data.rename(columns=dict(zip(original_weather_param, cleaned_weather_param)), inplace=True)
    merged_data.to_csv(merged_csv_path, index=False)
    print(f"Merged CSV file saved at: {merged_csv_path}")

# Inputs for the filtering run. The paths are machine-specific; adjust before running elsewhere.
csv_dir = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\Station_Data_csv"
# Month numbers to keep, compared against the month of each record's timestamp
months = [1]
# Alternative polygon (kept for quick switching):
# shp_polygon = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\BBox_small.shp"
shp_polygon = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\BBox_large.shp"
# Extra columns to carry into the output; names are lower-cased before matching the CSV headers
weather_param = ['Temp (°C)', 'Wind Dir (10s deg)']

# Writes the filtered-station shapefile and the merged CSV next to shp_polygon
filter_weather_data(csv_dir, months, shp_polygon, weather_param)

#### Count wind directions in two classes (210-280 and all remaining directions) and write the counts to a CSV

In [None]:
import pandas as pd
from pathlib import Path

def process_weather_data(input_csv, aggregation='yearly'):

    """
    Processes weather station data from a CSV file, counts wind directions within specific groups, and outputs a pivoted CSV file.

    Rows without a wind direction are ignored. The remaining rows are counted per station, climate id,
    aggregation period and month in two classes: directions from 210 to 280 degrees (inclusive) and
    all other directions. The result is written next to the input file as
    "<input stem>_<aggregation>_pivoted.csv".

    Parameters:
    - input_csv: Path to the input CSV file containing weather station data. It must provide the
      columns 'station name', 'climate id', 'year', 'month' and 'winddirsde'.
    - aggregation: Aggregation level, either 'yearly' or '5-yearly'.

    Raises:
    - ValueError: If aggregation is neither 'yearly' nor '5-yearly'.
    """

    # Validate first so a bad argument fails immediately, before any file is read
    if aggregation not in ('yearly', '5-yearly'):
        raise ValueError("Invalid aggregation level. Please specify either 'yearly' or '5-yearly'.")

    df = pd.read_csv(input_csv, low_memory=False).dropna(subset=['winddirsde'])
    df['station name'] = df['station name'].astype(str).str.strip()
    df['climate id'] = df['climate id'].astype(str).str.strip()
    # The stored value is multiplied by 10 to obtain the direction that is classified below
    df['actual_wind_dir'] = df['winddirsde'] * 10

    # between() is inclusive on both ends; every row falls into exactly one of the two classes
    in_target_range = df['actual_wind_dir'].between(210, 280)
    df['wind_dir_210_280'] = in_target_range.astype(int)
    df['wind_dir_281_209'] = (~in_target_range).astype(int)

    if aggregation == '5-yearly':
        # Five-year bins aligned to multiples of five, labelled e.g. "2020-2024"
        period_start = df['year'] // 5 * 5
        df['aggregation_period'] = period_start.astype(str) + '-' + (period_start + 4).astype(str)
    else:
        df['aggregation_period'] = df['year'].astype(str)

    pivot_table = df.pivot_table(
        values=['wind_dir_210_280', 'wind_dir_281_209'],
        index=['station name', 'climate id', 'aggregation_period', 'month'],
        aggfunc='sum'
    ).reset_index()

    output_csv = Path(input_csv).with_name(f"{Path(input_csv).stem}_{aggregation}_pivoted.csv")
    pivot_table.to_csv(output_csv, index=False)
    print(f"Processed data has been saved to {output_csv}")

# Example usage: summarise a merged weather CSV by year.
# The path is machine-specific; the pivoted CSV is written to the same folder as the input.
input_csv_path = r"D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\merged_weather_data_BBox_large_1.csv"
process_weather_data(input_csv_path, aggregation='yearly')


Processed data has been saved to D:\UCalgary_Lectures\GEOG_683\Data_workspace\BBox\merged_weather_data_BBox_large_1_yearly_pivoted.csv
