In [None]:
from datetime import datetime
from pathlib import Path
from typing import TypedDict

import chardet
import matplotlib.pyplot as plt
import pandas as pd
from meteostat import Daily, Point

In [None]:
# CSV inputs live in datasets/csv under the grandparent of the current working directory.
dataset_path = Path.cwd().parents[1].joinpath("datasets", "csv")

In [None]:
class EncodingResult(TypedDict):
    """Typed view of the result mapping that ``check_encoding`` gets from ``chardet.detect``."""
    # Encoding name; annotated as possibly None.
    encoding: str | None
    # NOTE(review): presumably chardet's confidence score for the guess -- confirm against chardet docs.
    confidence: float
    # NOTE(review): presumably the detected language, when there is one -- confirm against chardet docs.
    language: str | None


def check_encoding(file_path: str) -> str | None:
    """Return the encoding that chardet reports for the file at *file_path*.

    The whole file is read as bytes and handed to ``chardet.detect``; the
    ``"encoding"`` entry of the result is returned.
    """
    with open(file_path, "rb") as binary_file:
        raw_bytes = binary_file.read()

    detection: EncodingResult = chardet.detect(byte_str=raw_bytes)
    return detection["encoding"]


def convert_to_utf8(file_path: str, encoding: str | None) -> None:
    """Converts a file to UTF-8 encoding."""
    # Open the file with the detected encoding
    with open(file=file_path, mode="r", encoding=encoding) as f:
        lines: list[str] = f.readlines()

    # Remove the line that starts with 'prefix:'
    lines = [line for line in lines if not line.startswith("prefix:")]

    # Write the lines back out in UTF-8
    with open(file=file_path, mode="w", encoding="utf-8") as f:
        f.writelines(lines)


In [None]:
# Detect the encoding of the MeteoSwiss CSV; the result is this cell's value
check_encoding(file_path=str(dataset_path / "meteoswiss.csv"))

In [None]:
# Load every source CSV and stack them into one frame with a fresh 0..n-1 index.
# meteoswiss.csv is read separately because it needs an explicit encoding.
df = pd.concat(
    [
        *(
            pd.read_csv(dataset_path / csv_name)
            for csv_name in (
                "washingtondc.csv",
                "liestal.csv",
                "kyoto.csv",
                "vancouver.csv",
                "south_korea.csv",
                "japan.csv",
                "nyc.csv",
            )
        ),
        pd.read_csv(dataset_path / "meteoswiss.csv", encoding="ISO-8859-1"),
    ],
    ignore_index=True,
)

In [None]:
# Function to split location
def split_location(location):
    """Split a ``"country/city"`` location value into ``[country, city]``.

    Only the first ``/`` acts as the separator, so any further slashes stay
    in the city part. A string without ``/`` is returned as
    ``[None, location]``.

    Non-string values (for example ``None``, or the ``NaN`` pandas uses for
    a missing cell) are likewise returned as ``[None, location]`` instead of
    raising ``AttributeError``.
    """
    if not isinstance(location, str):
        return [None, location]

    country, separator, city = location.partition("/")
    if separator:
        return [country, city]
    return [None, location]
    
# Split every location, then expand each [country, city] pair into the two new columns
location_parts = df["location"].apply(split_location)
df[["country", "city"]] = location_parts.apply(pd.Series)

In [None]:
# Preview the first five rows of the combined frame
df.head(n=5)

In [None]:
# Keep only the first row encountered for each distinct city
unique_cities_df = df.drop_duplicates(subset="city", keep="first")

In [None]:
# Preview the first five rows of the per-city frame
unique_cities_df.head(n=5)

In [None]:
# Characters that look like a minus sign but that float() rejects
# (en dash, Unicode minus sign), mapped to the ASCII hyphen-minus.
_DASH_TO_MINUS = str.maketrans({"\u2013": "-", "\u2212": "-"})


def _parse_coordinate(value):
    """Return *value* as a float when it is a string, repairing dash look-alikes.

    Non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return float(value.translate(_DASH_TO_MINUS))
    return value


# Step 1: Initialize an empty list to store weather data for all cities
all_weather_data = []

# The date range is identical for every city, so build it once up front
start_date = datetime(2023, 1, 1)  # Full year example
end_date = datetime(2023, 12, 31)

# Step 2: Loop through each row (city) in unique_cities_df to fetch temperature data
for _, row in unique_cities_df.iterrows():
    city = row["city"]
    lat = _parse_coordinate(row["lat"])
    long = _parse_coordinate(row["long"])

    # Create a Meteostat Point using latitude and longitude
    location = Point(lat, long)

    # Fetch daily weather data for the date range
    weather_data = Daily(location, start_date, end_date).fetch()

    # Nothing to record for this city
    if weather_data.empty:
        continue

    # Reset index to bring the date (time) into the DataFrame as a regular column
    weather_data = weather_data.reset_index()

    # Add columns for city and day_of_year
    weather_data["day_of_year"] = weather_data["time"].dt.dayofyear
    weather_data["city"] = city

    # Append the cleaned data to the list
    all_weather_data.append(
        weather_data[["time", "city", "day_of_year", "tavg", "tmin", "tmax"]]
    )

# Step 3: Concatenate all fetched data into a single DataFrame
weather_data_df = pd.concat(all_weather_data, ignore_index=True)

# index=False: the index is just 0..n-1 after the concat, and writing it would
# come back as a stray unnamed column when the file is read again.
weather_data_df.to_csv("all_dates.csv", index=False)

In [None]:
# Load the saved weather table back from disk and preview its first five rows
df2 = pd.read_csv(filepath_or_buffer="all_dates.csv")
df2.head(n=5)

In [None]:
# Left join: keep every weather row and attach the bloom records whose
# city and bloom_date equal that row's city and time
merged_df = df2.merge(
    right=df,
    how="left",
    left_on=["city", "time"],
    right_on=["city", "bloom_date"],
    suffixes=("_temp", "_bloom"),
)

In [None]:
# Convert the 'time' column to pandas datetimes
time_values = merged_df["time"]
merged_df["time"] = pd.to_datetime(time_values)

In [None]:
# Extract the year from the 'time' column for plotting
merged_df["temp_year"] = merged_df["time"].dt.year

# Group data by city and temp_year (derived from time)
city_year_groups = merged_df.groupby(["city", "temp_year"])

# (column, legend label, line colour) for each temperature series drawn
temperature_series = [
    ("tavg", "Daily Avg Temp", "blue"),
    ("tmin", "Daily Min Temp", "green"),
    ("tmax", "Daily Max Temp", "orange"),
]

# Loop through each city-year combination
for (city, year), data in city_year_groups:
    # Drop rows where tavg is NaN
    city_year_data = data.dropna(subset=["tavg"])

    # groupby never yields an empty group, so the useful check is whether any
    # plottable rows remain; without them the figure would be blank.
    if city_year_data.empty:
        continue

    # Plot temperature over the year
    plt.figure(figsize=(12, 6))
    for column, series_label, series_color in temperature_series:
        plt.plot(
            city_year_data["day_of_year"],
            city_year_data[column],
            label=series_label,
            color=series_color,
        )

    # Find the bloom date for this city and year (if available)
    bloom_data = data.dropna(subset=["bloom_doy"])
    if not bloom_data.empty:
        bloom_doy = bloom_data["bloom_doy"].values[0]
        bloom_temp = city_year_data.loc[
            city_year_data["day_of_year"] == bloom_doy, "tavg"
        ].values

        # Only mark the bloom day when an average temperature exists for it
        if len(bloom_temp) > 0:
            bloom_temp = bloom_temp[0]
            # Add marker for bloom day
            plt.scatter(
                bloom_doy,
                bloom_temp,
                color="red",
                s=100,
                zorder=5,
                label="Peak Bloom",
            )

    # Add titles and labels
    plt.title(f"Temperature and Peak Bloom in {city.capitalize()}, {year}")
    plt.xlabel("Day of Year")
    # Degree sign written as an escape so it survives re-encoding of this file
    plt.ylabel("Average Temperature (\u00b0C)")
    plt.legend()
    plt.tight_layout()

    # Display the plot
    plt.show()
