In [None]:
import requests, pandas as pd, pathlib

# Create the raw-data directory up front; a no-op when it already exists.
pathlib.Path("data", "raw").mkdir(exist_ok=True, parents=True)

# Latitude / longitude of each city to download, keyed by lowercase city name.
CITIES = dict(
    london=dict(lat=51.5072, lon=-0.1276),
    manchester=dict(lat=53.4808, lon=-2.2426),
    edinburgh=dict(lat=55.9533, lon=-3.1883),
)

# Date range sent to the archive API as start_date / end_date.
START = "2020-01-01"
END = "2024-12-31"

# Daily variables to request; they are comma-joined into one query parameter.
DAILY = [
    "temperature_2m_mean",
    "precipitation_sum",
]

# The download loop appends one DataFrame per city to this list.
frames = list()

# Download the daily archive for every city in CITIES and append one
# DataFrame per city to `frames`.
url = "https://archive-api.open-meteo.com/v1/archive"  # same endpoint for every city

print("Fetching weather data... 🌦️\n")
# One Session lets the per-city requests reuse a pooled connection.
with requests.Session() as session:
    for name, c in CITIES.items():
        print(f"→ Downloading data for {name.title()} ...")
        params = {
            "latitude": c["lat"],
            "longitude": c["lon"],
            "start_date": START,
            "end_date": END,
            "daily": ",".join(DAILY),
            "timezone": "Europe/London",
        }
        r = session.get(url, params=params, timeout=60)
        r.raise_for_status()  # fail loudly on HTTP errors rather than parsing an error body
        # The payload's "daily" entry becomes the frame's columns; it carries a
        # "time" column that is converted to a proper datetime "date" column.
        df = pd.DataFrame(r.json()["daily"])
        df["date"] = pd.to_datetime(df["time"])
        df["city"] = name
        frames.append(df.drop(columns=["time"]))
        print(f"   ✅ {len(df)} rows fetched.\n")

# Combine all cities into one long DataFrame with a fresh 0..n-1 index
raw = pd.concat(frames, ignore_index=True)

# Save to CSV (the later cells read this file back)
raw.to_csv("data/raw/uk_weather_daily.csv", index=False)

# --- ✅ Verification summary (shows in main output) ---
print("\n✅ Data fetch complete!")
print(f"Total rows: {len(raw):,}")
print("Cities included:", raw['city'].unique())
print("\nRow count per city:")
print(raw['city'].value_counts())

# Show the first two rows of each city, in order of first appearance
print("\nSample data:")
for city in raw['city'].unique():
    print(f"\n{city.title()} sample:")
    display(raw[raw['city']==city].head(2))



In [None]:
# Round-trip check: read back the CSV written by the fetch cell.
df = pd.read_csv(filepath_or_buffer="data/raw/uk_weather_daily.csv")
print(df.shape)  # (rows, columns)
df.head(n=5)

## Notes (Day 2)
# Pulled 2020–2024 weather data for London, Manchester, Edinburgh via Open-Meteo API.
# Dataset saved at `data/raw/uk_weather_daily.csv` (~5K rows).
# Verified columns, dates, and missing values.
# Next step → clean and feature-engineer the data.


In [None]:
# -----------------------------------------------------------
# STEP 2 – Data Cleaning and Feature Engineering
# -----------------------------------------------------------

import pandas as pd, numpy as np
from pathlib import Path

# Read the raw CSV, parsing the "date" column into datetimes on load.
df = pd.read_csv(
    "data/raw/uk_weather_daily.csv",
    parse_dates=["date"],
)

# First look: frame size, distinct city labels, and per-column missing counts.
print("Shape:", df.shape)
print("Cities:", pd.unique(df["city"]))
print("\nInitial Null Counts:\n", df.isnull().sum())

# --- Data Type Fixes and Missing Value Handling ---
# Coerce both measurement columns to numeric; anything unparseable becomes NaN
# so it is handled by the fill steps below.
df["temperature_2m_mean"] = pd.to_numeric(df["temperature_2m_mean"], errors="coerce")
df["precipitation_sum"]   = pd.to_numeric(df["precipitation_sum"], errors="coerce")

# Sort by city then date so interpolation runs along each city's timeline.
# mergesort is a stable sort, so rows with equal keys keep their original order.
df = df.sort_values(["city", "date"], kind="mergesort")

# Interpolate missing temperature within each city only.
# limit_direction="both" also fills gaps at the start and end of a series.
df["temperature_2m_mean"] = (
    df.groupby("city")["temperature_2m_mean"]
      .transform(lambda s: s.interpolate(limit_direction="both"))
)

# Replace missing precipitation with 0 (treats a missing reading as a dry day)
df["precipitation_sum"] = df["precipitation_sum"].fillna(0.0)

print("\n✅ Missing values handled successfully.")


# --- Feature Engineering ---
# Add Year and Month columns, stored in small integer dtypes
df["year"]  = df["date"].dt.year.astype("int16")
df["month"] = df["date"].dt.month.astype("int8")

# Month number -> season name; December is grouped with January and February
season_map = {
    12:"Winter", 1:"Winter", 2:"Winter",
     3:"Spring", 4:"Spring", 5:"Spring",
     6:"Summer", 7:"Summer", 8:"Summer",
     9:"Autumn",10:"Autumn",11:"Autumn"
}

# Ordered categorical so seasons sort Winter -> Spring -> Summer -> Autumn
# instead of alphabetically.
df["season"] = pd.Categorical(
    df["month"].map(season_map),
    categories=["Winter","Spring","Summer","Autumn"],
    ordered=True
)
print("✅ Added date features: year, month, season.")


# --- Validation Checks ---
# Every column the later steps rely on must be fully populated.
key_cols = ["date","city","temperature_2m_mean","precipitation_sum","year","month","season"]
nulls = df[key_cols].isna().sum()
print("\nNull counts:\n", nulls)
assert nulls.sum() == 0, "⚠️ There are still missing values."

# Temperature sanity check: more than 99% of values must fall within [-30, 40]
assert df["temperature_2m_mean"].between(-30, 40).mean() > 0.99, "⚠️ Temperature range seems off."
assert (df["precipitation_sum"] >= 0).all(), "⚠️ Negative precipitation values found."

# Each (city, date) pair must appear exactly once; duplicated days would
# inflate sums downstream without being caught by the coverage check below.
assert not df.duplicated(subset=["city", "date"]).any(), "⚠️ Duplicate city/date rows found."

# Equal date coverage across cities: every city must have the same number of distinct days
counts = df.groupby("city")["date"].nunique()
print("\nUnique days per city:\n", counts)
assert counts.nunique() == 1, "⚠️ Unequal date coverage across cities."

# Date range check (informational only)
print("\nDate range:", df["date"].min(), "→", df["date"].max())
print("✅ All validation checks passed.")


# --- Save Outputs ---
# Write the cleaned frame as both CSV and Parquet under data/processed.
out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

csv_out = out_dir / "uk_weather_clean.csv"
parquet_out = out_dir / "uk_weather_clean.parquet"

# index=False: the row index carries no information after sorting
df.to_csv(csv_out, index=False)
df.to_parquet(parquet_out, index=False)

print(f"\n✅ Clean data saved successfully:\n- {csv_out}\n- {parquet_out}")


# --- Notes (inline for reference) ---
# - Converted values to numeric
# - Interpolated missing temperatures per city
# - Filled precipitation NaNs with 0
# - Added date features (year, month, season)
# - Validated data (no nulls, balanced dates, sane ranges)
# - Exported to CSV and Parquet for later ETL work


In [None]:
# -----------------------------------------------------------
# STEP 3 – EDA & VISUALIZATION
# - Loads cleaned dataset from /data/processed
# - Creates 3 visuals:
#   (1) Daily temperature trend lines per city
#   (2) Monthly precipitation bars for the latest full year
#   (3) Boxplot of daily temperature by season (per city)
# - Computes a city-level summary table
# - Saves all outputs to /figures and /data/processed
# -----------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# ---------- Load cleaned data ----------
# Read the CSV produced by the cleaning step; stop early with a hint if it is missing.
data_path = Path("data/processed/uk_weather_clean.csv")
assert data_path.exists(), "Clean file not found. Complete Day 3 first."
df = pd.read_csv(data_path, parse_dates=["date"])

# ---------- Basic sanity prints ----------
print("Rows:", len(df), "| Cities:", df["city"].nunique(), df["city"].unique())
print("Date span:", df["date"].min().date(), "→", df["date"].max().date())

# ---------- Ensure output folders ----------
# All figures from this cell are written under ./figures
fig_dir = Path("figures")
fig_dir.mkdir(parents=True, exist_ok=True)

# ---------- (1) Daily temperature trend: one figure per city ----------
# Rationale: simple, readable time-series for portfolio screenshots.
# Sorting by date first keeps each city's line in chronological order.
for city, sub in df.sort_values("date").groupby("city"):
    # Explicit figure/axes objects avoid relying on pyplot's "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 4.5))
    ax.plot(sub["date"], sub["temperature_2m_mean"])
    ax.set_title(f"Daily Mean Temperature – {city.title()}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Temperature (°C)")
    fig.tight_layout()
    out = fig_dir / f"temp_trend_{city}.png"
    fig.savefig(out, dpi=180)
    plt.close(fig)  # release the figure so the loop does not accumulate open figures
    print("Saved:", out)

# ---------- (2) Monthly precipitation for latest full year ----------
# Find the latest year in which EVERY city has all 12 months (balanced comparison).
# A year only counts as "full" when the number of cities with 12 distinct months
# equals the total number of cities; a year complete for just one city is skipped.
year_month_counts = df.groupby(["city", "year"])["month"].nunique().reset_index()
n_cities = df["city"].nunique()
cities_with_full_year = (
    year_month_counts.loc[year_month_counts["month"] == 12]
    .groupby("year")["city"]
    .nunique()
)
full_years = cities_with_full_year[cities_with_full_year == n_cities].index.to_series()
assert not full_years.empty, "No full-year coverage detected."
target_year = int(full_years.max())  # latest full year
print("Latest full year selected for bars:", target_year)

# Per city and month: total precipitation and mean temperature for the target year
monthly = (df[df["year"] == target_year]
           .groupby(["city", "month"], as_index=False)
           .agg(precip_sum=("precipitation_sum", "sum"),
                temp_mean=("temperature_2m_mean", "mean")))

# Bar chart: monthly precipitation by city (one bar per city within each month)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=monthly, x="month", y="precip_sum", hue="city", ax=ax)
ax.set_title(f"Monthly Precipitation by City – {target_year}")
ax.set_xlabel("Month")
ax.set_ylabel("Precipitation (mm)")
fig.tight_layout()
out = fig_dir / f"precip_bar_{target_year}.png"
fig.savefig(out, dpi=180)
plt.close(fig)
print("Saved:", out)

# ---------- (3) Seasonal temperature distribution ----------
# Boxplot helps show spread/variation; good interview talking point.
# The season column comes back from CSV as plain strings, so the axis order is
# given explicitly instead of depending on the order rows happen to appear in.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=df,
    x="season",
    y="temperature_2m_mean",
    hue="city",
    order=["Winter", "Spring", "Summer", "Autumn"],
    ax=ax,
)
ax.set_title("Daily Temperature by Season and City")
ax.set_xlabel("Season")
ax.set_ylabel("Temperature (°C)")
fig.tight_layout()
out = fig_dir / "temp_boxplot_season_city.png"
fig.savefig(out, dpi=180)
plt.close(fig)
print("Saved:", out)

# ---------- Summary table (city-level) ----------
# One row per city: distinct days covered, mean/min/max of the daily mean
# temperature, and total precipitation. Handy numbers for the README/portfolio.
summary = df.groupby("city").agg(
    days=pd.NamedAgg(column="date", aggfunc="nunique"),
    temp_mean=pd.NamedAgg(column="temperature_2m_mean", aggfunc="mean"),
    temp_min=pd.NamedAgg(column="temperature_2m_mean", aggfunc="min"),
    temp_max=pd.NamedAgg(column="temperature_2m_mean", aggfunc="max"),
    precip_total=pd.NamedAgg(column="precipitation_sum", aggfunc="sum"),
)
# Temperatures to 2 decimals, precipitation to 1; then turn "city" back into a column.
summary = summary.round(
    {"temp_mean": 2, "temp_min": 2, "temp_max": 2, "precip_total": 1}
)
summary = summary.reset_index()

print("\nCity-level summary:\n", summary)

# Persist the table next to the cleaned data so later steps can reload it.
out_table = Path("data/processed", "city_summary.csv")
summary.to_csv(out_table, index=False)
print("Saved summary:", out_table)

# ---------- (optional) Quick peek tables for the notebook output ----------
# (kept tiny so the notebook remains tidy)
display(summary.head())
# First 9 rows of the monthly table once ordered by city, then month
display(monthly.sort_values(["city","month"]).head(9))

print("\n✅ EDA complete: figures exported to /figures and summary table saved.")


In [None]:
# -----------------------------------------------------------
# STEP 4 – DOCUMENTATION & POLISH (error-proof version)
# -----------------------------------------------------------

import pandas as pd
from pathlib import Path

# Load the city-level summary written by the EDA step.
summary_path = Path("data/processed/city_summary.csv")
summary = pd.read_csv(summary_path)

# Look up each city's mean temperature by name. Checking up front gives a
# readable error when a city is absent, instead of an IndexError from inside
# the f-string below.
temp_by_city = summary.set_index("city")["temp_mean"]
readme_cities = ["london", "manchester", "edinburgh"]
missing_cities = [c for c in readme_cities if c not in temp_by_city.index]
if missing_cities:
    raise ValueError(f"{summary_path} has no rows for: {', '.join(missing_cities)}")
city_lines = "\n".join(
    f"  - {c.title()}: {temp_by_city.loc[c]:.1f} °C" for c in readme_cities
)

# The README body is a single f-string; the numbers are filled in from `summary`.
readme_text = f"""# UK Weather Data Cleaner

## Overview
A data cleaning and exploratory analysis project using **Open-Meteo** historical weather data
for London, Manchester, and Edinburgh (2020–2024).

## Key Insights
- Average daily temperature ranges:
{city_lines}
- {int(summary.precip_total.max())} mm total precipitation recorded in the wettest city.
- Dataset: {len(summary)} cities × {summary.days.mean():.0f} days (≈5 years).

## Folder Structure
data/
 ├── raw/              - original downloads
 ├── processed/        - cleaned & summarized datasets
figures/               - exported charts
notebooks/             - Jupyter notebooks

## Outputs
- figures/temp_trend_*.png – Daily temperature trends
- figures/precip_bar_YYYY.png – Monthly precipitation comparison
- figures/temp_boxplot_season_city.png – Seasonal temperature spread
- data/processed/city_summary.csv – Aggregated statistics

## How to Run
1. Clone repo & open in VS Code
2. Ensure Python 3.12 + venv installed
3. Open notebook → Run All
4. Results and figures will appear in /figures & /data/processed
"""

# Write README_TEMPLATE.md (UTF-8 so the non-ASCII symbols above survive)
Path("README_TEMPLATE.md").write_text(readme_text, encoding="utf-8")
print("✅ README_TEMPLATE.md generated — open it in VS Code and refine wording.")
