In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before
# running each cell, so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Enable the dotenv IPython extension and load variables from a .env file.
%load_ext dotenv
%dotenv

In [2]:
from dotenv import load_dotenv
import os
import pandas as pd
import yaml, json, time
from src.data_fetcher import (
    fetch_weather_data,
    fetch_energy_data,
)



In [3]:
# Load variables from the .env file into the process environment, then read
# the two API tokens.
load_dotenv()

# NOTE(review): "NAOO_TOKEN" looks like a misspelling of NOAA, but the key must
# match whatever the .env file defines, so it is intentionally left unchanged.
token = os.environ.get("NAOO_TOKEN")
eia_token = os.environ.get("EIA_KEY")

# Treat an empty value (e.g. `EIA_KEY=` in .env) as missing as well, and name
# the offending variable(s) so the problem can be fixed without guesswork.
missing_vars = [
    name
    for name, value in (("NAOO_TOKEN", token), ("EIA_KEY", eia_token))
    if not value
]

if missing_vars:
    print(f"Error retrieving token: {', '.join(missing_vars)} not set")
else:
    print("Token retrieved successfully")

Token retrieved successfully


In [4]:


# Parse the YAML configuration file and pull out its "cities" entry.
config_path = "../config/config.yaml"
with open(config_path) as config_file:
    data = yaml.safe_load(config_file)

# Keep the list of city entries handy for the fetch loops further down.
cities_list = data["cities"]
print(cities_list)

[{'city': 'New York', 'state': 'New York', 'station': 'GHCND:USW00094728', 'region': 'NYIS', 'timezone': 'Eastern'}, {'city': 'Chicago', 'state': 'Illinois', 'station': 'GHCND:USW00094846', 'region': 'PJM', 'timezone': 'Eastern'}, {'city': 'Houston', 'state': 'Texas', 'station': 'GHCND:USW00012960', 'region': 'ERCO', 'timezone': 'Central'}, {'city': 'Phoenix', 'state': 'Arizona', 'station': 'GHCND:USW00023183', 'region': 'AZPS', 'timezone': 'Mountain'}, {'city': 'Seattle', 'state': 'Washington', 'station': 'GHCND:USW00024233', 'region': 'SCL', 'timezone': 'Pacific'}]


In [5]:
# Fetch weather records for every configured city and collect them in a single
# raw CSV file. The first successful write creates/truncates the file and emits
# the header; later writes append rows only.
output_file = "../data/raw/all_weather.csv"
start_date = "2025-03-01"
end_date = "2025-07-26"
first = True            # True until the first frame has been written
header_columns = None   # column order of the header row, fixed by the first write

for city in cities_list:
    # Single quotes inside the f-string: reusing the outer quote character is
    # only valid syntax on Python 3.12+.
    print(f"Fetching data for {city['city']}...")

    result = fetch_weather_data(city["station"], start_date, end_date)

    # Guard against a response without a usable "results" entry instead of
    # failing with a KeyError/TypeError halfway through the run.
    records = result.get("results") if isinstance(result, dict) else None

    if not records:
        print(f"No results returned for {city['city']}; nothing written. Sleeping for 10 Seconds…")
    else:
        df = pd.DataFrame(records)
        df["city"] = city["city"]
        df["state"] = city["state"]

        # Appended rows carry no header, so they must follow the column order
        # of the header that was written first; otherwise values would land
        # under the wrong column names.
        if header_columns is None:
            header_columns = list(df.columns)
        elif list(df.columns) != header_columns:
            print(f"Note: columns for {city['city']} differ from the header; realigning.")
            df = df.reindex(columns=header_columns)

        # Write to CSV with mode (w) at the first write, then (a) for subsequent ones
        df.to_csv(output_file,
                  mode="w" if first else "a",
                  header=first,
                  index=False)
        first = False

        print(f"Saved {len(df)} rows for {city['city']}. Sleeping for 10 Seconds…")

    print()
    print("-"*50)
    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(10)

Fetching data for New York...
Saved 294 rows for New York. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Chicago...
Saved 294 rows for Chicago. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Houston...
Saved 296 rows for Houston. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Phoenix...
Saved 296 rows for Phoenix. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Seattle...
Saved 296 rows for Seattle. Sleeping for 10 Seconds…

--------------------------------------------------


# fetch initial energy data

In [6]:
# Show the city configuration again; the energy fetch below reads each entry's
# "region" and "timezone" fields.
print(cities_list)

[{'city': 'New York', 'state': 'New York', 'station': 'GHCND:USW00094728', 'region': 'NYIS', 'timezone': 'Eastern'}, {'city': 'Chicago', 'state': 'Illinois', 'station': 'GHCND:USW00094846', 'region': 'PJM', 'timezone': 'Eastern'}, {'city': 'Houston', 'state': 'Texas', 'station': 'GHCND:USW00012960', 'region': 'ERCO', 'timezone': 'Central'}, {'city': 'Phoenix', 'state': 'Arizona', 'station': 'GHCND:USW00023183', 'region': 'AZPS', 'timezone': 'Mountain'}, {'city': 'Seattle', 'state': 'Washington', 'station': 'GHCND:USW00024233', 'region': 'SCL', 'timezone': 'Pacific'}]


The idea is to extract the demand and the net generation separately and then merge the two datasets.

In [7]:
# Fetch energy records for every configured city, one request per series type,
# and collect them in a single raw CSV file. The first successful write
# creates/truncates the file and emits the header; later writes append rows.
output_file = "../data/raw/all_energy.csv"
first = True            # True until the first frame has been written
header_columns = None   # column order of the header row, fixed by the first write

start = "2025-03-01"
end = "2025-07-26"
# Series codes passed to the fetcher; per the accompanying note these are
# demand ("D") and net generation ("NG").
series_types = ["D", "NG"]

for city in cities_list:
    # Single quotes inside the f-string: reusing the outer quote character is
    # only valid syntax on Python 3.12+.
    print(f"Fetching data for {city['city']}...")

    region = city["region"]
    timezone = city["timezone"]

    # The loop variable is deliberately not called `type`: at notebook scope
    # that would shadow the builtin for every cell executed afterwards.
    for series_type in series_types:
        energy_result = fetch_energy_data(region=region, types=series_type, timezone=timezone, start=start, end=end)

        # Guard against a response without a usable ["response"]["data"] entry
        # instead of failing with a KeyError/TypeError halfway through the run.
        response = energy_result.get("response") if isinstance(energy_result, dict) else None
        records = response.get("data") if isinstance(response, dict) else None

        if not records:
            print(f"No data returned for {city['city']} and {series_type}; nothing written.")
            print()
            continue

        df = pd.DataFrame(records)
        df["city"] = city["city"]
        df["state"] = city["state"]

        # Appended rows carry no header, so they must follow the column order
        # of the header that was written first.
        if header_columns is None:
            header_columns = list(df.columns)
        elif list(df.columns) != header_columns:
            print(f"Note: columns for {city['city']} and {series_type} differ from the header; realigning.")
            df = df.reindex(columns=header_columns)

        # Write to CSV with mode (w) at the first write, then (a) for subsequent ones
        df.to_csv(output_file,
                  mode="w" if first else "a",
                  header=first,
                  index=False)
        first = False

        print(f"Saved {len(df)} rows for {city['city']} and {series_type}.")
        print()

    # The pause happens once per city (after all of its series), so announce
    # it here rather than after every individual series.
    print("Sleeping for 10 Seconds…")
    print("-"*50)
    time.sleep(10)

Fetching data for New York...
Saved 147 rows for New York and D. Sleeping for 10 Seconds…

Saved 147 rows for New York and NG. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Chicago...
Saved 147 rows for Chicago and D. Sleeping for 10 Seconds…

Saved 147 rows for Chicago and NG. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Houston...
Saved 147 rows for Houston and D. Sleeping for 10 Seconds…

Saved 147 rows for Houston and NG. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Phoenix...
Saved 147 rows for Phoenix and D. Sleeping for 10 Seconds…

Saved 147 rows for Phoenix and NG. Sleeping for 10 Seconds…

--------------------------------------------------
Fetching data for Seattle...
Saved 147 rows for Seattle and D. Sleeping for 10 Seconds…

Saved 147 rows for Seattle and NG. Sleeping for 10 Seconds…

-------------------------------------------

promise
