Exploring temperature data

In [1]:
# Reload imported modules automatically before each cell runs (autoreload mode 2),
# so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the dotenv IPython extension and read variables from a .env file.
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd

In [3]:
# Load the raw weather CSV and preview its first five rows.
df_temp = pd.read_csv(filepath_or_buffer="../data/raw/all_weather.csv")
df_temp.head(n=5)

Unnamed: 0,date,datatype,station,attributes,value,city,state
0,2025-03-01T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",64.0,New York,New York
1,2025-03-01T00:00:00,TMIN,GHCND:USW00094728,",,W,2400",27.0,New York,New York
2,2025-03-02T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",33.0,New York,New York
3,2025-03-02T00:00:00,TMIN,GHCND:USW00094728,",,W,2400",20.0,New York,New York
4,2025-03-03T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",39.0,New York,New York


In [4]:
# Reshape long -> wide: one row per (date, city, state), with each `datatype`
# value turned into its own column holding the corresponding `value`.
# pivot_table's default aggregation (mean) applies if a key appears more than once.
weather_keys = ["date", "city", "state"]
weather_pivot = pd.pivot_table(
    df_temp, values="value", index=weather_keys, columns="datatype"
).reset_index()


In [5]:
# Preview the first ten rows of the pivoted weather table.
weather_pivot.head(n=10)

datatype,date,city,state,TMAX,TMIN
0,2025-03-01T00:00:00,Chicago,Illinois,34.0,22.0
1,2025-03-01T00:00:00,Houston,Texas,85.0,51.0
2,2025-03-01T00:00:00,New York,New York,64.0,27.0
3,2025-03-01T00:00:00,Phoenix,Arizona,85.0,58.0
4,2025-03-01T00:00:00,Seattle,Washington,62.0,37.0
5,2025-03-02T00:00:00,Chicago,Illinois,37.0,18.0
6,2025-03-02T00:00:00,Houston,Texas,74.0,55.0
7,2025-03-02T00:00:00,New York,New York,33.0,20.0
8,2025-03-02T00:00:00,Phoenix,Arizona,80.0,55.0
9,2025-03-02T00:00:00,Seattle,Washington,51.0,44.0


Exploring energy data

In [6]:
# Load the raw energy CSV and preview its first two rows.
energy_df = pd.read_csv(filepath_or_buffer="../data/raw/all_energy.csv")
energy_df.head(n=2)

Unnamed: 0,period,respondent,respondent-name,type,type-name,timezone,timezone-description,value,value-units,city,state
0,2025-07-26,NYIS,New York Independent System Operator,D,Demand,Eastern,Eastern,501454,megawatthours,New York,New York
1,2025-07-25,NYIS,New York Independent System Operator,D,Demand,Eastern,Eastern,575639,megawatthours,New York,New York


In [7]:
# Reshape long -> wide: the descriptive columns below identify a row, and each
# `type-name` value becomes its own column holding the corresponding `value`.
energy_keys = ["period", "respondent-name", "timezone", "city", "state", "value-units"]
energy_pivot = pd.pivot_table(
    energy_df, values="value", index=energy_keys, columns=["type-name"]
).reset_index()

In [8]:
# Align the join key between the two tables.
#
# The rename runs first so that this cell can be re-executed safely: once
# "period" has already been renamed, DataFrame.rename ignores the missing label
# (its default behaviour) and the conversion below still finds a "date" column.
# Converting energy_pivot["period"] before renaming raised KeyError on a re-run.
energy_pivot = energy_pivot.rename(columns={"period": "date"})

# Reduce both columns to plain `datetime.date` objects (no time component) so
# that the weather and energy rows compare equal on "date" when merged.
weather_pivot["date"] = pd.to_datetime(weather_pivot["date"]).dt.date
energy_pivot["date"] = pd.to_datetime(energy_pivot["date"]).dt.date

energy_pivot.head()

type-name,date,respondent-name,timezone,city,state,value-units,Demand,Net generation
0,2025-03-01,Arizona Public Service Company,Mountain,Phoenix,Arizona,megawatthours,82316.0,58875.0
1,2025-03-01,"Electric Reliability Council of Texas, Inc.",Central,Houston,Texas,megawatthours,1059205.0,1057815.0
2,2025-03-01,New York Independent System Operator,Eastern,New York,New York,megawatthours,391122.0,365188.0
3,2025-03-01,"PJM Interconnection, LLC",Eastern,Chicago,Illinois,megawatthours,2110785.0,2206107.0
4,2025-03-01,Seattle City Light,Pacific,Seattle,Washington,megawatthours,25460.0,9262.0


In [9]:
# Inner-join weather and energy rows that share the same date, city and state;
# rows present in only one of the two tables are dropped.
join_keys = ["date", "city", "state"]
merged_df = weather_pivot.merge(energy_pivot, how="inner", on=join_keys)

In [10]:
# Preview the first twenty rows of the merged weather/energy table.
merged_df.head(n=20)

Unnamed: 0,date,city,state,TMAX,TMIN,respondent-name,timezone,value-units,Demand,Net generation
0,2025-03-01,Chicago,Illinois,34.0,22.0,"PJM Interconnection, LLC",Eastern,megawatthours,2110785.0,2206107.0
1,2025-03-01,Houston,Texas,85.0,51.0,"Electric Reliability Council of Texas, Inc.",Central,megawatthours,1059205.0,1057815.0
2,2025-03-01,New York,New York,64.0,27.0,New York Independent System Operator,Eastern,megawatthours,391122.0,365188.0
3,2025-03-01,Phoenix,Arizona,85.0,58.0,Arizona Public Service Company,Mountain,megawatthours,82316.0,58875.0
4,2025-03-01,Seattle,Washington,62.0,37.0,Seattle City Light,Pacific,megawatthours,25460.0,9262.0
5,2025-03-02,Chicago,Illinois,37.0,18.0,"PJM Interconnection, LLC",Eastern,megawatthours,2360071.0,2402257.0
6,2025-03-02,Houston,Texas,74.0,55.0,"Electric Reliability Council of Texas, Inc.",Central,megawatthours,1051763.0,1049151.0
7,2025-03-02,New York,New York,33.0,20.0,New York Independent System Operator,Eastern,megawatthours,417602.0,381132.0
8,2025-03-02,Phoenix,Arizona,80.0,55.0,Arizona Public Service Company,Mountain,megawatthours,79043.0,60650.0
9,2025-03-02,Seattle,Washington,51.0,44.0,Seattle City Light,Pacific,megawatthours,26038.0,9428.0


In [11]:
# Persist the merged table; index=False keeps the row index out of the file.
merged_df.to_csv(
    path_or_buf="../data/processed/weather_energy_data.csv",
    index=False,
)


In [12]:
from src.data_fetcher import (
    get_temp_last_date,
    get_energy_last_date,
    get_cities
)

In [13]:
# Ask the data fetcher for the last date recorded in the raw weather file
# (presumably the newest date across all cities -- confirm in src.data_fetcher),
# then show the value and its type.
start_temp = get_temp_last_date("../data/raw/all_weather.csv")
print(start_temp, type(start_temp), sep="\n")

2025-07-27
<class 'datetime.date'>


In [14]:
# Ask the data fetcher for the last date recorded in the raw energy file
# (presumably the newest date across all cities -- confirm in src.data_fetcher),
# then show the value and its type.
start_energy = get_energy_last_date("../data/raw/all_energy.csv")
print(start_energy, type(start_energy), sep="\n")

2025-07-27
<class 'datetime.date'>


In [15]:
# Read the configured city entries and print each one on its own line.
city = get_cities("../config/config.yaml")

for city_entry in city:
    print(city_entry)

{'city': 'New York', 'state': 'New York', 'station': 'GHCND:USW00094728', 'region': 'NYIS', 'timezone': 'Eastern'}
{'city': 'Chicago', 'state': 'Illinois', 'station': 'GHCND:USW00094846', 'region': 'PJM', 'timezone': 'Eastern'}
{'city': 'Houston', 'state': 'Texas', 'station': 'GHCND:USW00012960', 'region': 'ERCO', 'timezone': 'Central'}
{'city': 'Phoenix', 'state': 'Arizona', 'station': 'GHCND:USW00023183', 'region': 'AZPS', 'timezone': 'Mountain'}
{'city': 'Seattle', 'state': 'Washington', 'station': 'GHCND:USW00024233', 'region': 'SCL', 'timezone': 'Pacific'}


In [16]:
import datetime

# Tomorrow's calendar date, relative to the local clock.
one_day = datetime.timedelta(days=1)
print(datetime.date.today() + one_day)

2025-07-31


In [17]:
# Report where start_energy falls relative to today:
# "hello" = in the future, "world" = in the past, "none" = today.
current_day = datetime.date.today()
print(
    "hello" if start_energy > current_day
    else "world" if start_energy < current_day
    else "none"
)

world


In [15]:
import pandas as pd
# Import only `timedelta`. `from datetime import datetime` would rebind the name
# `datetime` from the module (imported by an earlier cell) to the class, after
# which `datetime.date.today()` / `datetime.timedelta(...)` in those earlier
# cells fail with AttributeError when re-run.
from datetime import timedelta

# 1. Load the existing weather CSV, parsing the "date" column on read.
weather_path = "../data/raw/all_weather.csv"
df = pd.read_csv(weather_path, parse_dates=["date"])
df.head(2)

Unnamed: 0,date,datatype,station,attributes,value,city,state
0,2025-03-01,TMAX,GHCND:USW00094728,",,W,2400",64.0,New York,New York
1,2025-03-01,TMIN,GHCND:USW00094728,",,W,2400",27.0,New York,New York


In [16]:
# Make sure "date" holds datetimes before taking per-city maxima.
df["date"] = pd.to_datetime(df["date"])

# 2. Newest stored date for every city, as a {city: Timestamp} mapping.
latest_by_city = df.groupby("city")["date"].max()
last_dates = latest_by_city.to_dict()
print(last_dates)



{'Chicago': Timestamp('2025-07-25 00:00:00'), 'Houston': Timestamp('2025-07-27 00:00:00'), 'New York': Timestamp('2025-07-25 00:00:00'), 'Phoenix': Timestamp('2025-07-27 00:00:00'), 'Seattle': Timestamp('2025-07-27 00:00:00')}


In [None]:
# The original statement stopped after `in`, a SyntaxError that aborts Run All.
# Completed as a loop over the per-city mapping built in the previous cell,
# printing each city with its newest stored date.
for key, value in last_dates.items():
    print(key, value)

In [9]:

import yaml

# Parse the project configuration file and keep its "cities" entry.
with open("../config/config.yaml") as config_file:
    data = yaml.safe_load(config_file)

cities_list = data["cities"]

# print(cities_list)

In [12]:
# 3. Work out, for each configured city, the first date that still needs fetching.
#
# The fallback start is used as-is. Previously one day was added to the fallback
# too, so a city with no stored rows would have started on 2023-01-02 and
# skipped the intended first day.
default_start = pd.Timestamp("2023-01-01")

for city_info in cities_list:  # entries loaded from the YAML config
    city = city_info["city"]

    last_saved = last_dates.get(city)
    if last_saved is None:
        # Nothing stored for this city yet: begin at the default start date.
        start_date = default_start
    else:
        # Resume on the day after the newest row already on disk.
        start_date = last_saved + timedelta(days=1)
    print(city, start_date)


New York 2025-07-26 00:00:00
Chicago 2025-07-26 00:00:00
Houston 2025-07-28 00:00:00
Phoenix 2025-07-28 00:00:00
Seattle 2025-07-28 00:00:00
