In [38]:
import pandas as pd

cities_df = pd.read_csv('data/uscities.csv')  # from simplemaps
race_cities_df= pd.read_csv('data/top_training_cities_80pct.csv')


In [39]:
cities_df_p = cities_df[['city','state_id','lat','lng']]
cities_df_p['city'] = cities_df_p['city'].apply(str.lower)
cities_df_p['state'] = cities_df_p['state_id'].apply(str.lower)
cities_df_p.head(5)


Unnamed: 0,city,state_id,lat,lng,state
0,new york,NY,40.6943,-73.9249,ny
1,los angeles,CA,34.1141,-118.4068,ca
2,chicago,IL,41.8375,-87.6866,il
3,miami,FL,25.784,-80.2101,fl
4,houston,TX,29.786,-95.3885,tx


In [40]:
merged = race_cities_df.merge(cities_df_p[['city', 'state', 'lat', 'lng']], 
                          left_on=['city', 'state'], 
                          right_on=['city', 'state'])
merged.to_csv('data/mapped_cities.csv', index=False)
merged.head(5)

Unnamed: 0,city,state,runner_count,lat,lng
0,los angeles,ca,121020,34.1141,-118.4068
1,chicago,il,120839,41.8375,-87.6866
2,new york,ny,96250,40.6943,-73.9249
3,houston,tx,59373,29.786,-95.3885
4,san diego,ca,57024,32.8313,-117.1222


In [41]:
missing_cities = race_cities_df[~race_cities_df.city.isin(merged.city)]

mapped_amount = race_cities_df.runner_count.sum() / (missing_cities.runner_count.sum() + race_cities_df.runner_count.sum())
print(f"Mapped: {race_cities_df.runner_count.sum()}")
print(f"Unmapped: {missing_cities.runner_count.sum()}")
print(f"Coverage: {mapped_amount*100:>11.2f}%")

Mapped: 5934659
Unmapped: 691650
Coverage:       89.56%


In [89]:
import pandas as pd
import requests
from time import sleep
from tqdm.notebook import tqdm
from pathlib import Path

def get_historical_weather(lat, lon, start_date, end_date):
    """Fetch daily weather from Open-Meteo archive API."""
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": ["temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
        "temperature_unit": "fahrenheit",
        "timezone": "auto"
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Weather API Error: {response.status_code}")
        print(f"Message: {response.text}")
        if response.status_code == 429:
            print("sleeping for 60 seconds...")
            sleep(60)
        raise Exception("exception querying weather api")
    
def fetch_weather_for_cities(df, start_date, end_date, output_path="data/weather_data_v2.csv", save_every=10):
    """
    Fetch weather for all cities, with progress bar and checkpoint saving.
    
    Args:
        df: DataFrame with city, state, lat, lng columns
        start_date: Start date string "YYYY-MM-DD"
        end_date: End date string "YYYY-MM-DD"
        output_path: Where to save intermediate/final results
        save_every: Save checkpoint every N cities
    """
    output_path = Path(output_path)
    
    # Load existing results if resuming
    if output_path.exists():
        existing_df = pd.read_csv(output_path)
        completed = set(zip(existing_df['city'], existing_df['state']))
        weather_records = existing_df.to_dict('records')
        print(f"Resuming: found {len(completed)} cities already processed at {output_path}")
    else:
        completed = set()
        weather_records = []
    
    # Filter to only cities we haven't done yet
    remaining = df[~df.apply(lambda r: (r['city'], r['state']) in completed, axis=1)]
    
    if len(remaining) == 0:
        print("All cities already processed!")
        return pd.read_csv(output_path)
    
    print(f"Fetching weather for {len(remaining)} cities...")
    
    for i, (idx, row) in enumerate(tqdm(remaining.iterrows(), total=len(remaining), desc="Fetching weather")):
        try:
            print(f"Fetching weather for {row["city"]}...")
            data = get_historical_weather(row['lat'], row['lng'], start_date, end_date)
            
            if data and 'daily' in data:
                daily = data['daily']
                for j, date in enumerate(daily['time']):
                    weather_records.append({
                        'city': row['city'],
                        'state': row['state'],
                        'date': date,
                        'temp_max': daily['temperature_2m_max'][j],
                        'temp_min': daily['temperature_2m_min'][j],
                        'precip': daily['precipitation_sum'][j]
                    })
            
            # Save checkpoint periodically
            if (i + 1) % save_every == 0:
                print(f"..incremental save ({len(weather_records)} records) to {output_path}...")
                pd.DataFrame(weather_records).to_csv(output_path, index=False)
                
        except Exception as e:
            tqdm.write(f"Error fetching {row['city']}, {row['state']}: {e}")
            #raise e
        
        sleep(15)
    
    # Final save
    result_df = pd.DataFrame(weather_records)
    result_df.to_csv(output_path, index=False)
    print(f"Saved {len(result_df)} records to {output_path}")
    
    return result_df

In [None]:
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": merged.iloc[0].lat,
    "longitude": merged.iloc[0].lng,
    "start_date": "1999-01-01",
    "end_date": "2026-01-01",
    "daily": ["temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
    "temperature_unit": "fahrenheit",
    "timezone": "auto"
}
response = requests.get(url, params=params)
response

In [99]:
merged.runner_count.iloc[0:85].sum()/merged.runner_count.sum()

np.float64(0.37401059859706975)

In [98]:
merged = merged.sort_values(by="runner_count", ascending=False) # start with the most common cities
# Fetch weather data (can be interrupted and resumed)
weather_df = fetch_weather_for_cities(
    merged, 
    "1999-01-01", 
    "2026-01-01",
    output_path="data/weather_data_v2.csv",
    save_every=1
)
weather_df.head()

Resuming: found 85 cities already processed at data/weather_data_v2.csv
Fetching weather for 2532 cities...


Fetching weather:   0%|          | 0/2532 [00:00<?, ?it/s]

Fetching weather for orlando...
Weather API Error: 429
Message: {"error":true,"reason":"Daily API request limit exceeded. Please try again tomorrow."}
sleeping for 60 seconds...
Error fetching orlando, fl: exception querying weather api
Fetching weather for baltimore...
Weather API Error: 429
Message: {"error":true,"reason":"Daily API request limit exceeded. Please try again tomorrow."}
sleeping for 60 seconds...
Error fetching baltimore, md: exception querying weather api
Fetching weather for miami...
Weather API Error: 429
Message: {"reason":"Daily API request limit exceeded. Please try again tomorrow.","error":true}
sleeping for 60 seconds...
Error fetching miami, fl: exception querying weather api
Fetching weather for richmond...
Weather API Error: 429
Message: {"error":true,"reason":"Daily API request limit exceeded. Please try again tomorrow."}
sleeping for 60 seconds...
Error fetching richmond, va: exception querying weather api
Fetching weather for san antonio...
Weather API Er

KeyboardInterrupt: 