In [101]:
import pandas as pd

cities_df = pd.read_csv('data/uscities.csv')  # from simplemaps
race_cities_df= pd.read_csv('data/top_training_cities_80pct.csv')


In [102]:
cities_df_p = cities_df[['city','state_id','lat','lng']]
cities_df_p['city'] = cities_df_p['city'].apply(str.lower)
cities_df_p['state'] = cities_df_p['state_id'].apply(str.lower)
cities_df_p.head(5)


Unnamed: 0,city,state_id,lat,lng,state
0,new york,NY,40.6943,-73.9249,ny
1,los angeles,CA,34.1141,-118.4068,ca
2,chicago,IL,41.8375,-87.6866,il
3,miami,FL,25.784,-80.2101,fl
4,houston,TX,29.786,-95.3885,tx


In [103]:
merged = race_cities_df.merge(cities_df_p[['city', 'state', 'lat', 'lng']], 
                          left_on=['city', 'state'], 
                          right_on=['city', 'state'])
merged.to_csv('data/mapped_cities.csv', index=False)
merged.head(5)

Unnamed: 0,city,state,runner_count,lat,lng
0,los angeles,ca,121020,34.1141,-118.4068
1,chicago,il,120839,41.8375,-87.6866
2,new york,ny,96250,40.6943,-73.9249
3,houston,tx,59373,29.786,-95.3885
4,san diego,ca,57024,32.8313,-117.1222


In [104]:
missing_cities = race_cities_df[~race_cities_df.city.isin(merged.city)]

mapped_amount = race_cities_df.runner_count.sum() / (missing_cities.runner_count.sum() + race_cities_df.runner_count.sum())
print(f"Mapped: {race_cities_df.runner_count.sum()}")
print(f"Unmapped: {missing_cities.runner_count.sum()}")
print(f"Coverage: {mapped_amount*100:>11.2f}%")

Mapped: 5934659
Unmapped: 691650
Coverage:       89.56%


In [105]:
import pandas as pd
import requests
from time import sleep
from tqdm.notebook import tqdm
from pathlib import Path

def get_historical_weather(lat, lon, start_date, end_date):
    """Fetch daily weather from Open-Meteo archive API."""
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": ["temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
        "temperature_unit": "fahrenheit",
        "timezone": "auto"
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Weather API Error: {response.status_code}")
        print(f"Message: {response.text}")
        if response.status_code == 429:
            print("sleeping for 60 seconds...")
            sleep(60)
        raise Exception("exception querying weather api")
    
def fetch_weather_for_cities(df, start_date, end_date, output_path="data/weather_data_v2.csv", save_every=10):
    """
    Fetch weather for all cities, with progress bar and checkpoint saving.
    
    Args:
        df: DataFrame with city, state, lat, lng columns
        start_date: Start date string "YYYY-MM-DD"
        end_date: End date string "YYYY-MM-DD"
        output_path: Where to save intermediate/final results
        save_every: Save checkpoint every N cities
    """
    output_path = Path(output_path)
    
    # Load existing results if resuming
    if output_path.exists():
        existing_df = pd.read_csv(output_path)
        completed = set(zip(existing_df['city'], existing_df['state']))
        weather_records = existing_df.to_dict('records')
        print(f"Resuming: found {len(completed)} cities already processed at {output_path}")
    else:
        completed = set()
        weather_records = []
    
    # Filter to only cities we haven't done yet
    remaining = df[~df.apply(lambda r: (r['city'], r['state']) in completed, axis=1)]
    
    if len(remaining) == 0:
        print("All cities already processed!")
        return pd.read_csv(output_path)
    
    print(f"Fetching weather for {len(remaining)} cities...")
    
    for i, (idx, row) in enumerate(tqdm(remaining.iterrows(), total=len(remaining), desc="Fetching weather")):
        try:
            print(f"Fetching weather for {row["city"]}...")
            data = get_historical_weather(row['lat'], row['lng'], start_date, end_date)
            
            if data and 'daily' in data:
                daily = data['daily']
                for j, date in enumerate(daily['time']):
                    weather_records.append({
                        'city': row['city'],
                        'state': row['state'],
                        'date': date,
                        'temp_max': daily['temperature_2m_max'][j],
                        'temp_min': daily['temperature_2m_min'][j],
                        'precip': daily['precipitation_sum'][j]
                    })
            
            # Save checkpoint periodically
            if (i + 1) % save_every == 0:
                print(f"..incremental save ({len(weather_records)} records) to {output_path}...")
                pd.DataFrame(weather_records).to_csv(output_path, index=False)
                
        except Exception as e:
            tqdm.write(f"Error fetching {row['city']}, {row['state']}: {e}")
            #raise e
        
        sleep(15)
    
    # Final save
    result_df = pd.DataFrame(weather_records)
    result_df.to_csv(output_path, index=False)
    print(f"Saved {len(result_df)} records to {output_path}")
    
    return result_df

In [106]:
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": merged.iloc[0].lat,
    "longitude": merged.iloc[0].lng,
    "start_date": "1999-01-01",
    "end_date": "2026-01-01",
    "daily": ["temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
    "temperature_unit": "fahrenheit",
    "timezone": "auto"
}
response = requests.get(url, params=params)
response

<Response [200]>

In [107]:
merged.runner_count.iloc[0:85].sum()/merged.runner_count.sum()

np.float64(0.37401059859706975)

In [108]:
# Load missing race cities prioritized by result count
missing_race_cities = pd.read_csv('data/missing_race_cities.csv')

# Filter out "anywhere" entries (virtual/international races without specific location)
missing_race_cities = missing_race_cities[missing_race_cities['city'] != 'anywhere']

print(f"Missing race cities (excluding 'anywhere'): {len(missing_race_cities)}")
print(f"Total results covered: {missing_race_cities['result_count'].sum():,}")
print(f"\nTop 20 missing cities:")
print(missing_race_cities.head(20).to_string(index=False))

Missing race cities (excluding 'anywhere'): 886
Total results covered: 432,152

Top 20 missing cities:
             city state  result_count
          orlando    fl         32256
           duluth    mn         17747
     saint george    ut         16667
         richmond    va         11455
        las vegas    nv         10857
carmel-by-the-sea    ca          9459
           oracle    az          9213
         carlsbad    ca          8415
             napa    ca          7888
 huntington beach    ca          7629
      miami beach    fl          6816
    newport beach    ca          6487
      new orleans    la          6432
        baltimore    md          5984
           eugene    or          5796
     indianapolis    in          5110
        vancouver    bc          5020
      long branch    nj          5003
    kiawah island    sc          4568
  manitou springs    co          4084


In [109]:
# Map missing cities to lat/lng coordinates
missing_mapped = missing_race_cities.merge(
    cities_df_p[['city', 'state', 'lat', 'lng']], 
    on=['city', 'state'],
    how='left'
)

# Check mapping success
has_coords = missing_mapped[['lat', 'lng']].notna().all(axis=1)
print(f"Successfully mapped: {has_coords.sum()} / {len(missing_mapped)} cities")
print(f"Missing coordinates: {(~has_coords).sum()} cities")

if (~has_coords).sum() > 0:
    print("\nCities without coordinates:")
    print(missing_mapped[~has_coords][['city', 'state', 'result_count']].head(20).to_string(index=False))

# Keep only cities with coordinates
missing_mapped_valid = missing_mapped[has_coords].copy()
missing_mapped_valid = missing_mapped_valid.sort_values('result_count', ascending=False)

print(f"\nReady to fetch weather for {len(missing_mapped_valid)} cities")
print(f"Potential coverage: {missing_mapped_valid['result_count'].sum():,} additional race results")

Successfully mapped: 720 / 887 cities
Missing coordinates: 167 cities

Cities without coordinates:
                     city state  result_count
             saint george    ut         16667
                vancouver    bc          5020
white sands missile range    nm          3392
              saint louis    mo          2954
                 falmouth    ma          1806
               bar harbor    me          1732
                 victoria    bc          1501
                  ventura    ca          1398
             east hampton    ny          1314
                   tanner    wa          1025
      carrabassett valley    me           684
                  bristol    nh           578
               waitsfield    vt           545
            saint charles    il           522
               barnstable    ma           506
                   sparks    md           496
             saint joseph    mn           482
                     todd    nc           393
             tomkins cove  

In [110]:
def fetch_weather_incremental(cities_df, start_date, end_date, output_path="data/weather_data_v2.csv", 
                              batch_size=10, delay_between=15):
    """
    Fetch weather incrementally with better progress tracking and API limit handling.
    
    Args:
        cities_df: DataFrame with city, state, lat, lng, result_count columns
        start_date: Start date "YYYY-MM-DD"
        end_date: End date "YYYY-MM-DD"
        output_path: Output CSV path
        batch_size: Number of cities to fetch before reporting progress
        delay_between: Seconds to wait between API calls (default 15 for free tier)
    """
    output_path = Path(output_path)
    
    # Load existing data
    if output_path.exists():
        existing_df = pd.read_csv(output_path)
        completed = set(zip(existing_df['city'], existing_df['state']))
        weather_records = existing_df.to_dict('records')
        print(f"Found {len(completed)} cities already in {output_path}")
    else:
        completed = set()
        weather_records = []
        print(f"Starting fresh - no existing data at {output_path}")
    
    # Filter to cities we haven't done
    remaining = cities_df[~cities_df.apply(lambda r: (r['city'], r['state']) in completed, axis=1)].copy()
    
    if len(remaining) == 0:
        print("All cities already processed!")
        return pd.read_csv(output_path)
    
    print(f"\nRemaining cities to fetch: {len(remaining)}")
    print(f"Estimated time: ~{len(remaining) * delay_between / 60:.1f} minutes")
    print(f"Potential new results covered: {remaining['result_count'].sum():,}")
    
    # Show progress tracking
    total_new_records_target = len(remaining) * ((pd.to_datetime(end_date) - pd.to_datetime(start_date)).days + 1)
    print(f"Expected weather records: ~{total_new_records_target:,}")
    
    successful = 0
    failed = 0
    
    for i, (idx, row) in enumerate(remaining.iterrows(), 1):
        try:
            print(f"[{i}/{len(remaining)}] {row['city']}, {row['state']} ({row['result_count']:,} results)...", 
                  end=' ', flush=True)
            
            data = get_historical_weather(row['lat'], row['lng'], start_date, end_date)
            
            if data and 'daily' in data:
                daily = data['daily']
                records_added = 0
                for j, date in enumerate(daily['time']):
                    weather_records.append({
                        'city': row['city'],
                        'state': row['state'],
                        'date': date,
                        'temp_max': daily['temperature_2m_max'][j],
                        'temp_min': daily['temperature_2m_min'][j],
                        'precip': daily['precipitation_sum'][j]
                    })
                    records_added += 1
                
                print(f"‚úì {records_added} records")
                successful += 1
                
                # Save progress every batch_size cities
                if i % batch_size == 0:
                    pd.DataFrame(weather_records).to_csv(output_path, index=False)
                    current_coverage = (85 + successful) / 977 * 100  # 977 total race cities
                    print(f"\n  ‚Üí Checkpoint: {successful} cities fetched, {len(weather_records):,} total records")
                    print(f"  ‚Üí Estimated coverage: ~{current_coverage:.1f}%\n")
            else:
                print("‚úó No data returned")
                failed += 1
                
        except Exception as e:
            print(f"‚úó Error: {e}")
            failed += 1
            
            # If we hit rate limits, save progress and stop
            if "429" in str(e) or "limit" in str(e).lower():
                print(f"\n‚ö† API rate limit reached after {successful} successful fetches")
                print(f"Saving progress and stopping...")
                break
        
        # Wait between requests to respect API limits
        if i < len(remaining):  # Don't wait after last city
            sleep(delay_between)
    
    # Final save
    result_df = pd.DataFrame(weather_records)
    result_df.to_csv(output_path, index=False)
    
    print(f"\n{'='*80}")
    print(f"Completed: {successful} successful, {failed} failed")
    print(f"Total weather records: {len(result_df):,}")
    print(f"Saved to: {output_path}")
    print(f"{'='*80}")
    
    return result_df

In [116]:
# Fetch weather for top N missing cities
# Start with top 25 to get from 64.7% to ~79.8% coverage

# Select top cities to fetch
TOP_N = 25  # Adjust this number based on API limits and time available
cities_to_fetch = missing_mapped_valid.head(TOP_N)

print(f"{'='*80}")
print(f"Fetching weather for top {TOP_N} missing cities")
print(f"{'='*80}")
print(f"Expected coverage increase: ~15% (64.7% ‚Üí ~79.8%)")
print(f"Expected time: ~{TOP_N * 15 / 60:.1f} minutes at 15 seconds per city")
print(f"Results covered: {cities_to_fetch['result_count'].sum():,}")
print(f"\nCities to fetch:")
print(cities_to_fetch[['city', 'state', 'result_count']].head(10).to_string(index=False))
if TOP_N > 10:
    print(f"... and {TOP_N - 10} more")

# Note: Uncomment the line below to actually start fetching
weather_df = fetch_weather_incremental(cities_to_fetch, "1999-01-01", "2026-01-01")

Fetching weather for top 25 missing cities
Expected coverage increase: ~15% (64.7% ‚Üí ~79.8%)
Expected time: ~6.2 minutes at 15 seconds per city
Results covered: 191,661

Cities to fetch:
             city state  result_count
          orlando    fl         32256
           duluth    mn         17747
         richmond    va         11455
        las vegas    nv         10857
carmel-by-the-sea    ca          9459
           oracle    az          9213
         carlsbad    ca          8415
             napa    ca          7888
 huntington beach    ca          7629
      miami beach    fl          6816
... and 15 more
Found 85 cities already in data/weather_data_v2.csv

Remaining cities to fetch: 25
Estimated time: ~6.2 minutes
Potential new results covered: 191,661
Expected weather records: ~246,575
[1/25] orlando, fl (32,256 results)... ‚úì 9863 records
[2/25] duluth, mn (17,747 results)... ‚úì 9863 records
[3/25] richmond, va (11,455 results)... ‚úì 9863 records
[4/25] las vegas, nv (1

## Weather Data Expansion Strategy

### Current Status
- **85 cities** with weather data ‚Üí **64.7% coverage**
- **912 missing cities** (excluding "anywhere" virtual races)

### Incremental Coverage Plan

| Cities Added | Coverage | Improvement | Est. Time (15s/city) |
|--------------|----------|-------------|----------------------|
| Top 10       | 74.7%    | +9.9%       | ~2.5 minutes         |
| Top 25       | 79.8%    | +15.1%      | ~6 minutes           |
| Top 50       | 84.4%    | +19.7%      | ~12 minutes          |
| Top 100      | 88.5%    | +23.8%      | ~25 minutes          |
| Top 200      | 91.9%    | +27.2%      | ~50 minutes          |

### API Options

**Open-Meteo Free Tier** (currently using)
- ‚úÖ Free, no API key needed
- ‚úÖ Historical data from 1940-present
- ‚ùå Rate limit: ~10,000 requests/day
- üìä Good for: Adding 10-50 cities per day

**Alternative APIs** (if needed)
1. **Visual Crossing** - Free tier: 1000 records/day
2. **WeatherAPI.com** - Free tier: 1M calls/month  
3. **NOAA API** - Free, unlimited (US only, requires station mapping)

### Recommended Approach
1. Start with top 25 cities (~15% improvement)
2. Run daily batches to avoid rate limits
3. Monitor API responses and save progress checkpoints
4. Consider paid tier ($40-50 one-time) if faster completion needed

In [118]:
# After fetching more weather data, re-run the enrichment script to update coverage
# This will merge the new weather data with the race results

import subprocess

def update_race_weather_enrichment():
    """
    Re-run the enrichment script to update race day weather with newly added cities.
    """
    print("Running enrichment script to update race day weather...")
    print("This will update: data/featurized_race_data_v2_with_raceday_weather.csv")
    print("="*80)
    
    result = subprocess.run(
        ['python', 'enrich_race_day_weather.py'],
        capture_output=True,
        text=True
    )
    
    print(result.stdout)
    if result.stderr:
        print("Errors:", result.stderr)
    
    print("="*80)
    print("Enrichment complete! Check the coverage statistics above.")
    
# Uncomment to re-run enrichment after adding weather data:
update_race_weather_enrichment()

Running enrichment script to update race day weather...
This will update: data/featurized_race_data_v2_with_raceday_weather.csv

Race Day Weather Enrichment

Loading data files...
1. Loading featurized_race_data_v2.csv... ‚úì 1,688,270 records
2. Loading race_locations_normalized.csv... ‚úì 1,857 races
3. Loading weather_data_v2.csv... ‚úì 1,058,210 daily records

Step 1: Joining with race locations...
Records with race location: 1,595,734 (94.5%)
Records without race location: 92,536 (5.5%)

Unique races without location: 67
First 10 unmapped races:
  - 26.2_with_donna:_the_national_marathon_to_finish_breast_cancer
  - alaska_series_-_day_1
  - alaska_series_-_day_2
  - alaska_series_-_day_3
  - alaska_series_-_day_4
  - appalachian_series_-_al
  - appalachian_series_-_ga
  - appalachian_series_-_nc
  - appalachian_series_-_sc
  - appalachian_series_-_tn

Step 2: Joining with race day weather...
Records with race day weather: 1,275,345 (75.5%)
Records without race day weather: 412,925