# Resarch Notebook

In [36]:
import pandas as pd
FEATURE_DATA_FILE = "../data/featurized_race_data_v2.csv"
df = pd.read_csv(FEATURE_DATA_FILE)

# Remove some rows that have parsing errors
df = df[df["sex"].isin(['M','F'])]

# Enrich with some additional metadata
Add some additional metadata, like the BQ times and the AG relative time

In [45]:
bq_data = pd.read_csv("../data/boston_marathon_qualifying.csv")
import numpy as np
bq_data['min_age'] = bq_data['min_age'].apply(float)
# Keep only rows with valid age values
df["temp_join_idx"] = df.index
df_with_age = df[df['age'].notna()].copy()

# Then do the merge
df_with_age_sorted = df_with_age.sort_values('age')
df_with_age_sorted = pd.merge_asof(
    df_with_age_sorted,
    bq_data[['min_age', 'men_standard_minutes', 'women_standard_minutes', 'nonbinary_standard_minutes']],
    left_on='age',
    right_on='min_age',
    direction='backward'
)

# Add the BQ standard based on sex
conditions = [
    df_with_age_sorted['sex'].str.lower().isin(['m', 'male', 'man']),
    df_with_age_sorted['sex'].str.lower().isin(['f', 'female', 'woman'])
]
choices = [
    df_with_age_sorted['men_standard_minutes'],
    df_with_age_sorted['women_standard_minutes']
]
df_with_age_sorted['bq_standard_minutes'] = np.select(conditions, choices, default=df_with_age_sorted['nonbinary_standard_minutes'])

# If you need the full dataset back with nulls for missing ages:
df_with_age_sorted = df_with_age_sorted.drop(columns=['min_age', 'men_standard_minutes', 'women_standard_minutes', 'nonbinary_standard_minutes'])
df_enriched = df.merge(df_with_age_sorted[['temp_join_idx', 'bq_standard_minutes']], on='temp_join_idx', how='left')
df_enriched = df_enriched.drop(columns="temp_join_idx")

df_enriched["bq_adjusted_time"] = df_enriched.apply(
    lambda row:None if pd.isnull(row["bq_standard_minutes"]) else row["time"] - row["bq_standard_minutes"],
    axis=1)
df_enriched = df_enriched[df_enriched["bq_adjusted_time"].notna()]
display(df_enriched)

Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip,bq_standard_minutes,bq_adjusted_time
0,41.0,M,203.450000,"""last_chance_for_boston""_marathon",2003-02-02,ann arbor,mi,-4.3,61.5,21.95,...,8,-4.3,42.7,12.00,25.25,13.3,9,3,185.0,18.450000
1,48.0,M,203.366667,"""last_chance_for_boston""_marathon",2003-02-02,champaign,il,-1.0,63.9,25.75,...,10,-1.0,53.0,16.60,31.65,20.7,9,5,195.0,8.366667
2,26.0,F,220.983333,"""last_chance_for_boston""_marathon",2003-02-02,chicago,il,-3.5,62.4,26.90,...,12,-3.5,50.8,13.70,28.55,15.8,9,6,205.0,15.983333
3,43.0,M,194.733333,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.10,...,11,-10.2,52.6,19.05,30.90,15.9,10,5,185.0,9.733333
4,31.0,M,213.250000,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.10,...,11,-10.2,52.6,19.05,30.90,15.9,10,5,175.0,38.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687457,55.0,M,306.166667,zydeco_marathon,2020-03-08,fort worth,tx,26.1,78.3,40.60,...,9,26.1,78.3,42.00,60.65,112.8,12,3,210.0,96.166667
1687458,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.70,67.70,110.5,15,4,175.0,84.833333
1687459,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.70,67.70,110.5,15,4,175.0,84.833333
1687460,55.0,M,275.700000,zydeco_marathon,2020-03-08,mobile,al,29.6,75.6,48.20,...,13,35.7,73.5,49.45,65.40,127.4,15,3,210.0,65.700000


## Data Profiling

In [12]:
# Basic DataFrame Info
print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\n" + "="*80)
df.info()

Shape: 1,688,270 rows × 21 columns

Memory Usage: 343.69 MB

<class 'pandas.DataFrame'>
RangeIndex: 1688270 entries, 0 to 1688269
Data columns (total 21 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   age                                  1447190 non-null  float64
 1   sex                                  1688255 non-null  str    
 2   time                                 1561622 non-null  float64
 3   race                                 1688270 non-null  str    
 4   date                                 1688270 non-null  str    
 5   city                                 1688270 non-null  str    
 6   state                                1688270 non-null  str    
 7   full_temp_min                        1688270 non-null  float64
 8   full_temp_max                        1688270 non-null  float64
 9   full_temp_median_min                 1688270 non-null  float64
 10  full_temp_median

In [13]:
# Categorical Columns Distribution
print("Categorical Columns Summary:")
print("="*80)

categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    print(f"Found {len(categorical_cols)} categorical columns\n")
    for col in categorical_cols:
        unique_count = df[col].nunique()
        print(f"\n{col}: {unique_count} unique values")
        if unique_count <= 20:  # Only show value counts for columns with <= 20 unique values
            print(df[col].value_counts().head(10))
        else:
            print(f"  Top 5 most common values:")
            print(df[col].value_counts().head(5))
else:
    print("No categorical columns found")

Categorical Columns Summary:
Found 5 categorical columns


sex: 356 unique values
  Top 5 most common values:
sex
M    947632
F    739890
X       189
m        54
f        18
Name: count, dtype: int64

race: 1278 unique values
  Top 5 most common values:
race
city_of_los_angeles_marathon_(l.a._marathon)    133561
chicago_marathon                                128585
marine_corps_marathon                            89408
new_york_city_marathon                           74800
boston_marathon                                  74144
Name: count, dtype: int64

date: 2625 unique values
  Top 5 most common values:
date
2016-10-09    17126
2012-10-07    16993
2011-10-09    16497
2023-11-05    16342
2014-10-12    14532
Name: count, dtype: int64

city: 85 unique values
  Top 5 most common values:
city
los angeles    121020
chicago        120839
new york        96250
houston         59373
san diego       57024
Name: count, dtype: int64

state: 33 unique values
  Top 5 most common values:
state
ca 

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = df.select_dtypes(include=['object']).columns


In [14]:
# Numeric Columns Distribution
print("Numeric Columns Summary:")
print("="*80)

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
if len(numeric_cols) > 0:
    print(f"Found {len(numeric_cols)} numeric columns\n")
    for col in numeric_cols:
        print(f"\n{col}:")
        print(f"  Min: {df[col].min()}")
        print(f"  Max: {df[col].max()}")
        print(f"  Mean: {df[col].mean():.2f}")
        print(f"  Median: {df[col].median():.2f}")
        print(f"  Std Dev: {df[col].std():.2f}")
else:
    print("No numeric columns found")

Numeric Columns Summary:
Found 16 numeric columns


age:
  Min: 0.0
  Max: 3225.0
  Mean: 36.57
  Median: 35.00
  Std Dev: 11.29

time:
  Min: 122.71666666666668
  Max: 1299.75
  Mean: 284.07
  Median: 273.78
  Std Dev: 67.61

full_temp_min:
  Min: -36.9
  Max: 75.6
  Mean: 31.87
  Median: 33.80
  Std Dev: 16.97

full_temp_max:
  Min: 35.5
  Max: 118.5
  Mean: 87.40
  Median: 88.30
  Std Dev: 9.60

full_temp_median_min:
  Min: -1.5
  Max: 84.85
  Mean: 51.51
  Median: 52.80
  Std Dev: 13.02

full_temp_median_max:
  Min: 15.85
  Max: 108.0
  Mean: 68.76
  Median: 70.60
  Std Dev: 12.71

full_overall_precip:
  Min: 0.0
  Max: 1313.9
  Mean: 217.90
  Median: 212.30
  Std Dev: 138.44

full_overall_days_of_precip:
  Min: 0
  Max: 90
  Mean: 31.15
  Median: 33.00
  Std Dev: 14.07

full_overall_weekend_days_of_precip:
  Min: 0
  Max: 26
  Mean: 8.59
  Median: 9.00
  Std Dev: 4.19

peak_temp_min:
  Min: -36.9
  Max: 81.1
  Mean: 37.27
  Median: 37.10
  Std Dev: 13.54

peak_temp_max:
  Min: 25.

In [15]:
# Data Types and Unique Values
print("Column Data Types and Unique Values:")
print("="*80)

column_info = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes.values,
    'Unique_Count': [df[col].nunique() for col in df.columns],
    'Sample_Values': [str(df[col].dropna().unique()[:3].tolist()) if df[col].nunique() <= 50 else 'Too many to display' for col in df.columns]
})

display(column_info)

Column Data Types and Unique Values:


Unnamed: 0,Column,Data_Type,Unique_Count,Sample_Values
0,age,float64,102,Too many to display
1,sex,str,356,Too many to display
2,time,float64,26974,Too many to display
3,race,str,1278,Too many to display
4,date,str,2625,Too many to display
5,city,str,85,Too many to display
6,state,str,33,"['mi', 'il', 'oh']"
7,full_temp_min,float64,995,Too many to display
8,full_temp_max,float64,768,Too many to display
9,full_temp_median_min,float64,2064,Too many to display


In [16]:
# Missing Values Analysis
print("Missing Values Analysis:")
print("="*80)

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum().values,
    'Missing_Percent': (df.isnull().sum().values / len(df) * 100).round(2)
})

# Only show columns with missing values
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_data) > 0:
    display(missing_data)
else:
    print("No missing values found in any column!")

Missing Values Analysis:


Unnamed: 0,Column,Missing_Count,Missing_Percent
0,age,241080,14.28
2,time,126648,7.5
1,sex,15,0.0


In [17]:
# Statistical Summary
print("Statistical Summary (all columns):")
print("="*80)
display(df.describe(include='all').T)  # Transpose for better readability

Statistical Summary (all columns):


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,1447190.0,,,,36.574419,11.288382,0.0,28.0,35.0,44.0,3225.0
sex,1688255.0,356.0,M,947632.0,,,,,,,
time,1561622.0,,,,284.06795,67.605009,122.716667,235.5,273.783333,321.466667,1299.75
race,1688270.0,1278.0,city_of_los_angeles_marathon_(l.a._marathon),133561.0,,,,,,,
date,1688270.0,2625.0,2016-10-09,17126.0,,,,,,,
city,1688270.0,85.0,los angeles,121020.0,,,,,,,
state,1688270.0,33.0,ca,362883.0,,,,,,,
full_temp_min,1688270.0,,,,31.868774,16.974256,-36.9,24.1,33.8,42.1,75.6
full_temp_max,1688270.0,,,,87.395629,9.603565,35.5,81.5,88.3,94.1,118.5
full_temp_median_min,1688270.0,,,,51.506585,13.018333,-1.5,42.2,52.8,61.85,84.85


In [18]:
# Data Preview
print("First 5 rows:")
print("="*80)
display(df.head())

print("\n\nLast 5 rows:")
print("="*80)
display(df.tail())

print("\n\nRandom sample of 5 rows:")
print("="*80)
display(df.sample(5))

First 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_precip,full_overall_days_of_precip,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip
0,41.0,M,203.45,"""last_chance_for_boston""_marathon",2003-02-02,ann arbor,mi,-4.3,61.5,21.95,...,104.1,32,8,-4.3,42.7,12.0,25.25,13.3,9,3
1,48.0,M,203.366667,"""last_chance_for_boston""_marathon",2003-02-02,champaign,il,-1.0,63.9,25.75,...,117.7,29,10,-1.0,53.0,16.6,31.65,20.7,9,5
2,26.0,F,220.983333,"""last_chance_for_boston""_marathon",2003-02-02,chicago,il,-3.5,62.4,26.9,...,94.2,30,12,-3.5,50.8,13.7,28.55,15.8,9,6
3,43.0,M,194.733333,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.1,...,210.7,37,11,-10.2,52.6,19.05,30.9,15.9,10,5
4,31.0,M,213.25,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.1,...,210.7,37,11,-10.2,52.6,19.05,30.9,15.9,10,5




Last 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_precip,full_overall_days_of_precip,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip
1688265,64.0,M,,zydeco_marathon,2023-03-12,phoenix,az,32.0,76.8,42.55,...,161.7,22,4,33.3,76.8,43.7,64.35,56.0,8,2
1688266,64.0,M,,zydeco_marathon,2023-03-12,phoenix,az,32.0,76.8,42.55,...,161.7,22,4,33.3,76.8,43.7,64.35,56.0,8,2
1688267,27.0,F,,zydeco_marathon,2023-03-12,shreveport,la,11.1,81.4,43.35,...,398.7,45,10,30.4,81.4,51.8,73.25,90.5,18,4
1688268,43.0,M,,zydeco_marathon,2023-03-12,shreveport,la,11.1,81.4,43.35,...,398.7,45,10,30.4,81.4,51.8,73.25,90.5,18,4
1688269,59.0,F,,zydeco_marathon,2023-03-12,tulsa,ok,2.1,74.2,34.3,...,205.8,31,5,22.8,74.2,41.25,61.35,94.5,14,3




Random sample of 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_precip,full_overall_days_of_precip,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip
785638,34.0,F,291.433333,houston_marathon,2000-01-16,houston,tx,31.9,82.4,51.1,...,158.6,22,6,31.9,77.8,47.45,65.55,60.0,9,2
1259665,24.0,M,263.483333,philadelphia_marathon,2019-11-24,pittsburgh,pa,11.1,90.1,49.45,...,258.9,34,10,11.1,70.2,32.05,48.3,95.0,12,3
909116,,M,223.916667,madison_marathon_(mad_city_marathon),2003-05-25,champaign,il,2.5,85.4,43.35,...,209.2,41,11,37.4,85.4,51.9,69.4,125.3,19,5
704261,,M,217.233333,green_bay_marathon,2003-05-18,milwaukee,wi,-3.7,71.8,32.9,...,202.4,44,10,28.7,68.7,41.25,52.9,113.3,14,5
465001,28.0,M,234.1,city_of_los_angeles_marathon_(l.a._marathon),2015-03-15,oakland,ca,32.7,79.1,45.6,...,122.8,12,3,41.2,79.1,47.05,65.7,2.2,2,1


## Location Performance Questions
1. Which major cities have the fastest runners (overall and median)?

In [17]:
# Question 1: Which major cities have the fastest runners (overall and median)?

# Filter to major cities (cities with at least 10,000 runners)
city_counts = df.groupby(['city', 'state']).size()
major_cities = city_counts[city_counts >= 1000].index

# Filter df to only major cities
df_major = df[df.apply(lambda row: (row['city'], row['state']) in major_cities, axis=1)]

print(f"Analyzing {len(major_cities)} major cities (10,000+ runners each)")
print("="*80)

# Convert time to seconds for easier analysis
def time_to_seconds(time_minutes:int):
    return time_minutes*60

# Apply conversion
df_major['time_seconds'] = df_major['time'].apply(time_to_seconds)

# Group by city and calculate statistics
city_performance = df_major.groupby(['city', 'state'])['time_seconds'].agg([
    ('count', 'count'),
    ('mean_seconds', 'mean'),
    ('median_seconds', 'median'),
    ('std_seconds', 'std')
]).reset_index()

# Convert back to time format for display
def seconds_to_time(seconds):
    """Convert seconds to HH:MM:SS format"""
    if pd.isna(seconds):
        return None
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

city_performance['mean_time'] = city_performance['mean_seconds'].apply(seconds_to_time)
city_performance['median_time'] = city_performance['median_seconds'].apply(seconds_to_time)

# Sort by fastest mean time
#city_performance_sorted_mean = city_performance.sort_values('mean_seconds').head(20)
#print("\n\nTop 20 Fastest Cities by MEAN finish time:")
#print("="*80)
#display(city_performance_sorted_mean[['city', 'state', 'count', 'mean_time', 'median_time']])

# Sort by fastest median time
city_performance_sorted_median = city_performance.sort_values('median_seconds').head(10)
print("\n\nTop 10 Fastest Cities by MEDIAN finish time:")
print("="*80)
display(city_performance_sorted_median[['city', 'state', 'count', 'mean_time', 'median_time']])

# Find slowest cities for comparison
city_performance_sorted_slowest = city_performance.sort_values('median_seconds', ascending=False).head(10)
print("\n\nSlowest 10 Major Cities by MEDIAN finish time (for comparison):")
print("="*80)
display(city_performance_sorted_slowest[['city', 'state', 'count', 'mean_time', 'median_time']])

# NEW ANALYSIS: Top 10% of runners by city
print("\n\n" + "="*80)
print("ELITE RUNNER ANALYSIS - Top 10% of Runners by City")
print("="*80)

# Calculate top 10% performance for each major city
elite_performance = []

for city, state in major_cities:
    city_df = df_major[(df_major['city'] == city) & (df_major['state'] == state)]
    city_df_valid = city_df[city_df['time_seconds'].notna()]
    
    if len(city_df_valid) > 0:
        # Get top 10% (fastest times = lowest seconds)
        top_10_pct_threshold = city_df_valid['time_seconds'].quantile(0.10)
        top_10_pct = city_df_valid[city_df_valid['time_seconds'] <= top_10_pct_threshold]
        
        elite_performance.append({
            'city': city,
            'state': state,
            'total_runners': len(city_df_valid),
            'elite_count': len(top_10_pct),
            'elite_mean_seconds': top_10_pct['time_seconds'].mean(),
            'elite_median_seconds': top_10_pct['time_seconds'].median(),
            'elite_max_seconds': top_10_pct['time_seconds'].max(),  # Slowest of the elite
            'elite_min_seconds': top_10_pct['time_seconds'].min()   # Fastest overall
        })

elite_df = pd.DataFrame(elite_performance)

# Convert times to readable format
elite_df['elite_mean_time'] = elite_df['elite_mean_seconds'].apply(seconds_to_time)
elite_df['elite_median_time'] = elite_df['elite_median_seconds'].apply(seconds_to_time)
elite_df['elite_max_time'] = elite_df['elite_max_seconds'].apply(seconds_to_time)
elite_df['elite_min_time'] = elite_df['elite_min_seconds'].apply(seconds_to_time)

# Sort by fastest elite mean time
elite_sorted = elite_df.sort_values('elite_mean_seconds')

print("\n\nTop 10 Cities with Fastest ELITE Runners (top 10% by median):")
print("="*80)
elite_sorted_median = elite_df.sort_values('elite_median_seconds')
display(elite_sorted_median.head(10)[['city', 'state', 'total_runners', 'elite_count', 
                                        'elite_mean_time', 'elite_median_time', 'elite_min_time']])

# Show cities with the absolute fastest individual runners
print("\n\nCities with the Absolute Fastest Individual Runners:")
print("="*80)
elite_fastest_individual = elite_df.sort_values('elite_min_seconds')
display(elite_fastest_individual.head(10)[['city', 'state', 'total_runners', 
                                             'elite_min_time', 'elite_mean_time', 'elite_median_time']])

Analyzing 85 major cities (10,000+ runners each)


Top 10 Fastest Cities by MEDIAN finish time:


Unnamed: 0,city,state,count,mean_time,median_time
14,cambridge,ma,10499,04:09:22,04:01:24
48,new york,ny,62877,04:19:21,04:10:54
11,boulder,co,11027,04:20:14,04:12:09
40,madison,wi,13504,04:19:57,04:13:11
44,minneapolis,mn,38260,04:20:21,04:13:43
12,brooklyn,ny,24883,04:23:10,04:14:11
10,boston,ma,20957,04:18:33,04:14:12
3,ann arbor,mi,10318,04:22:39,04:15:05
52,omaha,ne,11156,04:24:25,04:16:37
20,columbus,oh,21788,04:25:33,04:17:12




Slowest 10 Major Cities by MEDIAN finish time (for comparison):


Unnamed: 0,city,state,count,mean_time,median_time
31,honolulu,hi,40656,05:51:26,05:34:44
37,los angeles,ca,118834,05:29:30,05:24:29
62,riverside,ca,7418,05:21:46,05:12:12
75,south pasadena,ca,1817,05:14:50,05:08:14
69,santa clarita,ca,2136,05:13:33,05:04:00
36,long beach,ca,17201,05:10:43,05:01:15
53,pasadena,ca,9850,05:07:00,04:58:37
80,tustin,ca,2901,05:03:40,04:56:20
70,santa monica,ca,12649,04:58:42,04:48:51
26,fort worth,tx,10415,04:54:19,04:47:47




ELITE RUNNER ANALYSIS - Top 10% of Runners by City


Top 10 Cities with Fastest ELITE Runners (top 10% by median):


Unnamed: 0,city,state,total_runners,elite_count,elite_mean_time,elite_median_time,elite_min_time
11,boulder,co,11027,1103,02:47:40,02:51:56,02:11:53
14,cambridge,ma,10499,1050,02:53:19,02:56:24,02:10:17
10,boston,ma,20957,2098,03:00:19,03:02:27,02:08:26
48,new york,ny,62877,6290,03:01:24,03:03:53,02:11:05
0,albuquerque,nm,11787,1180,02:59:42,03:04:10,02:11:41
40,madison,wi,13504,1352,03:02:10,03:04:39,02:11:39
12,brooklyn,ny,24883,2489,03:02:27,03:04:41,02:14:58
84,westminster,co,2098,210,03:03:57,03:06:02,02:16:31
59,provo,ut,7721,773,03:02:35,03:06:24,02:12:56
44,minneapolis,mn,38260,3829,03:03:50,03:06:31,02:12:24




Cities with the Absolute Fastest Individual Runners:


Unnamed: 0,city,state,total_runners,elite_min_time,elite_mean_time,elite_median_time
77,tampa,fl,13827,02:02:43,03:17:57,03:21:57
10,boston,ma,20957,02:08:26,03:00:19,03:02:27
66,san diego,ca,52457,02:08:37,03:14:37,03:17:51
16,charlotte,nc,22740,02:09:06,03:07:32,03:10:16
38,louisville,ky,16187,02:09:19,03:12:19,03:15:56
57,portland,or,43917,02:09:20,03:07:01,03:10:00
78,tucson,az,13873,02:09:47,03:09:04,03:13:07
34,irvine,ca,12036,02:09:56,03:17:40,03:20:31
37,los angeles,ca,118834,02:09:58,03:29:06,03:33:06
14,cambridge,ma,10499,02:10:17,02:53:19,02:56:24


2. What is the relation of weather during the training period (full and peak) to the performance during the race?  How do things like heat, cold, and precipitation effect performance?

In [4]:
# Question 2: Relation of weather during training period to race performance

# First, check what weather columns we have
weather_cols = [col for col in df.columns if 'weather' in col.lower() or 'temp' in col.lower() or 'precip' in col.lower()]
print("Available weather-related columns:")
print(weather_cols)
print("="*80)

# If we have weather columns, analyze their relationship with performance
if len(weather_cols) > 0:
    # Create a copy with time in seconds
    df_weather = df.copy()
    df_weather['time_seconds'] = df_weather['time'].apply(time_to_seconds)
    
    # Remove invalid times
    df_weather = df_weather[df_weather['time_seconds'].notna()]
    
    # For each weather column, calculate correlation with performance
    print("\n\nCorrelation between weather features and finish time:")
    print("(Positive correlation = worse performance with higher values)")
    print("="*80)
    
    weather_correlations = {}
    for col in weather_cols:
        if df_weather[col].dtype in ['float64', 'int64']:
            corr = df_weather[[col, 'time_seconds']].corr().iloc[0, 1]
            weather_correlations[col] = corr
            print(f"{col}: {corr:.4f}")
    
    # Analyze specific weather patterns
    print("\n\nAnalyzing temperature effects on performance:")
    print("="*80)
    
    # Check for temperature columns (full and peak training periods)
    temp_cols = [col for col in weather_cols if 'temp' in col.lower()]
    
    for temp_col in temp_cols:
        if df_weather[temp_col].dtype in ['float64', 'int64']:
            # Bin temperatures and calculate average finish times
            df_weather[f'{temp_col}_bin'] = pd.cut(df_weather[temp_col], bins=10)
            temp_performance = df_weather.groupby(f'{temp_col}_bin')['time_seconds'].agg(['mean', 'median', 'count'])
            temp_performance['mean_time'] = temp_performance['mean'].apply(seconds_to_time)
            temp_performance['median_time'] = temp_performance['median'].apply(seconds_to_time)
            
            print(f"\n{temp_col} effect on performance:")
            display(temp_performance[['count', 'mean_time', 'median_time']])
    
    # Analyze precipitation effects
    print("\n\nAnalyzing precipitation effects on performance:")
    print("="*80)
    
    precip_cols = [col for col in weather_cols if 'precip' in col.lower()]
    
    for precip_col in precip_cols:
        if df_weather[precip_col].dtype in ['float64', 'int64']:
            # Categorize precipitation levels
            df_weather[f'{precip_col}_category'] = pd.cut(
                df_weather[precip_col], 
                bins=[-0.1, 0.1, 1, 5, 100],
                labels=['Dry (<0.1)', 'Light (0.1-1)', 'Moderate (1-5)', 'Heavy (>5)']
            )
            
            precip_performance = df_weather.groupby(f'{precip_col}_category')['time_seconds'].agg(['mean', 'median', 'count'])
            precip_performance['mean_time'] = precip_performance['mean'].apply(seconds_to_time)
            precip_performance['median_time'] = precip_performance['median'].apply(seconds_to_time)
            
            print(f"\n{precip_col} effect on performance:")
            display(precip_performance[['count', 'mean_time', 'median_time']])
            
else:
    print("\nNo weather columns found in the dataset.")
    print("Weather features may need to be joined with the race data first.")

Available weather-related columns:
['full_temp_min', 'full_temp_max', 'full_temp_median_min', 'full_temp_median_max', 'full_overall_precip', 'full_overall_days_of_precip', 'full_overall_weekend_days_of_precip', 'peak_temp_min', 'peak_temp_max', 'peak_temp_median_min', 'peak_temp_median_max', 'peak_overall_precip', 'peak_overall_days_of_precip', 'peak_overall_weekend_days_of_precip']


Correlation between weather features and finish time:
(Positive correlation = worse performance with higher values)
full_temp_min: 0.1302
full_temp_max: 0.0089
full_temp_median_min: 0.0620
full_temp_median_max: 0.0650
full_overall_precip: -0.0811
full_overall_days_of_precip: -0.0931
full_overall_weekend_days_of_precip: -0.0661
peak_temp_min: 0.1088
peak_temp_max: 0.0458
peak_temp_median_min: 0.0843
peak_temp_median_max: 0.0755
peak_overall_precip: -0.0577
peak_overall_days_of_precip: -0.0714
peak_overall_weekend_days_of_precip: -0.0330


Analyzing temperature effects on performance:

full_temp_min effect 

Unnamed: 0_level_0,count,mean_time,median_time
full_temp_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-37.012, -25.65]",1130,04:38:23,04:21:52
"(-25.65, -14.4]",18286,04:28:26,04:16:34
"(-14.4, -3.15]",54847,04:25:24,04:15:42
"(-3.15, 8.1]",88707,04:28:15,04:18:52
"(8.1, 19.35]",143940,04:35:54,04:26:15
"(19.35, 30.6]",312268,04:40:17,04:30:19
"(30.6, 41.85]",543414,04:48:24,04:38:27
"(41.85, 53.1]",304242,04:43:41,04:35:50
"(53.1, 64.35]",48903,04:44:27,04:36:00
"(64.35, 75.6]",45885,05:45:19,05:27:29



full_temp_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(35.417, 43.8]",760,04:27:16,04:17:51
"(43.8, 52.1]",1598,04:38:08,04:27:01
"(52.1, 60.4]",11083,04:33:28,04:22:15
"(60.4, 68.7]",50169,04:30:14,04:19:00
"(68.7, 77.0]",151758,04:37:21,04:25:55
"(77.0, 85.3]",419988,04:53:31,04:40:32
"(85.3, 93.6]",496357,04:41:09,04:32:19
"(93.6, 101.9]",364728,04:41:30,04:33:18
"(101.9, 110.2]",58863,04:47:37,04:38:54
"(110.2, 118.5]",6318,04:50:37,04:42:02



full_temp_median_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_median_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-1.586, 7.135]",230,04:34:25,04:26:22
"(7.135, 15.77]",3105,04:31:50,04:16:49
"(15.77, 24.405]",26815,04:25:21,04:14:09
"(24.405, 33.04]",125856,04:26:49,04:16:34
"(33.04, 41.675]",219930,04:44:30,04:33:05
"(41.675, 50.31]",332726,04:53:08,04:42:11
"(50.31, 58.945]",354734,04:40:43,04:31:06
"(58.945, 67.58]",370536,04:38:46,04:32:27
"(67.58, 76.215]",119762,05:06:52,04:50:48
"(76.215, 84.85]",7928,04:45:19,04:35:56



full_temp_median_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_median_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(15.758, 25.065]",369,04:37:41,04:30:31
"(25.065, 34.28]",7684,04:28:43,04:16:37
"(34.28, 43.495]",58568,04:23:10,04:13:20
"(43.495, 52.71]",133728,04:29:33,04:18:31
"(52.71, 61.925]",253212,04:47:14,04:35:56
"(61.925, 71.14]",380798,04:48:45,04:38:07
"(71.14, 80.355]",481741,04:45:34,04:35:33
"(80.355, 89.57]",211451,04:43:29,04:35:51
"(89.57, 98.785]",29811,04:46:23,04:37:46
"(98.785, 108.0]",4260,04:47:34,04:36:50



peak_temp_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-37.018, -25.1]",280,04:39:40,04:24:27
"(-25.1, -13.3]",4501,04:41:42,04:29:34
"(-13.3, -1.5]",16764,04:45:02,04:30:27
"(-1.5, 10.3]",31202,04:40:48,04:27:17
"(10.3, 22.1]",96621,04:34:09,04:24:18
"(22.1, 33.9]",474717,04:38:06,04:28:06
"(33.9, 45.7]",555170,04:44:57,04:35:26
"(45.7, 57.5]",292389,04:45:05,04:37:15
"(57.5, 69.3]",65747,05:10:41,04:53:02
"(69.3, 81.1]",24231,05:39:12,05:22:45



peak_temp_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(25.107, 34.45]",169,04:28:56,04:21:27
"(34.45, 43.7]",4221,04:40:49,04:28:41
"(43.7, 52.95]",13765,04:44:52,04:31:30
"(52.95, 62.2]",50984,04:40:57,04:28:21
"(62.2, 71.45]",159136,04:36:48,04:24:46
"(71.45, 80.7]",555501,04:44:53,04:33:52
"(80.7, 89.95]",563913,04:43:48,04:34:57
"(89.95, 99.2]",181626,04:47:37,04:37:31
"(99.2, 108.45]",29283,04:54:20,04:45:12
"(108.45, 117.7]",3024,05:02:12,04:52:40



peak_temp_median_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_median_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-5.544, 4.0]",854,04:39:45,04:27:50
"(4.0, 13.45]",5367,04:42:06,04:29:17
"(13.45, 22.9]",18435,04:40:37,04:28:02
"(22.9, 32.35]",68582,04:39:44,04:26:38
"(32.35, 41.8]",280432,04:36:24,04:25:34
"(41.8, 51.25]",552126,04:44:51,04:34:50
"(51.25, 60.7]",443930,04:41:53,04:34:00
"(60.7, 70.15]",123327,04:44:22,04:36:17
"(70.15, 79.6]",67231,05:28:19,05:09:34
"(79.6, 89.05]",1338,05:11:21,04:53:34



peak_temp_median_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_median_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(10.099, 20.345]",556,04:33:24,04:22:22
"(20.345, 30.49]",5910,04:46:27,04:33:05
"(30.49, 40.635]",28483,04:42:48,04:30:32
"(40.635, 50.78]",84169,04:34:48,04:22:53
"(50.78, 60.925]",311445,04:35:32,04:24:34
"(60.925, 71.07]",651720,04:44:45,04:35:53
"(71.07, 81.215]",357626,04:50:22,04:38:36
"(81.215, 91.36]",102647,04:49:07,04:40:00
"(91.36, 101.505]",17634,04:55:33,04:44:53
"(101.505, 111.65]",1432,05:08:10,04:52:53




Analyzing precipitation effects on performance:

full_overall_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),3567,04:46:21,04:36:54
Light (0.1-1),4434,04:48:35,04:40:52
Moderate (1-5),23127,04:52:23,04:44:30
Heavy (>5),326807,04:56:03,04:44:03



full_overall_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),4277,04:45:06,04:34:50
Light (0.1-1),10037,04:52:22,04:44:45
Moderate (1-5),57691,04:55:45,04:47:05
Heavy (>5),1489617,04:43:33,04:33:11



full_overall_weekend_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_weekend_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),33578,04:49:11,04:41:00
Light (0.1-1),55533,04:54:07,04:44:50
Moderate (1-5),290253,04:52:09,04:41:23
Heavy (>5),1182258,04:41:27,04:31:23



peak_overall_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),42561,04:55:56,04:47:16
Light (0.1-1),31748,04:47:27,04:39:11
Moderate (1-5),55583,04:47:37,04:38:22
Heavy (>5),1014637,04:45:46,04:35:04



peak_overall_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),52086,04:54:05,04:45:22
Light (0.1-1),55964,04:53:06,04:44:13
Moderate (1-5),198324,04:52:15,04:41:46
Heavy (>5),1255248,04:41:57,04:31:42



peak_overall_weekend_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_weekend_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),179158,04:48:51,04:39:32
Light (0.1-1),212168,04:46:15,04:36:36
Moderate (1-5),1021409,04:43:30,04:33:00
Heavy (>5),148887,04:38:59,04:28:27


3. Are there differences in how training weather effect performance based on location, age, or sex?  Are there places that are "hardier" (i.e. training weather has less of an effect)?  Are older people more or less able to manage bad weather in training?  Etc.

In [47]:
df.columns

Index(['age', 'sex', 'time', 'race', 'date', 'city', 'state', 'full_temp_min',
       'full_temp_max', 'full_temp_median_min', 'full_temp_median_max',
       'full_overall_precip', 'full_overall_days_of_precip',
       'full_overall_weekend_days_of_precip', 'peak_temp_min', 'peak_temp_max',
       'peak_temp_median_min', 'peak_temp_median_max', 'peak_overall_precip',
       'peak_overall_days_of_precip', 'peak_overall_weekend_days_of_precip',
       'temp_join_idx'],
      dtype='str')

In [14]:
# Question 3: Differences in weather effects by location, age, and sex

# Check for demographic columns
demo_cols = [col for col in df.columns if any(x in col.lower() for x in ['age', 'sex', 'gender'])]
print("Available demographic columns:")
print(demo_cols)
print("="*80)

weather_cols = [col for col in df.columns if 'weather' in col.lower() or 'temp' in col.lower() or 'precip' in col.lower()]

if len(weather_cols) > 0 and len(demo_cols) > 0:
    
    df_interact = df.copy()
    df_interact['time_seconds'] = df_interact['time'].apply(time_to_seconds)
    df_interact = df_interact[df_interact['time_seconds'].notna()]
    
    # Analyze by sex/gender if available
    sex_cols = [col for col in demo_cols if 'sex' in col.lower() or 'gender' in col.lower()]
    if len(sex_cols) > 0:
        sex_col = sex_cols[0]
        print(f"\n\nAnalyzing weather effects by {sex_col}:")
        print("="*80)
        
        # For a key weather variable, see if effects differ by sex
        if len(weather_cols) > 0:
            # Use first temperature column if available
            temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
            if len(temp_cols) > 0:
                temp_col = temp_cols[0]
                
                # Create temperature categories
                df_interact[f'{temp_col}_category'] = pd.cut(
                    df_interact[temp_col],
                    bins=3,
                    labels=['Cold', 'Moderate', 'Hot']
                )
                
                print(f"\nPerformance by {temp_col} and {sex_col}:")
                sex_temp_performance = df_interact.groupby([sex_col, f'{temp_col}_category'])['time_seconds'].agg(['mean', 'count'])
                sex_temp_performance['mean_time'] = sex_temp_performance['mean'].apply(seconds_to_time)
                display(sex_temp_performance[['count', 'mean_time']])
    
    # Analyze by age if available
    age_cols = [col for col in demo_cols if 'age' in col.lower()]
    if len(age_cols) > 0:
        age_col = age_cols[0]
        print(f"\n\nAnalyzing weather effects by {age_col}:")
        print("="*80)
        
        # Create age categories
        df_interact['age_category'] = pd.cut(
            df_interact[age_col],
            bins=[0, 30, 40, 50, 60, 100],
            labels=['Under 30', '30-39', '40-49', '50-59', '60+']
        )
        
        if len(weather_cols) > 0:
            temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
            if len(temp_cols) > 0:
                temp_col = temp_cols[0]
                
                df_interact[f'{temp_col}_category'] = pd.cut(
                    df_interact[temp_col],
                    bins=3,
                    labels=['Cold', 'Moderate', 'Hot']
                )
                
                print(f"\nPerformance by {temp_col} and age category:")
                age_temp_performance = df_interact.groupby(['age_category', f'{temp_col}_category'])['time_seconds'].agg(['mean', 'count'])
                age_temp_performance['mean_time'] = age_temp_performance['mean'].apply(seconds_to_time)
                display(age_temp_performance[['count', 'mean_time']])
    
    # Analyze "hardiness" by location
    print("\n\nAnalyzing location 'hardiness' (cities where weather has less effect):")
    print("="*80)
    
    # For major cities, calculate how much performance varies with weather
    major_cities = df_interact.groupby(['city', 'state']).size()
    major_cities = major_cities[major_cities >= 5000].index
    
    if len(weather_cols) > 0:
        temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
        if len(temp_cols) > 0:
            temp_col = temp_cols[0]
            
            # For each major city, calculate correlation between weather and performance
            city_weather_sensitivity = {}
            
            for city, state in major_cities:
                city_df = df_interact[(df_interact['city'] == city) & (df_interact['state'] == state)]
                if len(city_df) > 100 and city_df[temp_col].notna().sum() > 100:
                    corr = city_df[[temp_col, 'time_seconds']].corr().iloc[0, 1]
                    city_weather_sensitivity[(city, state)] = {
                        'correlation': corr,
                        'count': len(city_df)
                    }
            
            # Convert to dataframe
            sensitivity_df = pd.DataFrame([
                {'city': city, 'state': state, 'temp_correlation': data['correlation'], 'count': data['count']}
                for (city, state), data in city_weather_sensitivity.items()
            ])
            
            # Sort by correlation (lower = more hardy, less affected by temperature)
            sensitivity_df = sensitivity_df.sort_values('temp_correlation')
            
            print(f"\nMost 'hardy' cities (least affected by {temp_col}):")
            print("(Lower correlation = better performance despite temperature variation)")
            display(sensitivity_df.head(15))
            
            print(f"\nLeast 'hardy' cities (most affected by {temp_col}):")
            print("(Higher correlation = worse performance with temperature variation)")
            display(sensitivity_df.tail(15))
            
else:
    print("\nInsufficient weather or demographic data for interaction analysis.")

Available demographic columns:
['age', 'sex']


Analyzing weather effects by sex:

Performance by full_temp_median_min and sex:


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_time
sex,full_temp_median_min_category,Unnamed: 2_level_1,Unnamed: 3_level_1
F,Cold,26260,04:38:06
F,Moderate,384981,05:02:22
F,Hot,270355,04:59:39
M,Cold,33255,04:14:23
M,Moderate,507578,04:31:41
M,Hot,338527,04:32:42




Analyzing weather effects by age:

Performance by full_temp_median_min and age category:


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_time
age_category,full_temp_median_min_category,Unnamed: 2_level_1,Unnamed: 3_level_1
Under 30,Cold,18461,04:20:22
Under 30,Moderate,273127,04:46:09
Under 30,Hot,181725,04:35:58
30-39,Cold,16258,04:18:59
30-39,Moderate,251323,04:37:28
30-39,Hot,176282,04:37:14
40-49,Cold,9891,04:26:40
40-49,Moderate,159534,04:44:27
40-49,Hot,107058,04:45:06
50-59,Cold,4741,04:43:48




Analyzing location 'hardiness' (cities where weather has less effect):

Most 'hardy' cities (least affected by full_temp_median_min):
(Lower correlation = better performance despite temperature variation)


Unnamed: 0,city,state,temp_correlation,count
0,albuquerque,nm,-0.269713,11787
27,los angeles,ca,-0.162472,118834
45,riverside,ca,-0.150718,7418
38,pasadena,ca,-0.140811,9850
52,santa monica,ca,-0.120343,12649
26,long beach,ca,-0.109255,17201
24,irvine,ca,-0.090472,12036
25,jacksonville,fl,-0.067352,12215
8,boston,ma,-0.063109,20957
41,pittsburgh,pa,-0.062322,27794



Least 'hardy' cities (most affected by full_temp_median_min):
(Higher correlation = worse performance with temperature variation)


Unnamed: 0,city,state,temp_correlation,count
59,virginia beach,va,0.07083,11167
18,denver,co,0.072008,30752
51,san jose,ca,0.073107,22796
23,houston,tx,0.079183,55577
1,alexandria,va,0.082453,24371
13,chicago,il,0.111947,117314
4,arlington,va,0.11807,31318
19,fairfax,va,0.118304,8020
60,washington,dc,0.126345,47300
20,falls church,va,0.146723,5237


## Hardiness in Cities
For this hardiness analysis, we examine the race results in aggregate.  First, we normalize the race times by taking taking the zscore of the times in a particular race.  Then we group by the city/state and date and then compute the median time, top 10% time, and weather features (which should all be the same, since they are shared amongst the city/state and date).  Then for each city/state we look at the correlation of all of the weather features to time, and generate a dataframe with the results.

In [50]:
# Hardiness Analysis: How weather affects performance by location
# Methodology:
# 1. Normalize race times using z-score within each race
# 2. Group by city/state/date to compute aggregated metrics
# 3. Calculate correlation between weather features and performance for each city
# 4. Identify "hardy" cities where weather has minimal impact

import numpy as np
from scipy import stats

# Prepare data: use df_enriched which has BQ-adjusted times
df_hardy = df_enriched.copy()

# Step 1: Calculate z-score for each runner's time within their race
# This normalizes for race difficulty
df_hardy['time_zscore'] = df_hardy.groupby(['race', 'date'])['time'].transform(
    lambda x: (x - x.mean()) / x.std() if x.std() > 0 else 0
)

# Step 2: Group by city, state, and date to get race-level metrics
# Using named aggregation for clearer column names
race_aggregates = df_hardy.groupby(['city', 'state', 'date']).agg(
    median_time=('time', 'median'),
    top10_time=('time', lambda x: np.percentile(x.dropna(), 10)),
    median_zscore=('time_zscore', 'median'),
    top10_zscore=('time_zscore', lambda x: np.percentile(x.dropna(), 10)),
    full_temp_min=('full_temp_min', 'first'),
    full_temp_max=('full_temp_max', 'first'),
    full_temp_median_min=('full_temp_median_min', 'first'),
    full_temp_median_max=('full_temp_median_max', 'first'),
    full_overall_precip=('full_overall_precip', 'first'),
    full_overall_days_of_precip=('full_overall_days_of_precip', 'first'),
    peak_temp_min=('peak_temp_min', 'first'),
    peak_temp_max=('peak_temp_max', 'first'),
    peak_temp_median_min=('peak_temp_median_min', 'first'),
    peak_temp_median_max=('peak_temp_median_max', 'first'),
    peak_overall_precip=('peak_overall_precip', 'first'),
    peak_overall_days_of_precip=('peak_overall_days_of_precip', 'first')
).reset_index()

print(f"Race-level aggregates: {len(race_aggregates)} unique race events")
print("="*80)
display(race_aggregates.head(10))

# Step 3: For each city/state, calculate correlation between weather and performance
# Weather features to analyze
weather_features = [
    'full_temp_min', 'full_temp_max', 'full_temp_median_min', 'full_temp_median_max',
    'full_overall_precip', 'full_overall_days_of_precip',
    'peak_temp_min', 'peak_temp_max', 'peak_temp_median_min', 'peak_temp_median_max',
    'peak_overall_precip', 'peak_overall_days_of_precip'
]

# Filter to cities with enough data (at least 10 race events)
city_counts = race_aggregates.groupby(['city', 'state']).size()
cities_with_data = city_counts[city_counts >= 10].index

print(f"\nAnalyzing {len(cities_with_data)} cities with 10+ race events")
print("="*80)

# Calculate correlations for each city
hardiness_results = []

for city, state in cities_with_data:
    city_df = race_aggregates[(race_aggregates['city'] == city) & 
                               (race_aggregates['state'] == state)]
    
    # Calculate correlations with median zscore (higher = worse performance with that weather)
    correlations = {}
    for feature in weather_features:
        if city_df[feature].notna().sum() > 5:  # Need at least 5 valid data points
            try:
                corr = city_df[['median_zscore', feature]].corr().iloc[0, 1]
                correlations[f'{feature}_corr'] = corr
            except:
                # Skip if correlation can't be calculated
                pass
    
    # Add city info and correlations
    result = {
        'city': city,
        'state': state,
        'num_races': len(city_df),
        'avg_median_time': city_df['median_time'].mean(),
        'avg_top10_time': city_df['top10_time'].mean()
    }
    result.update(correlations)
    hardiness_results.append(result)

# Convert to dataframe
hardiness_df = pd.DataFrame(hardiness_results)

# Calculate an overall "hardiness score" (average absolute correlation across all weather features)
# Lower score = more hardy (less affected by weather)
corr_cols = [col for col in hardiness_df.columns if col.endswith('_corr')]
hardiness_df['hardiness_score'] = hardiness_df[corr_cols].abs().mean(axis=1)

# Sort by hardiness score (lower = more hardy)
hardiness_df = hardiness_df.sort_values('hardiness_score')

print("\n\nMost 'Hardy' Cities (least affected by weather):")
print("="*80)
display(hardiness_df.head(15)[['city', 'state', 'num_races', 'hardiness_score', 
                                 'full_temp_median_min_corr', 'full_overall_precip_corr']])

print("\n\nLeast 'Hardy' Cities (most affected by weather):")
print("="*80)
display(hardiness_df.tail(15)[['city', 'state', 'num_races', 'hardiness_score',
                                 'full_temp_median_min_corr', 'full_overall_precip_corr']])

# Add absolute value columns for specific weather features to show strength of relationship
hardiness_df['temp_sensitivity'] = hardiness_df['full_temp_median_min_corr'].abs()
hardiness_df['precip_sensitivity'] = hardiness_df['full_overall_precip_corr'].abs()

print("\n\nCities Most Affected by Training Temperature (highest absolute correlation):")
print("="*80)
print("(High sensitivity = performance varies significantly with temperature)")
hardy_temp = hardiness_df.sort_values('temp_sensitivity', ascending=False)
display(hardy_temp.head(10)[['city', 'state', 'num_races', 'full_temp_median_min_corr', 'temp_sensitivity', 'hardiness_score']])

print("\n\nCities LEAST Affected by Training Temperature (lowest absolute correlation):")
print("="*80)
print("(Low sensitivity = performance consistent regardless of temperature)")
hardy_temp_resistant = hardiness_df.sort_values('temp_sensitivity')
display(hardy_temp_resistant.head(10)[['city', 'state', 'num_races', 'full_temp_median_min_corr', 'temp_sensitivity', 'hardiness_score']])

print("\n\nCities Most Affected by Training Precipitation (highest absolute correlation):")
print("="*80)
print("(High sensitivity = performance varies significantly with precipitation)")
hardy_precip = hardiness_df.sort_values('precip_sensitivity', ascending=False)
display(hardy_precip.head(10)[['city', 'state', 'num_races', 'full_overall_precip_corr', 'precip_sensitivity', 'hardiness_score']])

Race-level aggregates: 70212 unique race events


Unnamed: 0,city,state,date,median_time,top10_time,median_zscore,top10_zscore,full_temp_min,full_temp_max,full_temp_median_min,full_temp_median_max,full_overall_precip,full_overall_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip
0,albuquerque,nm,2000-01-08,321.3,258.273333,0.577953,-0.630077,19.7,83.0,35.65,57.15,15.4,8,19.7,53.3,27.15,45.3,3.7,3
1,albuquerque,nm,2000-01-16,249.666667,212.961667,-0.61811,-1.608105,19.7,73.7,32.6,55.85,5.9,6,19.7,59.4,27.65,48.9,3.7,3
2,albuquerque,nm,2000-02-06,180.666667,178.8,-1.499709,-1.532705,19.7,73.1,30.5,51.05,11.1,9,23.3,66.9,31.5,56.4,5.6,4
3,albuquerque,nm,2000-02-20,218.016667,141.766667,-0.868897,-2.518547,19.7,68.3,30.35,51.05,11.4,9,23.4,68.3,33.55,58.1,5.9,4
4,albuquerque,nm,2000-03-05,275.258333,177.018333,-0.533794,-2.205296,19.7,68.4,31.0,53.4,10.7,8,26.8,68.4,33.95,59.5,0.6,0
5,albuquerque,nm,2000-04-17,212.816667,165.843333,-0.289549,-1.610874,21.0,77.4,35.15,59.75,85.9,15,21.0,77.4,39.6,63.0,76.6,9
6,albuquerque,nm,2000-04-30,307.308333,258.941667,1.171419,-0.202887,21.0,84.7,37.7,62.35,82.5,14,31.7,84.7,44.55,72.95,10.8,5
7,albuquerque,nm,2000-05-06,277.7,254.681667,0.123694,-0.179875,21.0,89.9,39.25,64.85,80.7,12,36.5,89.9,47.2,75.5,5.1,3
8,albuquerque,nm,2000-05-07,197.666667,143.966667,-0.085107,-0.91051,21.0,89.9,39.5,64.9,80.7,12,36.5,89.9,47.45,76.8,5.1,3
9,albuquerque,nm,2000-05-27,385.766667,385.766667,2.217595,2.217595,21.0,96.4,43.7,69.95,81.5,13,44.1,96.4,56.0,85.35,1.5,1



Analyzing 85 cities with 10+ race events


Most 'Hardy' Cities (least affected by weather):


Unnamed: 0,city,state,num_races,hardiness_score,full_temp_median_min_corr,full_overall_precip_corr
8,birmingham,al,783,0.009906,0.004283,-0.017619
1,alexandria,va,1164,0.009965,-0.008987,-9.2e-05
55,phoenix,az,1134,0.010812,0.012959,-0.013306
83,washington,dc,1300,0.013556,-0.010645,-0.00298
22,denver,co,1296,0.014082,-0.010256,0.014299
17,chicago,il,1560,0.014429,0.006444,-0.014743
69,santa clarita,ca,283,0.016874,0.01467,0.028078
37,los angeles,ca,1101,0.017004,-0.025926,-0.003768
36,long beach,ca,684,0.017427,-0.015772,0.007713
63,sacramento,ca,725,0.017656,0.036651,0.017831




Least 'Hardy' Cities (most affected by weather):


Unnamed: 0,city,state,num_races,hardiness_score,full_temp_median_min_corr,full_overall_precip_corr
50,oklahoma city,ok,692,0.074231,0.063124,0.045525
35,jacksonville,fl,897,0.074275,0.086844,0.049643
24,fairfax,va,727,0.076596,0.067331,0.034902
6,atlanta,ga,1367,0.081481,0.08061,0.014831
18,cincinnati,oh,1318,0.084142,0.09821,0.03654
73,shreveport,la,436,0.085135,0.101767,-0.121078
64,salem,or,546,0.089436,-0.176592,0.103193
80,tustin,ca,336,0.091916,0.099484,-0.075968
40,madison,wi,972,0.093458,0.098756,0.089655
21,dallas,tx,1344,0.100504,0.120944,0.031818




Cities Most Affected by Training Temperature (highest absolute correlation):
(High sensitivity = performance varies significantly with temperature)


Unnamed: 0,city,state,num_races,full_temp_median_min_corr,temp_sensitivity,hardiness_score
64,salem,or,546,-0.176592,0.176592,0.089436
76,st. george,ut,76,0.15612,0.15612,0.163057
41,matthews,nc,387,0.149912,0.149912,0.115027
30,hermosa beach,ca,389,0.139676,0.139676,0.132855
21,dallas,tx,1344,0.120944,0.120944,0.100504
29,greenville,sc,746,0.118629,0.118629,0.102865
73,shreveport,la,436,0.101767,0.101767,0.085135
80,tustin,ca,336,0.099484,0.099484,0.091916
40,madison,wi,972,0.098756,0.098756,0.093458
18,cincinnati,oh,1318,0.09821,0.09821,0.084142




Cities LEAST Affected by Training Temperature (lowest absolute correlation):
(Low sensitivity = performance consistent regardless of temperature)


Unnamed: 0,city,state,num_races,full_temp_median_min_corr,temp_sensitivity,hardiness_score
49,oakland,ca,757,0.002111,0.002111,0.024267
33,hudson,wi,294,-0.003281,0.003281,0.051092
68,san jose,ca,1109,0.003415,0.003415,0.026065
8,birmingham,al,783,0.004283,0.004283,0.009906
27,frederick,md,627,-0.005092,0.005092,0.032258
17,chicago,il,1560,0.006444,0.006444,0.014429
12,brooklyn,ny,1169,0.006446,0.006446,0.036265
53,pasadena,ca,575,0.008141,0.008141,0.025073
1,alexandria,va,1164,-0.008987,0.008987,0.009965
48,new york,ny,1495,0.009128,0.009128,0.02342




Cities Most Affected by Training Precipitation (highest absolute correlation):
(High sensitivity = performance varies significantly with precipitation)


Unnamed: 0,city,state,num_races,full_overall_precip_corr,precip_sensitivity,hardiness_score
76,st. george,ut,76,-0.164687,0.164687,0.163057
73,shreveport,la,436,-0.121078,0.121078,0.085135
9,boca raton,fl,531,0.108985,0.108985,0.123153
64,salem,or,546,0.103193,0.103193,0.089436
30,hermosa beach,ca,389,-0.10125,0.10125,0.132855
47,nashville,tn,1108,-0.091039,0.091039,0.040915
40,madison,wi,972,0.089655,0.089655,0.093458
79,tulsa,ok,1003,0.08538,0.08538,0.060603
51,olathe,ks,668,0.082993,0.082993,0.064721
33,hudson,wi,294,0.080459,0.080459,0.051092
