# Resarch Notebook

In [1]:
import pandas as pd
FEATURE_DATA_FILE = "../data/featurized_race_data_v2.csv"
df = pd.read_csv(FEATURE_DATA_FILE)

# Remove some rows that have parsing errors
df = df[df["sex"].isin(['M','F'])]

# Enrich with some additional metadata
Add some additional metadata, like the BQ times and the AG relative time

In [2]:
bq_data = pd.read_csv("../data/boston_marathon_qualifying.csv")
import numpy as np
bq_data['min_age'] = bq_data['min_age'].apply(float)
# Keep only rows with valid age values
df["temp_join_idx"] = df.index
df_with_age = df[df['age'].notna()].copy()

# Then do the merge
df_with_age_sorted = df_with_age.sort_values('age')
df_with_age_sorted = pd.merge_asof(
    df_with_age_sorted,
    bq_data[['min_age', 'men_standard_minutes', 'women_standard_minutes', 'nonbinary_standard_minutes']],
    left_on='age',
    right_on='min_age',
    direction='backward'
)

# Add the BQ standard based on sex
conditions = [
    df_with_age_sorted['sex'].str.lower().isin(['m', 'male', 'man']),
    df_with_age_sorted['sex'].str.lower().isin(['f', 'female', 'woman'])
]
choices = [
    df_with_age_sorted['men_standard_minutes'],
    df_with_age_sorted['women_standard_minutes']
]
df_with_age_sorted['bq_standard_minutes'] = np.select(conditions, choices, default=df_with_age_sorted['nonbinary_standard_minutes'])

# If you need the full dataset back with nulls for missing ages:
df_with_age_sorted = df_with_age_sorted.drop(columns=['min_age', 'men_standard_minutes', 'women_standard_minutes', 'nonbinary_standard_minutes'])
df_enriched = df.merge(df_with_age_sorted[['temp_join_idx', 'bq_standard_minutes']], on='temp_join_idx', how='left')
df_enriched = df_enriched.drop(columns="temp_join_idx")

df_enriched["bq_adjusted_time"] = df_enriched.apply(
    lambda row:None if pd.isnull(row["bq_standard_minutes"]) else row["time"] - row["bq_standard_minutes"],
    axis=1)
df_enriched = df_enriched[df_enriched["bq_adjusted_time"].notna()]
display(df_enriched)

Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip,bq_standard_minutes,bq_adjusted_time
0,41.0,M,203.450000,"""last_chance_for_boston""_marathon",2003-02-02,ann arbor,mi,-4.3,61.5,21.95,...,8,-4.3,42.7,12.00,25.25,13.3,9,3,185.0,18.450000
1,48.0,M,203.366667,"""last_chance_for_boston""_marathon",2003-02-02,champaign,il,-1.0,63.9,25.75,...,10,-1.0,53.0,16.60,31.65,20.7,9,5,195.0,8.366667
2,26.0,F,220.983333,"""last_chance_for_boston""_marathon",2003-02-02,chicago,il,-3.5,62.4,26.90,...,12,-3.5,50.8,13.70,28.55,15.8,9,6,205.0,15.983333
3,43.0,M,194.733333,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.10,...,11,-10.2,52.6,19.05,30.90,15.9,10,5,185.0,9.733333
4,31.0,M,213.250000,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.10,...,11,-10.2,52.6,19.05,30.90,15.9,10,5,175.0,38.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687457,55.0,M,306.166667,zydeco_marathon,2020-03-08,fort worth,tx,26.1,78.3,40.60,...,9,26.1,78.3,42.00,60.65,112.8,12,3,210.0,96.166667
1687458,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.70,67.70,110.5,15,4,175.0,84.833333
1687459,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.70,67.70,110.5,15,4,175.0,84.833333
1687460,55.0,M,275.700000,zydeco_marathon,2020-03-08,mobile,al,29.6,75.6,48.20,...,13,35.7,73.5,49.45,65.40,127.4,15,3,210.0,65.700000


## Data Profiling

In [None]:
# Basic DataFrame Info
print(f"Shape: {df_enriched.shape[0]:,} rows × {df_enriched.shape[1]} columns")
print(f"\nMemory Usage: {df_enriched.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\n" + "="*80)
df_enriched.info()

Shape: 1,688,270 rows × 21 columns

Memory Usage: 343.69 MB

<class 'pandas.DataFrame'>
RangeIndex: 1688270 entries, 0 to 1688269
Data columns (total 21 columns):
 #   Column                               Non-Null Count    Dtype  
---  ------                               --------------    -----  
 0   age                                  1447190 non-null  float64
 1   sex                                  1688255 non-null  str    
 2   time                                 1561622 non-null  float64
 3   race                                 1688270 non-null  str    
 4   date                                 1688270 non-null  str    
 5   city                                 1688270 non-null  str    
 6   state                                1688270 non-null  str    
 7   full_temp_min                        1688270 non-null  float64
 8   full_temp_max                        1688270 non-null  float64
 9   full_temp_median_min                 1688270 non-null  float64
 10  full_temp_median

In [3]:
# Categorical Columns Distribution
print("Categorical Columns Summary:")
print("="*80)

categorical_cols = df_enriched.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    print(f"Found {len(categorical_cols)} categorical columns\n")
    for col in categorical_cols:
        unique_count = df_enriched[col].nunique()
        print(f"\n{col}: {unique_count} unique values")
        if unique_count <= 20:  # Only show value counts for columns with <= 20 unique values
            print(df_enriched[col].value_counts().head(10))
        else:
            print(f"  Top 5 most common values:")
            print(df_enriched[col].value_counts().head(5))
else:
    print("No categorical columns found")

Categorical Columns Summary:
Found 5 categorical columns


sex: 2 unique values
sex
M    753026
F    580182
Name: count, dtype: int64

race: 1150 unique values
  Top 5 most common values:
race
chicago_marathon                                117255
city_of_los_angeles_marathon_(l.a._marathon)    113140
marine_corps_marathon                            89224
boston_marathon                                  60412
philadelphia_marathon                            46234
Name: count, dtype: int64

date: 2446 unique values
  Top 5 most common values:
date
2016-10-09    17071
2012-10-07    16807
2023-11-05    16267
2011-10-09    15388
2017-10-08    14396
Name: count, dtype: int64

city: 85 unique values
  Top 5 most common values:
city
chicago          104226
los angeles       95566
new york          58044
houston           49534
san francisco     46718
Name: count, dtype: int64

state: 33 unique values
  Top 5 most common values:
state
ca    297434
tx    124162
il    107089
ny     80740
va     

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = df_enriched.select_dtypes(include=['object']).columns


In [None]:
# Numeric Columns Distribution
print("Numeric Columns Summary:")
print("="*80)

numeric_cols = df_enriched.select_dtypes(include=['int64', 'float64']).columns
if len(numeric_cols) > 0:
    print(f"Found {len(numeric_cols)} numeric columns\n")
    for col in numeric_cols:
        print(f"\n{col}:")
        print(f"  Min: {df[col].min()}")
        print(f"  Max: {df[col].max()}")
        print(f"  Mean: {df[col].mean():.2f}")
        print(f"  Median: {df[col].median():.2f}")
        print(f"  Std Dev: {df[col].std():.2f}")
else:
    print("No numeric columns found")

Numeric Columns Summary:
Found 16 numeric columns


age:
  Min: 0.0
  Max: 3225.0
  Mean: 36.57
  Median: 35.00
  Std Dev: 11.29

time:
  Min: 122.71666666666668
  Max: 1299.75
  Mean: 284.07
  Median: 273.78
  Std Dev: 67.61

full_temp_min:
  Min: -36.9
  Max: 75.6
  Mean: 31.87
  Median: 33.80
  Std Dev: 16.97

full_temp_max:
  Min: 35.5
  Max: 118.5
  Mean: 87.40
  Median: 88.30
  Std Dev: 9.60

full_temp_median_min:
  Min: -1.5
  Max: 84.85
  Mean: 51.51
  Median: 52.80
  Std Dev: 13.02

full_temp_median_max:
  Min: 15.85
  Max: 108.0
  Mean: 68.76
  Median: 70.60
  Std Dev: 12.71

full_overall_precip:
  Min: 0.0
  Max: 1313.9
  Mean: 217.90
  Median: 212.30
  Std Dev: 138.44

full_overall_days_of_precip:
  Min: 0
  Max: 90
  Mean: 31.15
  Median: 33.00
  Std Dev: 14.07

full_overall_weekend_days_of_precip:
  Min: 0
  Max: 26
  Mean: 8.59
  Median: 9.00
  Std Dev: 4.19

peak_temp_min:
  Min: -36.9
  Max: 81.1
  Mean: 37.27
  Median: 37.10
  Std Dev: 13.54

peak_temp_max:
  Min: 25.

In [5]:
# Data Types and Unique Values
print("Column Data Types and Unique Values:")
print("="*80)

column_info = pd.DataFrame({
    'Column': df_enriched.columns,
    'Data_Type': df_enriched.dtypes.values,
    'Unique_Count': [df_enriched[col].nunique() for col in df_enriched.columns],
    'Sample_Values': [str(df_enriched[col].dropna().unique()[:3].tolist()) if df_enriched[col].nunique() <= 50 else 'Too many to display' for col in df_enriched.columns]
})

display(column_info)

Column Data Types and Unique Values:


Unnamed: 0,Column,Data_Type,Unique_Count,Sample_Values
0,age,float64,84,Too many to display
1,sex,str,2,"['M', 'F']"
2,time,float64,25760,Too many to display
3,race,str,1150,Too many to display
4,date,str,2446,Too many to display
5,city,str,85,Too many to display
6,state,str,33,"['mi', 'il', 'oh']"
7,full_temp_min,float64,991,Too many to display
8,full_temp_max,float64,765,Too many to display
9,full_temp_median_min,float64,2047,Too many to display


In [6]:
# Missing Values Analysis
print("Missing Values Analysis:")
print("="*80)

missing_data = pd.DataFrame({
    'Column': df_enriched.columns,
    'Missing_Count': df_enriched.isnull().sum().values,
    'Missing_Percent': (df_enriched.isnull().sum().values / len(df_enriched) * 100).round(2)
})

# Only show columns with missing values
missing_data = missing_data[missing_data['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_data) > 0:
    display(missing_data)
else:
    print("No missing values found in any column!")

Missing Values Analysis:
No missing values found in any column!


In [7]:
# Statistical Summary
print("Statistical Summary (all columns):")
print("="*80)
display(df_enriched.describe(include='all').T)  # Transpose for better readability

Statistical Summary (all columns):


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,1333208.0,,,,36.907757,11.044292,18.0,29.0,35.0,44.0,3225.0
sex,1333208.0,2.0,M,753026.0,,,,,,,
time,1333208.0,,,,282.1556,65.992492,122.716667,234.633333,272.466667,319.233333,1299.75
race,1333208.0,1150.0,chicago_marathon,117255.0,,,,,,,
date,1333208.0,2446.0,2016-10-09,17071.0,,,,,,,
city,1333208.0,85.0,chicago,104226.0,,,,,,,
state,1333208.0,33.0,ca,297434.0,,,,,,,
full_temp_min,1333208.0,,,,31.261177,16.345674,-36.9,23.7,33.5,41.7,75.1
full_temp_max,1333208.0,,,,87.691344,9.632707,35.5,81.6,89.0,94.5,118.5
full_temp_median_min,1333208.0,,,,51.158307,12.691853,-1.5,42.15,52.35,61.4,84.85


In [8]:
# Data Preview
print("First 5 rows:")
print("="*80)
display(df_enriched.head())

print("\n\nLast 5 rows:")
print("="*80)
display(df_enriched.tail())

print("\n\nRandom sample of 5 rows:")
print("="*80)
display(df_enriched.sample(5))

First 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip,bq_standard_minutes,bq_adjusted_time
0,41.0,M,203.45,"""last_chance_for_boston""_marathon",2003-02-02,ann arbor,mi,-4.3,61.5,21.95,...,8,-4.3,42.7,12.0,25.25,13.3,9,3,185.0,18.45
1,48.0,M,203.366667,"""last_chance_for_boston""_marathon",2003-02-02,champaign,il,-1.0,63.9,25.75,...,10,-1.0,53.0,16.6,31.65,20.7,9,5,195.0,8.366667
2,26.0,F,220.983333,"""last_chance_for_boston""_marathon",2003-02-02,chicago,il,-3.5,62.4,26.9,...,12,-3.5,50.8,13.7,28.55,15.8,9,6,205.0,15.983333
3,43.0,M,194.733333,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.1,...,11,-10.2,52.6,19.05,30.9,15.9,10,5,185.0,9.733333
4,31.0,M,213.25,"""last_chance_for_boston""_marathon",2003-02-02,cincinnati,oh,-10.2,69.2,27.1,...,11,-10.2,52.6,19.05,30.9,15.9,10,5,175.0,38.25




Last 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip,bq_standard_minutes,bq_adjusted_time
1687457,55.0,M,306.166667,zydeco_marathon,2020-03-08,fort worth,tx,26.1,78.3,40.6,...,9,26.1,78.3,42.0,60.65,112.8,12,3,210.0,96.166667
1687458,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.7,67.7,110.5,15,4,175.0,84.833333
1687459,23.0,M,259.833333,zydeco_marathon,2020-03-08,houston,tx,31.9,82.1,47.25,...,11,34.1,77.7,49.7,67.7,110.5,15,4,175.0,84.833333
1687460,55.0,M,275.7,zydeco_marathon,2020-03-08,mobile,al,29.6,75.6,48.2,...,13,35.7,73.5,49.45,65.4,127.4,15,3,210.0,65.7
1687461,55.0,M,275.7,zydeco_marathon,2020-03-08,mobile,al,29.6,75.6,48.2,...,13,35.7,73.5,49.45,65.4,127.4,15,3,210.0,65.7




Random sample of 5 rows:


Unnamed: 0,age,sex,time,race,date,city,state,full_temp_min,full_temp_max,full_temp_median_min,...,full_overall_weekend_days_of_precip,peak_temp_min,peak_temp_max,peak_temp_median_min,peak_temp_median_max,peak_overall_precip,peak_overall_days_of_precip,peak_overall_weekend_days_of_precip,bq_standard_minutes,bq_adjusted_time
1626476,31.0,F,330.266667,twin_cities_marathon,2006-10-01,minneapolis,mn,42.2,98.7,64.05,...,11,42.2,80.0,52.2,65.15,113.6,14,6,205.0,125.266667
1205981,46.0,M,348.516667,oklahoma_city_memorial_marathon,2013-04-28,dallas,tx,31.3,88.0,46.25,...,9,39.1,88.0,53.0,73.25,94.2,11,2,195.0,153.516667
229779,28.0,M,345.916667,charlotte's_thunder_road_marathon,2019-11-16,charlotte,nc,22.5,95.2,63.2,...,10,22.5,79.9,45.9,64.35,132.1,14,4,175.0,170.916667
1356288,36.0,M,269.116667,richmond_marathon,2021-11-13,raleigh,nc,37.0,96.2,61.3,...,7,37.0,84.4,49.65,69.85,65.8,8,1,180.0,89.116667
1193635,62.0,M,271.95,oc_marathon,2013-05-05,san diego,ca,38.2,84.5,50.35,...,2,47.3,84.5,52.9,68.9,1.5,1,0,230.0,41.95


## Location Performance Questions
1. Which major cities have the fastest runners (overall and median)?

In [9]:
# change df to df_enriched, and override the time column to make it all age/sex normalized
df = df_enriched
df["time"] = df["bq_adjusted_time"]

# Question 1: Which major cities have the fastest runners (overall and median)?

# Filter to major cities (cities with at least 10,000 runners)
city_counts = df.groupby(['city', 'state']).size()
major_cities = city_counts[city_counts >= 1000].index

# Filter df to only major cities
df_major = df[df.apply(lambda row: (row['city'], row['state']) in major_cities, axis=1)]

print(f"Analyzing {len(major_cities)} major cities (10,000+ runners each)")
print("="*80)

# Convert time to seconds for easier analysis
def time_to_seconds(time_minutes:int):
    return time_minutes*60

# Apply conversion
df_major['time_seconds'] = df_major['time'].apply(time_to_seconds)

# Group by city and calculate statistics
city_performance = df_major.groupby(['city', 'state'])['time_seconds'].agg([
    ('count', 'count'),
    ('mean_seconds', 'mean'),
    ('median_seconds', 'median'),
    ('std_seconds', 'std')
]).reset_index()

# Convert back to time format for display
def seconds_to_time(seconds):
    """Convert seconds to HH:MM:SS format"""
    if pd.isna(seconds):
        return None
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

city_performance['mean_time'] = city_performance['mean_seconds'].apply(seconds_to_time)
city_performance['median_time'] = city_performance['median_seconds'].apply(seconds_to_time)

# Sort by fastest mean time
#city_performance_sorted_mean = city_performance.sort_values('mean_seconds').head(20)
#print("\n\nTop 20 Fastest Cities by MEAN finish time:")
#print("="*80)
#display(city_performance_sorted_mean[['city', 'state', 'count', 'mean_time', 'median_time']])

# Sort by fastest median time
city_performance_sorted_median = city_performance.sort_values('median_seconds').head(10)
print("\n\nTop 10 Fastest Cities by MEDIAN finish time:")
print("="*80)
display(city_performance_sorted_median[['city', 'state', 'count', 'mean_time', 'median_time']])

# Find slowest cities for comparison
city_performance_sorted_slowest = city_performance.sort_values('median_seconds', ascending=False).head(10)
print("\n\nSlowest 10 Major Cities by MEDIAN finish time (for comparison):")
print("="*80)
display(city_performance_sorted_slowest[['city', 'state', 'count', 'mean_time', 'median_time']])

# NEW ANALYSIS: Top 10% of runners by city
print("\n\n" + "="*80)
print("ELITE RUNNER ANALYSIS - Top 10% of Runners by City")
print("="*80)

# Calculate top 10% performance for each major city
elite_performance = []

for city, state in major_cities:
    city_df = df_major[(df_major['city'] == city) & (df_major['state'] == state)]
    city_df_valid = city_df[city_df['time_seconds'].notna()]
    
    if len(city_df_valid) > 0:
        # Get top 10% (fastest times = lowest seconds)
        top_10_pct_threshold = city_df_valid['time_seconds'].quantile(0.10)
        top_10_pct = city_df_valid[city_df_valid['time_seconds'] <= top_10_pct_threshold]
        
        elite_performance.append({
            'city': city,
            'state': state,
            'total_runners': len(city_df_valid),
            'elite_count': len(top_10_pct),
            'elite_mean_seconds': top_10_pct['time_seconds'].mean(),
            'elite_median_seconds': top_10_pct['time_seconds'].median(),
            'elite_max_seconds': top_10_pct['time_seconds'].max(),  # Slowest of the elite
            'elite_min_seconds': top_10_pct['time_seconds'].min()   # Fastest overall
        })

elite_df = pd.DataFrame(elite_performance)

# Convert times to readable format
elite_df['elite_mean_time'] = elite_df['elite_mean_seconds'].apply(seconds_to_time)
elite_df['elite_median_time'] = elite_df['elite_median_seconds'].apply(seconds_to_time)
elite_df['elite_max_time'] = elite_df['elite_max_seconds'].apply(seconds_to_time)
elite_df['elite_min_time'] = elite_df['elite_min_seconds'].apply(seconds_to_time)

# Sort by fastest elite mean time
elite_sorted = elite_df.sort_values('elite_mean_seconds')

print("\n\nTop 10 Cities with Fastest ELITE Runners (top 10% by median):")
print("="*80)
elite_sorted_median = elite_df.sort_values('elite_median_seconds')
display(elite_sorted_median.head(10)[['city', 'state', 'total_runners', 'elite_count', 
                                        'elite_mean_time', 'elite_median_time', 'elite_min_time']])

# Show cities with the absolute fastest individual runners
print("\n\nCities with the Absolute Fastest Individual Runners:")
print("="*80)
elite_fastest_individual = elite_df.sort_values('elite_min_seconds')
display(elite_fastest_individual.head(10)[['city', 'state', 'total_runners', 
                                             'elite_min_time', 'elite_mean_time', 'elite_median_time']])

Analyzing 84 major cities (10,000+ runners each)


Top 10 Fastest Cities by MEDIAN finish time:


Unnamed: 0,city,state,count,mean_time,median_time
14,cambridge,ma,9540,00:56:21,00:49:34
11,boulder,co,9608,01:01:34,00:51:14
3,ann arbor,mi,8431,01:04:49,00:55:35
48,new york,ny,58044,01:04:05,00:56:18
39,loveland,oh,1803,01:06:20,00:56:56
40,madison,wi,11276,01:04:34,00:58:11
12,brooklyn,ny,22696,01:08:49,00:59:24
44,minneapolis,mn,36339,01:05:43,00:59:28
74,sioux falls,sd,3899,01:06:02,00:59:58
52,omaha,ne,10286,01:07:47,01:00:13




Slowest 10 Major Cities by MEDIAN finish time (for comparison):


Unnamed: 0,city,state,count,mean_time,median_time
31,honolulu,hi,16099,02:20:47,02:07:26
37,los angeles,ca,95566,02:05:57,01:59:04
62,riverside,ca,6738,01:59:50,01:49:41
75,south pasadena,ca,1721,01:53:32,01:45:41
36,long beach,ca,15930,01:52:04,01:43:30
69,santa clarita,ca,2035,01:52:00,01:43:26
79,tustin,ca,2697,01:45:10,01:38:57
53,pasadena,ca,9135,01:47:42,01:38:21
70,santa monica,ca,11859,01:38:16,01:29:34
34,irvine,ca,11203,01:35:24,01:27:08




ELITE RUNNER ANALYSIS - Top 10% of Runners by City


Top 10 Cities with Fastest ELITE Runners (top 10% by median):


Unnamed: 0,city,state,total_runners,elite_count,elite_mean_time,elite_median_time,elite_min_time
11,boulder,co,9608,961,-1:35:34,-1:37:00,-2:50:37
14,cambridge,ma,9540,954,-1:47:22,-1:49:56,-1:03:39
0,albuquerque,nm,10119,1012,-1:46:20,-1:50:36,-1:01:20
2,anchorage,ak,5129,513,-1:50:45,-1:53:26,-2:51:35
61,redmond,wa,1896,190,-1:52:03,-1:54:22,-1:10:03
74,sioux falls,sd,3899,390,-1:53:37,-1:54:40,-1:21:46
8,birmingham,al,8851,886,-1:52:41,-1:54:50,-1:03:19
3,ann arbor,mi,8431,844,-1:52:16,-1:55:00,-1:02:16
13,broomfield,co,2181,220,-1:51:26,-1:55:07,-1:07:48
65,salt lake city,ut,16328,1633,-1:52:02,-1:55:09,-2:55:54




Cities with the Absolute Fastest Individual Runners:


Unnamed: 0,city,state,total_runners,elite_min_time,elite_mean_time,elite_median_time
76,tampa,fl,12919,-3:18:49,00:01:02,00:05:05
31,honolulu,hi,16099,-3:47:46,00:08:54,00:11:41
37,los angeles,ca,95566,-2:03:04,00:13:59,00:17:09
68,san jose,ca,19278,-2:09:36,00:00:14,00:03:13
18,cincinnati,oh,20110,-2:19:09,-1:56:33,-1:59:18
36,long beach,ca,15930,-2:19:09,00:09:05,00:11:30
44,minneapolis,mn,36339,-2:21:01,-1:54:45,-1:57:54
54,philadelphia,pa,29904,-2:32:39,-1:55:19,-1:58:43
53,pasadena,ca,9135,-2:33:53,00:04:08,00:07:36
75,south pasadena,ca,1721,-2:33:58,00:02:02,00:11:09


2. What is the relation of weather during the training period (full and peak) to the performance during the race?  How do things like heat, cold, and precipitation effect performance?

In [10]:
# Question 2: Relation of weather during training period to race performance

# First, check what weather columns we have
weather_cols = [col for col in df.columns if 'weather' in col.lower() or 'temp' in col.lower() or 'precip' in col.lower()]
print("Available weather-related columns:")
print(weather_cols)
print("="*80)

# If we have weather columns, analyze their relationship with performance
if len(weather_cols) > 0:
    # Create a copy with time in seconds
    df_weather = df.copy()
    df_weather['time_seconds'] = df_weather['time'].apply(time_to_seconds)
    
    # Remove invalid times
    df_weather = df_weather[df_weather['time_seconds'].notna()]
    
    # For each weather column, calculate correlation with performance
    print("\n\nCorrelation between weather features and finish time:")
    print("(Positive correlation = worse performance with higher values)")
    print("="*80)
    
    weather_correlations = {}
    for col in weather_cols:
        if df_weather[col].dtype in ['float64', 'int64']:
            corr = df_weather[[col, 'time_seconds']].corr().iloc[0, 1]
            weather_correlations[col] = corr
            print(f"{col}: {corr:.4f}")
    
    # Analyze specific weather patterns
    print("\n\nAnalyzing temperature effects on performance:")
    print("="*80)
    
    # Check for temperature columns (full and peak training periods)
    temp_cols = [col for col in weather_cols if 'temp' in col.lower()]
    
    for temp_col in temp_cols:
        if df_weather[temp_col].dtype in ['float64', 'int64']:
            # Bin temperatures and calculate average finish times
            df_weather[f'{temp_col}_bin'] = pd.cut(df_weather[temp_col], bins=10)
            temp_performance = df_weather.groupby(f'{temp_col}_bin')['time_seconds'].agg(['mean', 'median', 'count'])
            temp_performance['mean_time'] = temp_performance['mean'].apply(seconds_to_time)
            temp_performance['median_time'] = temp_performance['median'].apply(seconds_to_time)
            
            print(f"\n{temp_col} effect on performance:")
            display(temp_performance[['count', 'mean_time', 'median_time']])
    
    # Analyze precipitation effects
    print("\n\nAnalyzing precipitation effects on performance:")
    print("="*80)
    
    precip_cols = [col for col in weather_cols if 'precip' in col.lower()]
    
    for precip_col in precip_cols:
        if df_weather[precip_col].dtype in ['float64', 'int64']:
            # Categorize precipitation levels
            df_weather[f'{precip_col}_category'] = pd.cut(
                df_weather[precip_col], 
                bins=[-0.1, 0.1, 1, 5, 100],
                labels=['Dry (<0.1)', 'Light (0.1-1)', 'Moderate (1-5)', 'Heavy (>5)']
            )
            
            precip_performance = df_weather.groupby(f'{precip_col}_category')['time_seconds'].agg(['mean', 'median', 'count'])
            precip_performance['mean_time'] = precip_performance['mean'].apply(seconds_to_time)
            precip_performance['median_time'] = precip_performance['median'].apply(seconds_to_time)
            
            print(f"\n{precip_col} effect on performance:")
            display(precip_performance[['count', 'mean_time', 'median_time']])
            
else:
    print("\nNo weather columns found in the dataset.")
    print("Weather features may need to be joined with the race data first.")

Available weather-related columns:
['full_temp_min', 'full_temp_max', 'full_temp_median_min', 'full_temp_median_max', 'full_overall_precip', 'full_overall_days_of_precip', 'full_overall_weekend_days_of_precip', 'peak_temp_min', 'peak_temp_max', 'peak_temp_median_min', 'peak_temp_median_max', 'peak_overall_precip', 'peak_overall_days_of_precip', 'peak_overall_weekend_days_of_precip']


Correlation between weather features and finish time:
(Positive correlation = worse performance with higher values)
full_temp_min: 0.1034
full_temp_max: 0.0107
full_temp_median_min: 0.0400
full_temp_median_max: 0.0515
full_overall_precip: -0.0685
full_overall_days_of_precip: -0.1027
full_overall_weekend_days_of_precip: -0.0783
peak_temp_min: 0.0768
peak_temp_max: 0.0377
peak_temp_median_min: 0.0530
peak_temp_median_max: 0.0537
peak_overall_precip: -0.0511
peak_overall_days_of_precip: -0.0741
peak_overall_weekend_days_of_precip: -0.0371


Analyzing temperature effects on performance:

full_temp_min effect 

Unnamed: 0_level_0,count,mean_time,median_time
full_temp_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-37.012, -25.7]",717,01:35:52,01:16:23
"(-25.7, -14.5]",14058,01:06:53,00:55:10
"(-14.5, -3.3]",43603,01:10:18,00:59:52
"(-3.3, 7.9]",78437,01:10:14,01:00:47
"(7.9, 19.1]",118302,01:18:49,01:08:47
"(19.1, 30.3]",249535,01:21:52,01:12:21
"(30.3, 41.5]",484169,01:29:26,01:20:25
"(41.5, 52.7]",276431,01:26:47,01:18:50
"(52.7, 63.9]",46664,01:24:19,01:15:30
"(63.9, 75.1]",21292,02:09:33,01:53:21



full_temp_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(35.417, 43.8]",620,01:05:38,00:53:38
"(43.8, 52.1]",1258,01:17:13,01:05:10
"(52.1, 60.4]",8822,01:16:36,01:05:37
"(60.4, 68.7]",38891,01:12:48,01:01:52
"(68.7, 77.0]",125134,01:20:52,01:09:16
"(77.0, 85.3]",333079,01:31:13,01:20:17
"(85.3, 93.6]",438299,01:23:16,01:14:58
"(93.6, 101.9]",328520,01:23:46,01:15:49
"(101.9, 110.2]",53159,01:27:50,01:19:18
"(110.2, 118.5]",5426,01:29:28,01:21:21



full_temp_median_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_median_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-1.586, 7.135]",182,01:15:22,01:06:57
"(7.135, 15.77]",2626,01:12:29,00:58:22
"(15.77, 24.405]",22637,01:07:04,00:55:44
"(24.405, 33.04]",105210,01:09:27,00:59:07
"(33.04, 41.675]",180036,01:27:14,01:16:15
"(41.675, 50.31]",277045,01:33:08,01:23:04
"(50.31, 58.945]",317788,01:22:51,01:13:38
"(58.945, 67.58]",333313,01:22:40,01:16:23
"(67.58, 76.215]",87760,01:35:06,01:23:49
"(76.215, 84.85]",6611,01:23:23,01:12:59



full_temp_median_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_temp_median_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(15.758, 25.065]",301,01:21:54,01:15:13
"(25.065, 34.28]",6136,01:09:54,00:56:33
"(34.28, 43.495]",49121,01:05:57,00:55:52
"(43.495, 52.71]",109244,01:12:41,01:01:11
"(52.71, 61.925]",203285,01:28:54,01:18:26
"(61.925, 71.14]",332741,01:29:38,01:19:48
"(71.14, 80.355]",417660,01:24:46,01:16:39
"(80.355, 89.57]",186695,01:25:36,01:17:42
"(89.57, 98.785]",24333,01:25:43,01:16:11
"(98.785, 108.0]",3692,01:26:19,01:14:53



peak_temp_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-37.018, -25.1]",263,01:19:54,01:09:06
"(-25.1, -13.3]",3606,01:26:15,01:12:57
"(-13.3, -1.5]",13358,01:31:22,01:16:48
"(-1.5, 10.3]",26831,01:23:01,01:10:07
"(10.3, 22.1]",86540,01:16:25,01:06:19
"(22.1, 33.9]",407079,01:19:52,01:10:24
"(33.9, 45.7]",487342,01:26:50,01:18:06
"(45.7, 57.5]",250658,01:27:56,01:20:00
"(57.5, 69.3]",39762,01:33:15,01:20:52
"(69.3, 81.1]",17769,02:12:13,01:56:38



peak_temp_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(25.107, 34.45]",149,01:07:57,01:02:40
"(34.45, 43.7]",3482,01:22:32,01:11:10
"(43.7, 52.95]",11650,01:30:17,01:17:24
"(52.95, 62.2]",39835,01:24:42,01:12:36
"(62.2, 71.45]",136364,01:19:29,01:07:35
"(71.45, 80.7]",466244,01:24:12,01:15:00
"(80.7, 89.95]",491344,01:25:33,01:17:19
"(89.95, 99.2]",156878,01:28:27,01:18:18
"(99.2, 108.45]",24743,01:34:02,01:24:17
"(108.45, 117.7]",2519,01:42:40,01:34:38



peak_temp_median_min effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_median_min_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-5.544, 4.0]",715,01:26:12,01:15:02
"(4.0, 13.45]",4481,01:27:22,01:14:55
"(13.45, 22.9]",14927,01:24:06,01:12:34
"(22.9, 32.35]",58937,01:22:39,01:09:07
"(32.35, 41.8]",240118,01:18:24,01:07:48
"(41.8, 51.25]",474217,01:26:13,01:17:02
"(51.25, 60.7]",392398,01:25:00,01:17:24
"(60.7, 70.15]",107290,01:25:36,01:17:22
"(70.15, 79.6]",38976,01:51:49,01:35:53
"(79.6, 89.05]",1149,01:52:11,01:37:11



peak_temp_median_max effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_temp_median_max_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(10.099, 20.345]",473,01:17:42,01:08:25
"(20.345, 30.49]",5015,01:31:19,01:18:44
"(30.49, 40.635]",23838,01:27:50,01:15:19
"(40.635, 50.78]",70329,01:16:43,01:04:41
"(50.78, 60.925]",267517,01:17:55,01:07:10
"(60.925, 71.07]",565295,01:27:06,01:18:58
"(71.07, 81.215]",300169,01:26:56,01:17:11
"(81.215, 91.36]",84652,01:29:56,01:20:15
"(91.36, 101.505]",14746,01:36:32,01:25:12
"(101.505, 111.65]",1174,01:46:04,01:34:04




Analyzing precipitation effects on performance:

full_overall_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),3032,01:22:32,01:13:28
Light (0.1-1),3704,01:28:49,01:21:34
Moderate (1-5),20614,01:36:09,01:28:32
Heavy (>5),269544,01:35:02,01:24:15



full_overall_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),3581,01:22:08,01:13:41
Light (0.1-1),9040,01:35:25,01:27:47
Moderate (1-5),49900,01:37:17,01:29:12
Heavy (>5),1270687,01:24:27,01:15:03



full_overall_weekend_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
full_overall_weekend_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),29188,01:30:36,01:23:11
Light (0.1-1),43995,01:34:57,01:25:57
Moderate (1-5),252242,01:33:14,01:23:08
Heavy (>5),1007783,01:22:20,01:13:18



peak_overall_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),38054,01:37:50,01:29:44
Light (0.1-1),28301,01:32:19,01:25:15
Moderate (1-5),45761,01:29:22,01:20:03
Heavy (>5),861050,01:25:47,01:16:18



peak_overall_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),46331,01:36:35,01:28:41
Light (0.1-1),47486,01:36:07,01:27:53
Moderate (1-5),167347,01:31:44,01:21:38
Heavy (>5),1072044,01:22:57,01:13:41



peak_overall_weekend_days_of_precip effect on performance:


Unnamed: 0_level_0,count,mean_time,median_time
peak_overall_weekend_days_of_precip_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry (<0.1),155616,01:30:34,01:21:53
Light (0.1-1),184408,01:28:21,01:19:04
Moderate (1-5),865699,01:23:42,01:14:26
Heavy (>5),127485,01:22:11,01:11:37


3. Are there differences in how training weather effect performance based on location, age, or sex?  Are there places that are "hardier" (i.e. training weather has less of an effect)?  Are older people more or less able to manage bad weather in training?  Etc.

In [11]:
# Question 3: Differences in weather effects by location, age, and sex

# Check for demographic columns
demo_cols = [col for col in df.columns if any(x in col.lower() for x in ['age', 'sex', 'gender'])]
print("Available demographic columns:")
print(demo_cols)
print("="*80)

weather_cols = [col for col in df.columns if 'weather' in col.lower() or 'temp' in col.lower() or 'precip' in col.lower()]

if len(weather_cols) > 0 and len(demo_cols) > 0:
    
    df_interact = df.copy()
    df_interact['time_seconds'] = df_interact['time'].apply(time_to_seconds)
    df_interact = df_interact[df_interact['time_seconds'].notna()]
    
    # Analyze by sex/gender if available
    sex_cols = [col for col in demo_cols if 'sex' in col.lower() or 'gender' in col.lower()]
    if len(sex_cols) > 0:
        sex_col = sex_cols[0]
        print(f"\n\nAnalyzing weather effects by {sex_col}:")
        print("="*80)
        
        # For a key weather variable, see if effects differ by sex
        if len(weather_cols) > 0:
            # Use first temperature column if available
            temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
            if len(temp_cols) > 0:
                temp_col = temp_cols[0]
                
                # Create temperature categories
                df_interact[f'{temp_col}_category'] = pd.cut(
                    df_interact[temp_col],
                    bins=3,
                    labels=['Cold', 'Moderate', 'Hot']
                )
                
                print(f"\nPerformance by {temp_col} and {sex_col}:")
                sex_temp_performance = df_interact.groupby([sex_col, f'{temp_col}_category'])['time_seconds'].agg(['mean', 'count'])
                sex_temp_performance['mean_time'] = sex_temp_performance['mean'].apply(seconds_to_time)
                display(sex_temp_performance[['count', 'mean_time']])
    
    # Analyze by age if available
    age_cols = [col for col in demo_cols if 'age' in col.lower()]
    if len(age_cols) > 0:
        age_col = age_cols[0]
        print(f"\n\nAnalyzing weather effects by {age_col}:")
        print("="*80)
        
        # Create age categories
        df_interact['age_category'] = pd.cut(
            df_interact[age_col],
            bins=[0, 30, 40, 50, 60, 100],
            labels=['Under 30', '30-39', '40-49', '50-59', '60+']
        )
        
        if len(weather_cols) > 0:
            temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
            if len(temp_cols) > 0:
                temp_col = temp_cols[0]
                
                df_interact[f'{temp_col}_category'] = pd.cut(
                    df_interact[temp_col],
                    bins=3,
                    labels=['Cold', 'Moderate', 'Hot']
                )
                
                print(f"\nPerformance by {temp_col} and age category:")
                age_temp_performance = df_interact.groupby(['age_category', f'{temp_col}_category'])['time_seconds'].agg(['mean', 'count'])
                age_temp_performance['mean_time'] = age_temp_performance['mean'].apply(seconds_to_time)
                display(age_temp_performance[['count', 'mean_time']])
    
    # Analyze "hardiness" by location
    print("\n\nAnalyzing location 'hardiness' (cities where weather has less effect):")
    print("="*80)
    
    # For major cities, calculate how much performance varies with weather
    major_cities = df_interact.groupby(['city', 'state']).size()
    major_cities = major_cities[major_cities >= 5000].index
    
    if len(weather_cols) > 0:
        temp_cols = [col for col in weather_cols if 'temp' in col.lower() and 'median' in col.lower()]
        if len(temp_cols) > 0:
            temp_col = temp_cols[0]
            
            # For each major city, calculate correlation between weather and performance
            city_weather_sensitivity = {}
            
            for city, state in major_cities:
                city_df = df_interact[(df_interact['city'] == city) & (df_interact['state'] == state)]
                if len(city_df) > 100 and city_df[temp_col].notna().sum() > 100:
                    corr = city_df[[temp_col, 'time_seconds']].corr().iloc[0, 1]
                    city_weather_sensitivity[(city, state)] = {
                        'correlation': corr,
                        'count': len(city_df)
                    }
            
            # Convert to dataframe
            sensitivity_df = pd.DataFrame([
                {'city': city, 'state': state, 'temp_correlation': data['correlation'], 'count': data['count']}
                for (city, state), data in city_weather_sensitivity.items()
            ])
            
            # Sort by correlation (lower = more hardy, less affected by temperature)
            sensitivity_df = sensitivity_df.sort_values('temp_correlation')
            
            print(f"\nMost 'hardy' cities (least affected by {temp_col}):")
            print("(Lower correlation = better performance despite temperature variation)")
            display(sensitivity_df.head(15))
            
            print(f"\nLeast 'hardy' cities (most affected by {temp_col}):")
            print("(Higher correlation = worse performance with temperature variation)")
            display(sensitivity_df.tail(15))
            
else:
    print("\nInsufficient weather or demographic data for interaction analysis.")

Available demographic columns:
['age', 'sex']


Analyzing weather effects by sex:

Performance by full_temp_median_min and sex:


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_time
sex,full_temp_median_min_category,Unnamed: 2_level_1,Unnamed: 3_level_1
F,Cold,22422,01:06:55
F,Moderate,324338,01:28:40
F,Hot,233422,01:25:16
M,Cold,28212,01:07:55
M,Moderate,431571,01:24:34
M,Hot,293243,01:24:24




Analyzing weather effects by age:

Performance by full_temp_median_min and age category:


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean_time
age_category,full_temp_median_min_category,Unnamed: 2_level_1,Unnamed: 3_level_1
Under 30,Cold,18383,01:08:33
Under 30,Moderate,250744,01:28:27
Under 30,Hot,180028,01:24:11
30-39,Cold,16258,01:08:05
30-39,Moderate,251323,01:26:44
30-39,Hot,176282,01:26:14
40-49,Cold,9891,01:04:39
40-49,Moderate,159534,01:22:24
40-49,Hot,107058,01:22:50
50-59,Cold,4741,01:08:41




Analyzing location 'hardiness' (cities where weather has less effect):

Most 'hardy' cities (least affected by full_temp_median_min):
(Lower correlation = better performance despite temperature variation)


Unnamed: 0,city,state,temp_correlation,count
0,albuquerque,nm,-0.238047,10119
44,riverside,ca,-0.180637,6738
37,pasadena,ca,-0.163992,9135
26,los angeles,ca,-0.149695,95566
50,santa monica,ca,-0.134256,11859
25,long beach,ca,-0.11565,15930
23,irvine,ca,-0.115591,11203
40,pittsburgh,pa,-0.089519,25531
24,jacksonville,fl,-0.068255,11268
32,nashville,tn,-0.063762,11128



Least 'hardy' cities (most affected by full_temp_median_min):
(Higher correlation = worse performance with temperature variation)


Unnamed: 0,city,state,temp_correlation,count
28,madison,wi,0.066606,11276
2,anchorage,ak,0.066868,5129
3,ann arbor,mi,0.067438,8431
18,denver,co,0.089699,25948
1,alexandria,va,0.097264,22841
31,minneapolis,mn,0.102363,36339
15,colorado springs,co,0.105873,12125
58,washington,dc,0.121862,43077
19,fairfax,va,0.122208,7497
4,arlington,va,0.126541,29256
