In [1]:
import pandas as pd
import os

# Path of the CSV that the notebook profiles.
# NOTE(review): the banner text names "race_records_final.csv" while this path
# ends in "data.csv" -- confirm which label is intended.
data_file = "/Users/thatcher/dev/analysis/projects/marathon_results/data/race_final/global/data.csv"

# Title banner: the heading framed by an 80-character rule above and below.
rule = "=" * 80
print(rule, "DATA PROFILE: race_records_final.csv", rule, sep="\n")

DATA PROFILE: race_records_final.csv


## Load Data

In [2]:
# Read the CSV file
print("\nLoading data...")

# The six text columns are pinned to string dtype so the parser does not infer
# a type chunk by chunk; `age` is left to the parser's numeric inference.
# NOTE(review): the recorded output shows pandas echoing the bare
# `pd.read_csv(data_file)` call as the source of a warning whose message was
# not preserved. Mixed-type inference is the presumed cause -- confirm on re-run.
TEXT_COLUMNS = ["sex", "time", "race", "date", "city", "state"]
df = pd.read_csv(data_file, dtype={name: str for name in TEXT_COLUMNS})

print("\n### OVERALL STATISTICS ###")
print(f"Total rows: {len(df):,}")
print(f"Total columns: {len(df.columns)}")
# deep=True asks pandas to measure the string payloads themselves, not only
# the per-element references, so the figure reflects real memory consumption.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")


Loading data...


  df = pd.read_csv(data_file)



### OVERALL STATISTICS ###
Total rows: 10,478,198
Total columns: 7
Memory usage: 1015.45 MB


## Column Information

In [3]:
# Per-column overview: dtype, number of missing values, and their share of all rows.
print("\n### COLUMN INFORMATION ###")
row_total = len(df)
missing_by_column = df.isnull().sum()  # one null count per column, computed in a single pass
for name in df.columns:
    column_dtype = df[name].dtype
    n_missing = missing_by_column.loc[name]
    share_missing = (n_missing / row_total) * 100
    print(f"  {name:15} - {str(column_dtype):15} - {n_missing:8,} nulls ({share_missing:5.2f}%)")

# Compact dtype listing as pandas renders it.
print("\n### DATA TYPE SUMMARY ###")
print(df.dtypes)


### COLUMN INFORMATION ###
  age             - float64         - 2,532,160 nulls (24.17%)
  sex             - str             -      535 nulls ( 0.01%)
  time            - str             -  965,994 nulls ( 9.22%)
  race            - str             -        0 nulls ( 0.00%)
  date            - str             -        0 nulls ( 0.00%)
  city            - str             - 2,907,950 nulls (27.75%)
  state           - str             - 3,068,934 nulls (29.29%)

### DATA TYPE SUMMARY ###
age      float64
sex          str
time         str
race         str
date         str
city         str
state        str
dtype: object


## Statistical Summary

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default arguments.
print("\n### NUMERIC COLUMNS STATISTICS ###")
numeric_summary = df.describe()
print(numeric_summary)


### NUMERIC COLUMNS STATISTICS ###
                age
count  7.946038e+06
mean   3.935986e+01
std    1.146695e+01
min    0.000000e+00
25%    3.100000e+01
50%    3.900000e+01
75%    4.700000e+01
max    3.225000e+03


## Sample Records

In [5]:
# Show the first ten rows in full; to_string() avoids pandas' column truncation.
print("\n### SAMPLE RECORDS (first 10) ###")
first_rows = df.head(10)
print(first_rows.to_string())


### SAMPLE RECORDS (first 10) ###
    age sex     time               race      date             city state
0  50.0   M  3:01:39  dog_lake_marathon  10_15_16  Apache Junction    AZ
1  39.0   M  3:08:34  dog_lake_marathon  10_15_16        Kennewick    WA
2  27.0   M  3:16:29  dog_lake_marathon  10_15_16           Yakima    WA
3  33.0   M  3:38:40  dog_lake_marathon  10_15_16             Mesa    AZ
4  26.0   F  3:39:01  dog_lake_marathon  10_15_16       Costa Mesa    CA
5  36.0   M  3:39:01  dog_lake_marathon  10_15_16          Clayton    CA
6  40.0   F  3:44:06  dog_lake_marathon  10_15_16      Lake Oswego    OR
7  44.0   F  3:46:19  dog_lake_marathon  10_15_16          Seattle    WA
8  52.0   M  3:50:42  dog_lake_marathon  10_15_16        Vancouver    WA
9  48.0   F  3:53:19  dog_lake_marathon  10_15_16              Bow    WA


## City & State Analysis

In [6]:
print("\n### CITY & STATE BREAKDOWN ###")

# Distinct-value counts for each location column on its own.
n_cities = df['city'].nunique()
print(f"\nTotal unique cities: {n_cities:,}")
n_states = df['state'].nunique()
print(f"Total unique states: {n_states:,}")

# Distinct (city, state) pairs: one group per pair, so the number of groups is
# the number of combinations.
pair_sizes = df.groupby(['city', 'state']).size()
print(f"Total unique city-state combinations: {len(pair_sizes):,}")


### CITY & STATE BREAKDOWN ###

Total unique cities: 143,733
Total unique states: 8,504
Total unique city-state combinations: 218,662


## Top 20 Cities by Runner Count

In [24]:
# Create city-state summary
def safe_lower(val):
    """Return ``val`` lower-cased if it is a string; otherwise return it unchanged.

    Non-string values (for example ``None`` or a float NaN standing in for a
    missing cell) pass through untouched, so the function can be applied
    element-wise to a column that contains gaps.

    Args:
        val: Any cell value.

    Returns:
        ``val.lower()`` when ``val`` is a ``str`` (or a ``str`` subclass),
        otherwise ``val`` itself.
    """
    # isinstance, unlike an exact type() comparison, also accepts str subclasses.
    if isinstance(val, str):
        return val.lower()
    return val

# Lower-case both location columns so spellings that differ only by case
# (e.g. "Japan" / "JAPAN") fall into the same group.
# NOTE(review): this overwrites df['city'] and df['state'] in place, so any
# cell that groups on them afterwards sees lower-case keys. This cell carries
# execution count 24 while its neighbours are 6 and 8, and the recorded state
# listing further down still shows upper-case codes -- re-run top to bottom.
for key_column in ('city', 'state'):
    df[key_column] = df[key_column].apply(safe_lower)

# One row per (city, state) pair, largest runner count first.
city_state_summary = (
    df.groupby(['city', 'state'])
    .size()
    .reset_index(name='runner_count')
    .sort_values('runner_count', ascending=False)
)

# Running total down the ranking, and that total as a percentage of all
# runners that have both a city and a state.
total_runners = city_state_summary['runner_count'].sum()
running_total = city_state_summary['runner_count'].cumsum()
city_state_summary['cumulative_runners'] = running_total
city_state_summary['running_percentage'] = (
    city_state_summary['cumulative_runners'] / total_runners
) * 100

print("\n### TOP 20 CITIES BY RUNNER COUNT ###")
print(f"{'City':<30} {'State':<6} {'Runners':>12} {'Running %':>12}")
print("-" * 62)
for entry in city_state_summary.head(20).itertuples(index=False):
    print(f"{entry.city:<30} {entry.state:<6} {entry.runner_count:>12,} {entry.running_percentage:>11.2f}%")
print("=========")

# Export the leading pairs whose cumulative share is still below 80.1 %.
below_cutoff = city_state_summary['running_percentage'] < 80.1
city_state_summary.loc[below_cutoff, ['city', 'state', 'runner_count']].to_csv(
    "/Users/thatcher/dev/analysis/projects/marathon_results/data/top_training_cities_80pct.csv",
    index=False,
)


### TOP 20 CITIES BY RUNNER COUNT ###
City                           State       Runners    Running %
--------------------------------------------------------------
los angeles                    ca          121,020        1.63%
chicago                        il          120,839        3.26%
new york                       ny           96,250        4.56%
houston                        tx           59,373        5.36%
san diego                      ca           57,024        6.13%
san francisco                  ca           56,570        6.90%
honolulu                       hi           55,192        7.64%
washington                     dc           48,946        8.30%
austin                         tx           48,759        8.96%
portland                       or           45,243        9.57%
seattle                        wa           40,735       10.12%
minneapolis                    mn           40,340       10.67%
brooklyn                       ny           39,933       11.21%
de

## Runners by State

In [8]:
# State summary: one row per distinct `state` value, most runners first.
state_summary = (
    df.groupby('state')
    .size()
    .reset_index(name='runner_count')
    .sort_values('runner_count', ascending=False)
)

print("\n### RUNNERS BY STATE ###")
print(f"{'State':<10} {'Runners':>15}")
print("-" * 26)
# Prints one line for every distinct state value, not just the leading ones.
for entry in state_summary.itertuples(index=False):
    print(f"{entry.state:<10} {entry.runner_count:>15,}")


### RUNNERS BY STATE ###
State              Runners
--------------------------
CA               1,130,474
TX                 368,938
NY                 358,328
IL                 343,168
FL                 276,982
VA                 267,714
PA                 264,795
OH                 253,620
MN                 239,336
MA                 229,069
MI                 189,214
UT                 174,192
WA                 172,362
NC                 170,739
CO                 164,631
MD                 162,178
NJ                 145,904
GA                 132,924
OR                 126,630
WI                 125,649
HI                 109,258
AZ                 105,828
IN                  96,343
MO                  95,586
TN                  93,996
CT                  76,359
BC                  72,251
Japan               69,270
JAPAN               63,849
ON                  62,374
OK                  58,390
KY                  56,744
SC                  56,269
AL                  53,053
AB

## Save City-State Summary

In [9]:
# Persist the full city-state table (all columns, no index column) as CSV
# and report where it was written.
output_file = "/Users/thatcher/dev/analysis/projects/marathon_results/data/race_final/global/city_state_runner_counts.csv"
city_state_summary.to_csv(path_or_buf=output_file, index=False)
print(f"✓ City-state breakdown saved to: {output_file}")

✓ City-state breakdown saved to: /Users/thatcher/dev/analysis/projects/marathon_results/data/race_final/global/city_state_runner_counts.csv
