# Data Profile Analysis

This notebook analyzes and profiles files stored in the `temp_page_cache` and `race_results` directories.

## 1. Import Required Libraries

In [10]:
import pandas as pd
import numpy as np
import csv
import os
import json
from pathlib import Path
from collections import defaultdict
import warnings

# Disable PyArrow string type to avoid extension type conflicts
pd.options.mode.string_storage = "python"
warnings.filterwarnings("ignore", category=UserWarning)

## 2. Load Data from temp_page_cache

In [2]:
temp_cache_path = "/Users/thatcher/dev/analysis/projects/marathon_results/temp_page_cache"

# Explore temp_page_cache directory structure
if os.path.exists(temp_cache_path):
    print(f"temp_page_cache directory exists")
    print(f"Total items: {len(os.listdir(temp_cache_path))}\n")
    
    # List first 20 items
    items = os.listdir(temp_cache_path)[:20]
    for item in items:
        item_path = os.path.join(temp_cache_path, item)
        if os.path.isdir(item_path):
            print(f"[DIR]  {item} - {len(os.listdir(item_path))} items")
        else:
            size = os.path.getsize(item_path)
            print(f"[FILE] {item} - {size:,} bytes")
else:
    print("temp_page_cache directory not found")

temp_page_cache directory exists
Total items: 1851

[DIR]  earth_day_challenge_marathon - 4 items
[DIR]  black_bear_marathon - 3 items
[DIR]  jackson_river_scenic_trail_marathon - 4 items
[DIR]  indian - 1 items
[DIR]  az_legs_for_literacy_marathon - 1 items
[DIR]  steamboat_marathon - 23 items
[DIR]  mooathon - 7 items
[DIR]  olympics_2000_(women) - 1 items
[DIR]  rutland_marathon - 4 items
[DIR]  rust_buster_marathon - 1 items
[DIR]  nesbit_park_trails_classic - 1 items
[DIR]  belfast_city_marathon - 19 items
[DIR]  marathon-to-marathon - 18 items
[DIR]  the_instant_classic_marathon_trail_run - 7 items
[DIR]  beaver_canyon_marathon,_half_marathon_and_10k - 2 items
[DIR]  zoom!_yah!_yah!_indoor_marathon - 15 items
[DIR]  naked_prussian - 3 items
[DIR]  resurgo_marathon - 1 items
[DIR]  mountain_marathon - 9 items
[DIR]  mid-atlantic_cross_country_challenge - 1 items


## 3. Load Data from race_results

In [3]:
race_results_path = "/Users/thatcher/dev/analysis/projects/marathon_results/race_results"

# Explore race_results directory structure
if os.path.exists(race_results_path):
    print(f"race_results directory exists")
    items = os.listdir(race_results_path)
    print(f"Total items: {len(items)}\n")
    
    # Analyze file types and sizes
    file_types = defaultdict(list)
    total_size = 0
    
    for item in items:
        item_path = os.path.join(race_results_path, item)
        if os.path.isfile(item_path):
            ext = os.path.splitext(item)[1] or "no_extension"
            size = os.path.getsize(item_path)
            file_types[ext].append((item, size))
            total_size += size
    
    print(f"Total size: {total_size:,} bytes ({total_size / (1024**2):.2f} MB)\n")
    print("File types:")
    for ext, files in sorted(file_types.items()):
        print(f"  {ext}: {len(files)} files")
    
    # Show sample files
    print("\nSample files:")
    for item in sorted(items)[:10]:
        item_path = os.path.join(race_results_path, item)
        if os.path.isfile(item_path):
            size = os.path.getsize(item_path)
            print(f"  {item} - {size:,} bytes")
else:
    print("race_results directory not found")

race_results directory exists
Total items: 1836

Total size: 420,714,492 bytes (401.22 MB)

File types:
  .parquet: 1836 files

Sample files:
  "last_chance_for_boston"_marathon.parquet - 52,370 bytes
  "running_for_the_bay!"_marathon.parquet - 51,394 bytes
  11-11-11_marathon_of_sarasota.parquet - 10,437 bytes
  2004_usa_olympic_team_trials_-_men's_marathon.parquet - 10,253 bytes
  2004_usa_olympic_team_trials_-_women's_marathon.parquet - 12,478 bytes
  2008_u.s._olympic_team_trials_-_men's_marathon.parquet - 12,305 bytes
  2012_u.s._olympic_team_trials_-_men's_marathon.parquet - 11,288 bytes
  2012_u.s._olympic_team_trials_-_women's_marathon.parquet - 14,866 bytes
  2014_commonwealth_games_marathon_-_men.parquet - 7,004 bytes
  2014_commonwealth_games_marathon_-_women.parquet - 6,831 bytes


## 4. Profile race_results Data

In [4]:
# Load and analyze parquet files from race_results
parquet_files = [f for f in os.listdir(race_results_path) if f.endswith('.parquet')]

print(f"Total parquet files: {len(parquet_files)}\n")

# Profile first few files
profile_data = []
for i, filename in enumerate(sorted(parquet_files)[:5]):
    try:
        filepath = os.path.join(race_results_path, filename)
        df = pd.read_parquet(filepath)
        
        profile_info = {
            'filename': filename,
            'rows': df.shape[0],
            'columns': df.shape[1],
            'column_names': list(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / (1024**2),
            'missing_values': df.isnull().sum().to_dict()
        }
        profile_data.append(profile_info)
        
        print(f"File: {filename}")
        print(f"  Shape: {df.shape[0]} rows × {df.shape[1]} columns")
        print(f"  Memory usage: {profile_info['memory_usage_mb']:.2f} MB")
        print(f"  Columns: {', '.join(df.columns[:5])}{'...' if df.shape[1] > 5 else ''}")
        print()
    except Exception as e:
        print(f"Error reading {filename}: {str(e)}\n")

Total parquet files: 1836

File: "last_chance_for_boston"_marathon.parquet
  Shape: 1402 rows × 13 columns
  Memory usage: 0.93 MB
  Columns: Last Name, First Name (Sex/Age), Time, OverAllPlace, Sex Place/Div Place, DIV...

File: "running_for_the_bay!"_marathon.parquet
  Shape: 1279 rows × 12 columns
  Memory usage: 0.82 MB
  Columns: Last Name, First Name (Sex/Age), Time, OverAllPlace, Sex Place/Div Place, DIV...

File: 11-11-11_marathon_of_sarasota.parquet
  Shape: 66 rows × 9 columns
  Memory usage: 0.03 MB
  Columns: Last Name, First Name (Sex/Age), Time, OverAllPlace, Sex Place, City, State, Country...

File: 2004_usa_olympic_team_trials_-_men's_marathon.parquet
  Shape: 71 rows × 9 columns
  Memory usage: 0.04 MB
  Columns: Last Name, First Name (Sex/Age), Time, OverAllPlace, Sex Place, City, State, Country...

File: 2004_usa_olympic_team_trials_-_women's_marathon.parquet
  Shape: 200 rows × 9 columns
  Memory usage: 0.10 MB
  Columns: Last Name, First Name (Sex/Age), Time, OverA

## 5. Generate Summary Statistics

In [5]:
# Generate comprehensive summary statistics
print("=" * 60)
print("RACE RESULTS SUMMARY")
print("=" * 60)

# Overall statistics
total_parquet_files = len(parquet_files)
total_races = len(os.listdir(race_results_path))

print(f"\nDirectory Statistics:")
print(f"  Total items: {total_races}")
print(f"  Parquet files: {total_parquet_files}")

# Load all files and aggregate statistics
column_concordance = defaultdict(int)
total_rows = 0
total_cols = 0

for race_file in parquet_files[:50]:  # Sample first 50 files
    try:
        filepath = os.path.join(race_results_path, race_file)
        df = pd.read_parquet(filepath)
        
        total_rows += df.shape[0]
        total_cols += df.shape[1]
        
        for col in df.columns:
            column_concordance[col] += 1
    except Exception as e:
        pass

print(f"\nAggregated Statistics (from sample):")
print(f"  Total rows analyzed: {total_rows:,}")
print(f"  Average columns per file: {total_cols / max(1, len(parquet_files[:50])):.1f}")

print(f"\nMost Common Columns:")
sorted_cols = sorted(column_concordance.items(), key=lambda x: x[1], reverse=True)
for col, count in sorted_cols[:10]:
    print(f"  {col}: {count} files")

RACE RESULTS SUMMARY

Directory Statistics:
  Total items: 1836
  Parquet files: 1836

Aggregated Statistics (from sample):
  Total rows analyzed: 567,778
  Average columns per file: 10.6

Most Common Columns:
  Last Name, First Name (Sex/Age): 50 files
  OverAllPlace: 50 files
  date: 50 files
  race: 50 files
  Time: 49 files
  BQ*: 44 files
  AG Time*: 43 files
  Sex Place/Div Place: 39 files
  DIV: 39 files
  City, State, Country: 35 files

Aggregated Statistics (from sample):
  Total rows analyzed: 567,778
  Average columns per file: 10.6

Most Common Columns:
  Last Name, First Name (Sex/Age): 50 files
  OverAllPlace: 50 files
  date: 50 files
  race: 50 files
  Time: 49 files
  BQ*: 44 files
  AG Time*: 43 files
  Sex Place/Div Place: 39 files
  DIV: 39 files
  City, State, Country: 35 files


# Approach

Data Manipulation
-----------------
The end goal is to get a feature set of run descriptor (age, sex), home (city, state), outcome (time), event descriptors (race, date), and weather conditioners (total rainfall, weekend rainfall, last month rainfall, temperature, etc.).

Along the way rows (and races) will have to be filtered.  I would like to keep a record of this in some way, to examine the results for bias.  As a goal, each operation that results in a filtering should output metadata on what is filtered (number of rows, reason, from what file/race/record).

Steps
-----
1) Iterate through race files, determine for each how to extract (age, sex, time, race, date, city, state)
2) Determine mapping of city, state -> OrderedList(weather stations) to use for weather data
3) Featurize weather data to (weather station) -> List(weather conditioners)
4) Join the data together to get starting data frame


Research
--------
Once we have the data manipulated into a form that is able to be researched, the goal is to explore a number of questions related to how, for a given runner, the weather in their "home town" before a race affects their race results.  And then some extra details around that, like "toughest" town, race with toughest runners, difference in age/sex on whether people are affected by weather

## 1) Iterate through race files, determine for each how to extract (age, sex, time, race, date, city, state)

In [None]:
processed_race_files = set()

race_records_home = "/Users/thatcher/dev/analysis/projects/marathon_results/data/race_records"
race_records_skipped = "/Users/thatcher/dev/analysis/projects/marathon_results/data/race_records_skipped.csv"
## Helper Functions
def extract_age(name_string):
    """
    Extract age from formats like:
      - "Smith, John (M/45)" -> age = 45
      - "Smith, John (M45)" -> age = 45
    Returns the age as integer, or None if not found
    """
    try:
        if pd.isna(name_string):
            return None
        
        name_str = str(name_string)
        
        # Find content within parentheses
        if '(' not in name_str or ')' not in name_str:
            return None
        
        # Extract content between ( and )
        paren_content = name_str[name_str.find('(') + 1 : name_str.find(')')]
        
        # Handle both formats: "M/45" and "M45"
        if '/' in paren_content:
            # Format: (Sex/Age)
            age_part = paren_content.split('/')[-1].strip()
        else:
            # Format: (SexAge) - extract just the numeric part
            age_part = ''.join(c for c in paren_content if c.isdigit())
        
        if age_part:
            return int(age_part)
            
    except (ValueError, AttributeError, IndexError):
        return None
    return None
        
def extract_sex(name_string):
    """
    Extract sex from formats like:
      - "Smith, John (M/45)" -> sex = M
      - "Smith, John (M45)" -> sex = M
    Returns the sex as string, or None if not found
    """
    try:
        if pd.isna(name_string):
            return None
        
        name_str = str(name_string)
        
        # Find content within parentheses
        if '(' not in name_str or ')' not in name_str:
            return None
        
        # Extract content between ( and )
        paren_content = name_str[name_str.find('(') + 1 : name_str.find(')')]
        
        # Handle both formats: "M/45" and "M45"
        if '/' in paren_content:
            # Format: (Sex/Age)
            sex_part = paren_content.split('/')[0].strip()
        else:
            # Format: (SexAge) - extract just the non-numeric part
            sex_part = ''.join(c for c in paren_content if not c.isdigit()).strip()
        
        if sex_part:
            return sex_part
            
    except (ValueError, AttributeError, IndexError):
        return None
    return None

def extract_city(location_string):
    """
    Extract city from format like: "City, State, Country"
    Returns the city as string, or None if not found
    """
    try:
        if pd.isna(location_string):
            return None
        return location_string.split(",")[0].strip()
    except (ValueError, AttributeError, IndexError):
        return None
    return None

def extract_state(location_string):
    """
    Extract state from format like: "City, State, Country"
    Returns the state as string, or None if not found
    """
    try:
        if pd.isna(location_string):
            return None
        return location_string.split(",")[1].strip()
    except (ValueError, AttributeError, IndexError):
        return None
    return None

skipped_races = set()
with open(race_records_skipped,'r') as handle:
    reader = csv.DictReader(handle)
    skipped_races = set([r["race_file"] for r in reader])

for race_file in parquet_files:
    if race_file in skipped_races:
        print(f"Skipping {race_file}, reason in {race_records_skipped}")
        continue 

    output_path = os.path.join(race_records_home, race_file)
    if os.path.exists(output_path):
        print(f"Skipping {race_file}, already exists in output")
        continue

    filepath = os.path.join(race_results_path, race_file)
    df = pd.read_parquet(filepath)
    
    record_column_vectors = {
        "age":None,
        "sex":None,
        "time":None,
        "race":None,
        "date":None,
        "city":None,
        "state":None
    }
    
    # Extract age from 'Last Name, First Name (Sex/Age)' column
    if 'Last Name, First Name (Sex/Age)' in df.columns:
        record_column_vectors["age"] = df['Last Name, First Name (Sex/Age)'].apply(extract_age)
        record_column_vectors["sex"] = df['Last Name, First Name (Sex/Age)'].apply(extract_sex)
    
    if "Time" in df.columns:
        record_column_vectors["time"] = df['Time']
        # todo: ensure that time is reported in a standard way
    elif "Net Time" in df.columns:
        record_column_vectors["time"] = df['Net Time']
    
    if "race" in df.columns:
        record_column_vectors["race"] = df['race']
    if "date" in df.columns:
        record_column_vectors["date"] = df['date']
    
    if 'City, State, Country' in df.columns:
        record_column_vectors["city"] = df['City, State, Country'].apply(extract_city)
        record_column_vectors["state"] = df['City, State, Country'].apply(extract_state)
    elif 'City, State' in df.columns:
        record_column_vectors["city"] = df['City, State'].apply(extract_city)
        record_column_vectors["state"] = df['City, State'].apply(extract_state)

    unmappable_attributes = set([k for k,v in record_column_vectors.items() if v is None])
    if len(unmappable_attributes) > 0:
        error = f"missing attributs: {unmappable_attributes}, available columns: {df.columns}"
        print(f"Unable to process file: {race_file}")
        print(f" {error}")
        with open(race_records_skipped,'a') as w_handle:
            writer = csv.writer(w_handle)
            writer.writerow([race_file, len(df), error])
        
        if len(df) > 100000:
            raise Exception("Check, missed big race")
    
    # Create DataFrame from column vectors
    record_frame = pd.DataFrame(record_column_vectors,
                                columns = record_column_vectors.keys())
    record_frame.to_parquet(output_path)
    
    print(f"Processed: {race_file}")
    print(f"  Records: {len(record_frame)}")
    print(f"  Shape: {record_frame.shape}")
    print(f"  Sample:\n{record_frame.head(3)}\n")

Processed: dog_lake_marathon.parquet
  Records: 189
  Shape: (189, 7)
  Sample:
    age sex     time               race      date             city state
0  50.0   M  3:01:39  dog_lake_marathon  10_15_16  Apache Junction    AZ
1  39.0   M  3:08:34  dog_lake_marathon  10_15_16        Kennewick    WA
2  27.0   M  3:16:29  dog_lake_marathon  10_15_16           Yakima    WA

Skipping georgina_spring_fling_marathon.parquet, reason in /Users/thatcher/dev/analysis/projects/marathon_results/race_records_skipped.csv
Processed: pacific_crest_marathon.parquet
  Records: 2903
  Shape: (2903, 7)
  Sample:
    age sex     time                    race     date   city state
0  27.0   M  2:37:45  pacific_crest_marathon  6_29_02   Bend    OR
1  33.0   M  2:55:30  pacific_crest_marathon  6_29_02   Bend    OR
2  37.0   M  2:59:20  pacific_crest_marathon  6_29_02  Salem    OR

Processed: nebraska_marathon.parquet
  Records: 558
  Shape: (558, 7)
  Sample:
    age sex     time               race      date   

## Scrap analysis of big marathon (Boston) to smoke test whether processing worked

In [44]:
# Load and profile boston_marathon.parquet from race_records

boston_file = os.path.join(race_records_home, "boston_marathon.parquet")

if os.path.exists(boston_file):
    print("=" * 70)
    print("BOSTON MARATHON PROFILE")
    print("=" * 70)
    
    boston_df = pd.read_parquet(boston_file)
    
    # Basic information
    print(f"\nBasic Information:")
    print(f"  Shape: {boston_df.shape[0]} rows × {boston_df.shape[1]} columns")
    print(f"  Memory usage: {boston_df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    
    # Column overview
    print(f"\nColumns and Data Types:")
    for col in boston_df.columns:
        dtype = boston_df[col].dtype
        null_count = boston_df[col].isnull().sum()
        null_pct = (null_count / len(boston_df)) * 100
        print(f"  {col:15} - {str(dtype):15} - {null_count:6} nulls ({null_pct:5.2f}%)")
    
    # Statistical summary
    print(f"\nStatistical Summary:")
    print(boston_df.describe())
    
    # Missing values
    print(f"\nMissing Values:")
    missing = boston_df.isnull().sum()
    if missing.sum() > 0:
        print(missing[missing > 0])
    else:
        print("  No missing values")
    
    # Sample data
    print(f"\nSample Records (first 10):")
    print(boston_df.head(10).to_string())
    
    # Value distributions
    print(f"\nValue Distributions:")
    print(f"  Unique ages: {boston_df['age'].nunique()} - Range: {boston_df['age'].min()} to {boston_df['age'].max()}")
    print(f"  Unique sexes: {boston_df['sex'].nunique()} - Values: {boston_df['sex'].unique()}")
    print(f"  Unique cities: {boston_df['city'].nunique()}")
    print(f"  Unique states: {boston_df['state'].nunique()}")
    print(f"  Unique dates: {boston_df['date'].nunique()}")
    
    # Time analysis
    print(f"\nTime Statistics:")
    print(f"  Min time: {boston_df['time'].min()}")
    print(f"  Max time: {boston_df['time'].max()}")
    
else:
    print(f"File not found: {boston_file}")
    print(f"Available files in {race_records_home}:")
    if os.path.exists(race_records_home):
        files = [f for f in os.listdir(race_records_home) if f.endswith('.parquet')]
        for f in sorted(files)[:10]:
            print(f"  {f}")
    else:
        print(f"  Directory does not exist yet")

BOSTON MARATHON PROFILE

Basic Information:
  Shape: 454282 rows × 7 columns
  Memory usage: 144.14 MB

Columns and Data Types:
  age             - float64         -  66902 nulls (14.73%)
  sex             - str             -      0 nulls ( 0.00%)
  time            - str             -  72200 nulls (15.89%)
  race            - str             -      0 nulls ( 0.00%)
  date            - str             -      0 nulls ( 0.00%)
  city            - str             -  66800 nulls (14.70%)
  state           - str             -  66876 nulls (14.72%)

Statistical Summary:
                 age
count  387380.000000
mean       41.483791
std        10.910682
min        18.000000
25%        33.000000
50%        41.000000
75%        49.000000
max        87.000000

Missing Values:
age      66902
time     72200
city     66800
state    66876
dtype: int64

Sample Records (first 10):
    age sex     time             race     date        city state
0  47.0   M  4:19:54  boston_marathon  4_16_12    Palacios

### Filter all of the race records for files that have sufficient information for analysis, convert time to minutes

In [None]:
def time_to_minutes(time_str):
    """
    Convert time string from HH:MM:SS format to total minutes
    """
    try:
        if pd.isna(time_str):
            return None
        time_str = str(time_str).strip()
        parts = time_str.split(':')
        if len(parts) == 3:
            hours, minutes, seconds = int(parts[0]), int(parts[1]), float(parts[2])
            return hours * 60 + minutes + seconds / 60
        elif len(parts) == 2:
            minutes, seconds = int(parts[0]), float(parts[1])
            return minutes + seconds / 60
        else:
            return None
    except (ValueError, AttributeError, IndexError):
        return None

output_csv = "/Users/thatcher/dev/analysis/projects/marathon_results/data/race_records_final.csv"

# Initialize CSV with header on first write
first_write = True

for file in sorted(os.listdir(race_records_home)):
    if not file.endswith('.parquet'):
        continue
        
    full_path = os.path.join(race_records_home, file)
    print(f"Processing {file}...")
    df = pd.read_parquet(full_path)
    
    # Filter for records with both city and state
    candidate_records = df[(df['city'].notna()) & (df['state'].notna())]
    
    # Convert time column from HH:MM:SS to total minutes
    candidate_records = candidate_records.copy()
    candidate_records['time'] = candidate_records['time'].apply(time_to_minutes)
    
    # Incrementally write to CSV
    if len(candidate_records) > 0:
        candidate_records.to_csv(
            output_csv,
            mode='w' if first_write else 'a',
            header=first_write,
            index=False
        )
        print(f"  Written {len(candidate_records)} records")
        first_write = False
    else:
        print(f"  No records with complete city/state")

print(f"\nFinished! Results saved to {output_csv}")


Looking at dog_lake_marathon.parquet...
Looking at pacific_crest_marathon.parquet...
Looking at nebraska_marathon.parquet...
Looking at las_vegas_deja_vu_marathon.parquet...
Looking at bellingham_bay_marathon.parquet...
Looking at central_park_marathon.parquet...
Looking at california_wine_country_marathon.parquet...
Looking at sogonapmit_marathon.parquet...
Looking at rivanna_greenbelt_october__marathon.parquet...
Looking at bemidji_blue_ox_marathon.parquet...
Looking at alaska_series_-_day_3.parquet...
Looking at mount_desert_island_marathon.parquet...
Looking at missoula_marathon.parquet...
Looking at dust_bowl_series_marathon_-_kansas.parquet...
Looking at spring_chance_bq.2_marathon_-_sunday.parquet...
Looking at chosen:_marathon_for_adoption.parquet...
Looking at california_international_marathon.parquet...
Looking at haulin'_aspen_trail_marathon.parquet...
Looking at horse_capital_marathon.parquet...
Looking at disney_world_marathon.parquet...
Looking at new_england_series_-_ri.

In [None]:
full_records.to_csv("/Users/thatcher/dev/analysis/projects/marathon_results/data/race_records_final.csv")

## Profile boston_marathon.parquet from race_results (Before Processing)

In [36]:
# Load and profile boston_marathon.parquet from race_results (original)

boston_raw_file = os.path.join(race_results_path, "boston_marathon.parquet")

if os.path.exists(boston_raw_file):
    print("=" * 70)
    print("BOSTON MARATHON PROFILE (RAW DATA)")
    print("=" * 70)
    
    boston_raw_df = pd.read_parquet(boston_raw_file)
    
    # Basic information
    print(f"\nBasic Information:")
    print(f"  Shape: {boston_raw_df.shape[0]} rows × {boston_raw_df.shape[1]} columns")
    print(f"  Memory usage: {boston_raw_df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
    
    # Column overview
    print(f"\nColumns:")
    for col in boston_raw_df.columns:
        dtype = boston_raw_df[col].dtype
        null_count = boston_raw_df[col].isnull().sum()
        null_pct = (null_count / len(boston_raw_df)) * 100
        print(f"  {col:30} - {str(dtype):15} - {null_count:6} nulls ({null_pct:5.2f}%)")
    
    # Sample data
    print(f"\nSample Records (first 5):")
    print(boston_raw_df.head(5).to_string())
    
else:
    print(f"File not found: {boston_raw_file}")

BOSTON MARATHON PROFILE (RAW DATA)

Basic Information:
  Shape: 454282 rows × 13 columns

Basic Information:
  Shape: 454282 rows × 13 columns
  Memory usage: 300.97 MB

Columns:
  Last Name, First Name (Sex/Age) - str             -      0 nulls ( 0.00%)
  Time                           - str             -  72200 nulls (15.89%)
  OverAllPlace                   - str             -      0 nulls ( 0.00%)
  Sex Place/Div Place            - str             -  15400 nulls ( 3.39%)
  DIV                            - str             -  66800 nulls (14.70%)
  City, State, Country           - str             -  66800 nulls (14.70%)
  AG Time*                       - str             -  66800 nulls (14.70%)
  BQ*                            - str             -  29098 nulls ( 6.41%)
  date                           - str             -      0 nulls ( 0.00%)
  Net Time                       - str             - 220300 nulls (48.49%)
  Sex Place                      - str             - 438882 nulls (96.

In [37]:
# Examine all files in race_records and find those with None/missing age and sex

print("=" * 70)
print("EXAMINING RACE RECORDS FOR MISSING AGE AND SEX")
print("=" * 70)

if os.path.exists(race_records_home):
    race_record_files = [f for f in os.listdir(race_records_home) if f.endswith('.parquet')]
    print(f"\nTotal race record files: {len(race_record_files)}\n")
    
    files_with_issues = []
    
    for race_file in sorted(race_record_files):
        filepath = os.path.join(race_records_home, race_file)
        try:
            df = pd.read_parquet(filepath)
            
            # Check for None/null values in age and sex columns
            age_nulls = df['age'].isnull().sum()
            sex_nulls = df['sex'].isnull().sum()
            
            if age_nulls > 0 or sex_nulls > 0:
                files_with_issues.append({
                    'file': race_file,
                    'total_rows': len(df),
                    'age_nulls': age_nulls,
                    'sex_nulls': sex_nulls,
                    'age_null_pct': (age_nulls / len(df)) * 100,
                    'sex_null_pct': (sex_nulls / len(df)) * 100
                })
        except Exception as e:
            print(f"Error reading {race_file}: {str(e)}")
    
    # Display results
    if len(files_with_issues) > 0:
        print(f"Found {len(files_with_issues)} files with missing age/sex:\n")
        print(f"{'File':<50} {'Total Rows':>12} {'Age Nulls':>12} {'Sex Nulls':>12}")
        print("-" * 88)
        
        for issue in files_with_issues:
            print(f"{issue['file']:<50} {issue['total_rows']:>12} {issue['age_nulls']:>12} ({issue['age_null_pct']:>5.2f}%) {issue['sex_nulls']:>6} ({issue['sex_null_pct']:>5.2f}%)")
        
        # Summary
        total_affected_rows = sum([i['age_nulls'] + i['sex_nulls'] for i in files_with_issues])
        print("-" * 88)
        print(f"\nSummary:")
        print(f"  Files with issues: {len(files_with_issues)}/{len(race_record_files)}")
        print(f"  Total affected records: {total_affected_rows}")
    else:
        print(f"✓ All {len(race_record_files)} files have complete age and sex data!")
else:
    print(f"race_records directory not found: {race_records_home}")

EXAMINING RACE RECORDS FOR MISSING AGE AND SEX

Total race record files: 1817

Found 1811 files with missing age/sex:

File                                                 Total Rows    Age Nulls    Sex Nulls
----------------------------------------------------------------------------------------
"last_chance_for_boston"_marathon.parquet                  1402         1402 (100.00%)   1402 (100.00%)
"running_for_the_bay!"_marathon.parquet                    1279         1279 (100.00%)   1279 (100.00%)
11-11-11_marathon_of_sarasota.parquet                        66           66 (100.00%)     66 (100.00%)
2004_usa_olympic_team_trials_-_men's_marathon.parquet           71           71 (100.00%)     71 (100.00%)
2004_usa_olympic_team_trials_-_women's_marathon.parquet          200          200 (100.00%)    200 (100.00%)
2008_u.s._olympic_team_trials_-_men's_marathon.parquet          200          200 (100.00%)    200 (100.00%)
2012_u.s._olympic_team_trials_-_men's_marathon.parquet           8