In [1]:
import pandas as pd
from datetime import datetime
import os

# Date Column
Investigate what is happening to the `date` column that causes so many records from before 2015 to be filtered out.

In [9]:
from collections import Counter

RACE_RECORDS_DIR = "data/race_records"

# Tally of how many rows carry each raw `date` value across all race files.
date_concordance = Counter()

# Sort the listing so repeated runs give the same result: os.listdir() returns
# entries in arbitrary order, and Counter.most_common() orders equal counts by
# the order in which they were first encountered.
for race_file in sorted(os.listdir(RACE_RECORDS_DIR)):
    filepath = os.path.join(RACE_RECORDS_DIR, race_file)
    # Only the `date` column is used below, so avoid loading the other columns.
    df = pd.read_parquet(filepath, columns=["date"])

    # Add all non-null date values to the concordance
    date_concordance.update(df["date"].dropna())

# Convert to DataFrame, most frequent date first. Naming the columns explicitly
# keeps `date` and `count` present even when no records were found.
concordance_df = pd.DataFrame(
    date_concordance.most_common(), columns=["date", "count"]
)

def parse_date_column(col:str) -> datetime:
    """Parse a raw ``month_day_year`` date string into a ``datetime``.

    Args:
        col: Underscore-separated date such as ``"10_7_12"`` (October 7, 2012).
            A year in the range 0-99 (``"12"``, ``"06"``, ``"6"``) is taken to
            be in the 2000s; any other year (e.g. ``"2012"``) is used as given.

    Returns:
        The ``datetime`` at midnight on that date.

    Raises:
        ValueError: If ``col`` does not split into exactly three
            ``_``-separated parts, a part is not an integer, or the values do
            not form a valid calendar date.
    """
    (month, day, year) = col.split("_")
    year_number = int(year)
    # Add the century numerically instead of prefixing the string "20": the
    # string form turns "2012" into 202012 and "6" into 206.
    if 0 <= year_number < 100:
        year_number += 2000
    return datetime(year_number, int(month), int(day))

# Parse each raw date string, then derive the calendar parts from the parsed
# values. Columns are added in the order: actual_date, year, month, day.
parsed_dates = concordance_df["date"].apply(parse_date_column)
concordance_df["actual_date"] = parsed_dates
for part in ("year", "month", "day"):
    concordance_df[part] = concordance_df["actual_date"].apply(
        lambda stamp, part=part: getattr(stamp, part)
    )

# Summary: number of distinct date strings and the total rows they cover.
print(f"Total unique dates: {len(concordance_df)}")
print(f"Total date records: {concordance_df['count'].sum():,}")
print("\n")
display(concordance_df)

Total unique dates: 2958
Total date records: 10,478,198




Unnamed: 0,date,count,actual_date,year,month,day
0,10_7_12,64656,2012-10-07,2012,10,7
1,10_9_16,63317,2016-10-09,2016,10,9
2,10_9_11,62007,2011-10-09,2011,10,9
3,10_7_18,61894,2018-10-07,2018,10,7
4,10_13_19,60650,2019-10-13,2019,10,13
...,...,...,...,...,...,...
2953,4_4_20,8,2020-04-04,2020,4,4
2954,1_22_06,8,2006-01-22,2006,1,22
2955,3_15_10,6,2010-03-15,2010,3,15
2956,5_8_14,4,2014-05-08,2014,5,8


In [12]:
# Count the concordance rows (one per unique date string) that fall on each
# day of the month.
dates_by_day = concordance_df.groupby("day")
dates_by_day.count()

Unnamed: 0_level_0,date,count,actual_date,year,month
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,107,107,107,107,107
2,93,93,93,93,93
3,94,94,94,94,94
4,106,106,106,106,106
5,96,96,96,96,96
6,94,94,94,94,94
7,92,92,92,92,92
8,98,98,98,98,98
9,95,95,95,95,95
10,100,100,100,100,100
