In [0]:
%pip install pybaseball

In [0]:
from pybaseball import statcast
import os
from pyspark.sql import functions as F
from datetime import datetime, timedelta

In [0]:
# Root directory of the raw Statcast landing area. Daily extracts are stored
# beneath it in y=<year>/m=<month>/d=<day> sub-directories.
root_path = "/Volumes/mlb/00_landing/data_sources/statcast"

def _latest_partition_value(path, prefix):
    """Return the numerically largest ``<prefix><digits>`` entry listed under ``path``.

    Only entries whose text after ``prefix`` consists solely of decimal digits
    are considered, so stray directories (for example
    ``y=__HIVE_DEFAULT_PARTITION__``) cannot win the comparison.

    Args:
        path: Directory to list with ``dbutils.fs.ls``.
        prefix: Partition prefix to look for, e.g. ``'y='``.

    Returns:
        The digit suffix exactly as it appears in the listing (so it can be
        reused to build the child path), or ``None`` when nothing matches.
    """
    candidates = []
    for entry in dbutils.fs.ls(path):
        name = entry.name.rstrip('/')
        suffix = name[len(prefix):]
        if name.startswith(prefix) and suffix.isdecimal():
            candidates.append(suffix)
    # Compare as integers so an un-padded "9" can never outrank "10".
    return max(candidates, key=int) if candidates else None


def get_latest_date_in_volumes(path, default_date=datetime(2024, 3, 28)):
    """Return the latest date that has a ``y=/m=/d=`` directory under ``path``.

    The directory tree is walked one level at a time: latest year, then the
    latest month inside that year, then the latest day inside that month.

    Args:
        path: Root directory containing the ``y=<year>`` partitions.
        default_date: Value returned when no usable partition is found or the
            listing fails. Defaults to 2024-03-28.

    Returns:
        A ``datetime`` for the latest stored day, or ``default_date``.
    """
    try:
        latest_y = _latest_partition_value(path, 'y=')
        if latest_y is None:
            # Root exists but holds no year partitions: nothing stored yet.
            return default_date

        year_path = f"{path}/y={latest_y}"
        latest_m = _latest_partition_value(year_path, 'm=')
        if latest_m is None:
            raise ValueError(f"no m= partitions under {year_path}")

        month_path = f"{year_path}/m={latest_m}"
        latest_d = _latest_partition_value(month_path, 'd=')
        if latest_d is None:
            raise ValueError(f"no d= partitions under {month_path}")

        return datetime.strptime(f"{latest_y}-{latest_m}-{latest_d}", '%Y-%m-%d')
    except Exception as e:
        # Best effort by design: a missing root or a malformed tree falls back
        # to the default instead of failing the notebook.
        print(f"Info: Could not find existing data, starting from default. Details: {e}")
        return default_date

In [0]:
def _path_exists(path):
    """Return True when ``dbutils.fs.ls(path)`` succeeds, False when it raises.

    Any ``Exception`` from the listing is treated as "does not exist";
    ``KeyboardInterrupt``/``SystemExit`` are deliberately not swallowed.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        return False
    return True


# Resume from the day after the latest day already present under root_path.
# NOTE(review): when nothing is stored yet, get_latest_date_in_volumes returns
# its fallback date, so that fallback day itself is never fetched - confirm
# this is intended.
latest_stored_date = get_latest_date_in_volumes(root_path)
start_date = latest_stored_date + timedelta(days=1)

# Inclusive upper bound of the fetch window.
# DEBUG: the window is currently pinned to a fixed date; set DEBUG_END_DATE to
# None to ingest everything up to yesterday.
DEBUG_END_DATE = "2024-07-10"
if DEBUG_END_DATE is not None:
    yesterday = datetime.strptime(DEBUG_END_DATE, '%Y-%m-%d')
else:
    yesterday = datetime.now() - timedelta(days=1)

current_date = start_date

# Both values are read by the task-value cell that follows.
any_new_data = False
processed_dates = []

while current_date <= yesterday:
    target_date_str = current_date.strftime('%Y-%m-%d')
    y, m, d = current_date.strftime('%Y'), current_date.strftime('%m'), current_date.strftime('%d')
    dir_path = f"{root_path}/y={y}/m={m}/d={d}"
    success_file = f"{dir_path}/_SUCCESS"

    # A day is considered done once its _SUCCESS marker exists.
    # NOTE(review): for days with data this relies on the parquet writer
    # leaving a _SUCCESS file in dir_path - confirm for this cluster config.
    if not _path_exists(success_file):
        try:
            stat_df = statcast(start_dt=target_date_str, end_dt=target_date_str)

            if stat_df is not None and not stat_df.empty:
                df = spark.createDataFrame(stat_df)

                # Store the partition values as columns next to the data.
                (df.withColumn("y", F.lit(y))
                   .withColumn("m", F.lit(m))
                   .withColumn("d", F.lit(d))
                   .write.format("parquet").mode("overwrite").save(dir_path))

                any_new_data = True
                processed_dates.append(target_date_str)
            else:
                # No rows for this day: write the marker by hand so the day is
                # not fetched again on the next run.
                dbutils.fs.put(success_file, "no data", overwrite=True)

        except Exception as e:
            print(f"Failed to fetch {target_date_str}: {e}")
            # Stop the run on the first failing day; bare raise keeps the
            # original traceback intact.
            raise

    current_date += timedelta(days=1)

In [0]:
# Publish the outcome of this run as job task values: a yes/no flag and, when
# something was ingested, the first and last processed dates.
task_values = dbutils.jobs.taskValues

if not any_new_data:
    task_values.set(key="continue_downstream", value="no")
else:
    task_values.set(key="continue_downstream", value="yes")
    # Dates are 'YYYY-MM-DD' strings, so min/max on the strings is chronological.
    first_processed = min(processed_dates)
    task_values.set(key="start_date", value=first_processed)
    last_processed = max(processed_dates)
    task_values.set(key="end_date", value=last_processed)