In [41]:
from datetime import datetime, timezone

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, when, size, explode, max as Fmax, year, month, dayofmonth
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType, IntegerType, FloatType, TimestampType

In [42]:
# Schema applied by from_json() to the `json_data` string column of the raw data.
# Every field is nullable (the default for StructType.add).
json_schema = (
    StructType()
    .add("status", StringType())
    .add("venue", StringType())
    .add("date", StringType())
    .add("dateTimeGMT", StringType())
    .add("teams", ArrayType(StringType()))
    # One {name, img} struct per team
    .add(
        "teamInfo",
        ArrayType(
            StructType()
            .add("name", StringType())
            .add("img", StringType())
        ),
    )
    # One {r, w, o, inning} struct per score entry
    .add(
        "score",
        ArrayType(
            StructType()
            .add("r", IntegerType())
            .add("w", IntegerType())
            .add("o", FloatType())
            .add("inning", StringType())
        ),
    )
    .add("series_id", StringType())
    .add("fantasyEnabled", BooleanType())
    .add("bbbEnabled", BooleanType())
    .add("hasSquad", BooleanType())
    .add("matchStarted", BooleanType())
    .add("matchEnded", BooleanType())
)

In [43]:
# Paths
# Output root: batch_process() writes the enriched live-score parquet here,
# partitioned by year/month/day.
target_path = "s3a://aws-glue-assets-cricket/output_cricket/live/score_data"
# Root of the static match metadata: load_static_match_data() reads today's
# year=/month=/day= partition below this path and passes it as Spark's basePath.
base_static_path = "s3a://aws-glue-assets-cricket/output_cricket/live/cricket_data"


In [44]:
# Load static metadata for today's partition
def load_static_match_data(spark):
    """Read the static match metadata stored under today's (UTC) partition.

    Args:
        spark: SparkSession used to read the parquet data.

    Returns:
        DataFrame loaded from
        ``{base_static_path}/year=<Y>/month=<M>/day=<D>`` with duplicate ``id``
        rows removed (one arbitrary row is kept per ``id``).

    Raises:
        Whatever ``spark.read.parquet`` raises when today's partition path
        cannot be read; the error is not handled here.
    """
    # datetime.utcnow() is deprecated; an aware UTC "now" yields the same date.
    today = datetime.now(timezone.utc).date()
    # NOTE(review): month/day are NOT zero-padded here, while the raw-data path
    # built in batch_process() uses month=07-style padding — confirm this matches
    # how the static data is actually laid out in S3.
    path = f"{base_static_path}/year={today.year}/month={today.month}/day={today.day}"
    # basePath is set to the dataset root while only the day directory is read.
    static_df = spark.read.option("basePath", base_static_path).parquet(path)
    return static_df.dropDuplicates(["id"])

In [49]:
def batch_process():
    """Build today's live-score rows from the raw batch and write them to ``target_path``.

    Pipeline:
      1. Read today's (UTC) raw parquet files and parse ``json_data`` with
         ``json_schema``.
      2. Keep, per match ``id``, the single row with the latest ``event_time``.
      3. Derive ``match_status`` and keep only rows classified as ``"Live"``.
      4. Explode the ``score`` array into one row per score entry.
      5. Left-join the static metadata from ``load_static_match_data``.
      6. Write the rows whose event date equals today's date to ``target_path``,
         partitioned by year/month/day, replacing only the partitions present
         in the written data.

    Returns:
        None. Prints three row counts and writes parquet as side effects.
    """
    spark = SparkSession.builder.getOrCreate()

    # datetime.utcnow() is deprecated; only the calendar fields are used below,
    # so an aware UTC timestamp gives identical values.
    today = datetime.now(timezone.utc)
    s3_path = f"s3a://aws-glue-assets-cricket/raw_cricket_data/year={today.year}/month={today.month:02}/day={today.day:02}/*/*/"

    raw_schema = StructType([
        StructField("id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("matchType", StringType(), True),
        StructField("event_time", TimestampType(), True),
        StructField("ingested_at", StringType(), True),
        StructField("json_data", StringType(), True)
    ])

    # Read raw parquet batch
    raw_df = spark.read.schema(raw_schema).parquet(s3_path)

    static_df = load_static_match_data(spark)

    # Remove static columns that would clash with the live columns after the
    # join. DataFrame.drop() silently ignores names that are not present.
    conflicting_cols = ["matchType", "name", "match_status", "venue"]
    static_df = static_df.drop(*conflicting_cols).withColumnRenamed("id", "match_id")

    parsed_df = raw_df.withColumn("json_parsed", from_json(col("json_data"), json_schema))

    flat_df = parsed_df.select(
        "id", "name", "matchType", "event_time",
        col("json_parsed.status").alias("status"),
        col("json_parsed.venue").alias("venue"),
        col("json_parsed.teams").alias("teams"),
        col("json_parsed.score").alias("score"),
        col("json_parsed.matchStarted").alias("matchStarted"),
        col("json_parsed.matchEnded").alias("matchEnded")
    ).withColumn("event_time_ts", col("event_time").cast("timestamp"))

    # Latest event timestamp per match; "id" is renamed so the join condition
    # below is unambiguous.
    max_times = flat_df.groupBy("id").agg(Fmax("event_time_ts").alias("max_ts")) \
                       .withColumnRenamed("id", "max_id")

    # Rows tied at the max timestamp (e.g. a re-delivered event) would all
    # survive this join and multiply the exploded score rows, so keep exactly
    # one row per match.
    latest_df = flat_df.join(
        max_times,
        (flat_df.id == max_times.max_id) & (flat_df.event_time_ts == max_times.max_ts),
        "inner"
    ).drop("max_id", "max_ts").dropDuplicates(["id"])

    # Started + not ended + at least one score entry => Live;
    # started + not ended without scores => Upcoming; ended => Completed.
    latest_df = latest_df.withColumn(
        "match_status",
        when((col("matchStarted") == True) & (col("matchEnded") == False) & (size(col("score")) > 0), "Live")
        .when((col("matchStarted") == True) & (col("matchEnded") == False), "Upcoming")
        .when(col("matchEnded") == True, "Completed")
        .otherwise("Unknown")
    )

    live_df = latest_df.filter(col("match_status") == "Live")

    # Cached because three count() actions and the final write all depend on
    # this frame; without it the S3 read/parse/join lineage runs four times.
    exploded_df = live_df.select(
        "id", "name", "matchType", "event_time_ts", "status", "venue", "teams", "match_status",
        explode(col("score")).alias("score_entry")
    ).cache()

    try:
        print(f"Raw Count: {exploded_df.count()}")
        final_df = exploded_df.select(
            col("id").alias("match_id"),
            "name",
            "matchType",
            "event_time_ts",
            "status",
            "venue",
            "teams",
            "match_status",
            col("score_entry.inning").alias("inning"),
            col("score_entry.r").alias("runs"),
            col("score_entry.w").alias("wickets"),
            col("score_entry.o").alias("overs")
        )

        print(f"Rows before join: {final_df.count()}")

        enriched_df = final_df.join(static_df, on="match_id", how="left")
        print(f"Rows after join: {enriched_df.count()}")

        # Add partition columns before writing.
        # NOTE(review): year()/month()/dayofmonth() are evaluated in the Spark
        # session time zone while `today` is UTC — confirm the session runs in UTC.
        enriched_df = enriched_df.withColumn("year", year("event_time_ts")) \
                                 .withColumn("month", month("event_time_ts")) \
                                 .withColumn("day", dayofmonth("event_time_ts"))

        # Overwrite only today's partition. Plain mode("overwrite") uses static
        # overwrite, which deletes EVERY existing partition under target_path
        # before writing; dynamic mode replaces only the partitions present in
        # the written data (and leaves everything untouched if it is empty).
        enriched_df.filter(
            (col("year") == today.year) &
            (col("month") == today.month) &
            (col("day") == today.day)
        ).write \
         .partitionBy("year", "month", "day") \
         .mode("overwrite") \
         .option("partitionOverwriteMode", "dynamic") \
         .parquet(target_path)
    finally:
        # Release the cached blocks even when a count or the write fails.
        exploded_df.unpersist()

In [50]:
# Entry point: run the batch job when this notebook/script is executed directly.
# SparkSession is imported with the other imports at the top of the file, so
# batch_process() no longer depends on an import hidden inside this guard.
if __name__ == "__main__":
    batch_process()

Raw Count: 9
Rows before join: 9
Rows after join: 9
