## 0. Create a Spark Session

In [1]:
from pyspark.sql import SparkSession

# Comma-separated list of local jars handed to `spark.jars`: the Delta Lake
# jars plus the Hadoop S3A connector and the AWS SDK bundle.
# NOTE(review): these are raw strings that also double every backslash, so the
# resulting paths contain literal "\\" separators -- confirm this is intended.
spark_jars = ",".join([
    r"C:\\spark\\spark-3.5.5-bin-hadoop3\\jars\\delta-core_2.12-3.1.0.jar",
    r"C:\\spark\\spark-3.5.5-bin-hadoop3\\jars\\delta-storage-3.1.0.jar",
    r"C:\\spark\\spark-3.5.5-bin-hadoop3\\jars\\hadoop-aws-3.3.4.jar",
    r"C:\\spark\\spark-3.5.5-bin-hadoop3\\jars\\aws-java-sdk-bundle-1.12.430.jar"
])

# Build (or reuse) the session with the Delta SQL extension/catalog enabled and
# s3a:// backed by S3AFileSystem, using the AWS default credentials chain.
spark = (
    SparkSession.builder
    .appName("CrickeScoree")
    .config("spark.jars", spark_jars)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)

## Process the data in micro-batches

In [2]:
from pyspark.sql.functions import col, from_json, when, size, explode, max as Fmax
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType, IntegerType, FloatType

# Schema applied by from_json() to the raw `json_data` string column.
# Built with StructType.add(); every field keeps the default nullable=True.
json_schema = (
    StructType()
    .add("status", StringType())
    .add("venue", StringType())
    .add("date", StringType())
    .add("dateTimeGMT", StringType())
    .add("teams", ArrayType(StringType()))
    # One entry per team: display name and image reference.
    .add("teamInfo", ArrayType(
        StructType()
        .add("name", StringType())
        .add("img", StringType())
    ))
    # One entry per inning; r/w/o are later exposed as runs/wickets/overs.
    .add("score", ArrayType(
        StructType()
        .add("r", IntegerType())
        .add("w", IntegerType())
        .add("o", FloatType())
        .add("inning", StringType())
    ))
    .add("series_id", StringType())
    .add("fantasyEnabled", BooleanType())
    .add("bbbEnabled", BooleanType())
    .add("hasSquad", BooleanType())
    .add("matchStarted", BooleanType())
    .add("matchEnded", BooleanType())
)

# S3 locations: where processed score rows are appended, and where the
# streaming query keeps its checkpoint state.
# NOTE(review): the checkpoint directory lives *inside* the output directory;
# confirm that readers of target_path are not confused by the extra folder.
target_path = "s3a://aws-glue-assets-cricket/output_cricket/live/score_data"
checkpoint_path = "s3a://aws-glue-assets-cricket/output_cricket/live/score_data/checkpoints"

def process_batch(batch_df, batch_id):
    """Turn one micro-batch of raw match records into per-inning live scores.

    Steps: parse the ``json_data`` column with ``json_schema``, keep only the
    most recent record per match id within this batch, classify each match as
    Live / Upcoming / Completed / Unknown, keep the Live ones, explode their
    ``score`` array into one row per inning, de-duplicate, and append the
    result as parquet to ``target_path``.

    Args:
        batch_df: DataFrame for the current micro-batch. Must provide the
            columns ``id``, ``name``, ``matchType``, ``event_time`` and
            ``json_data``.
        batch_id: Micro-batch identifier passed by ``foreachBatch``; unused.

    Returns:
        None. The only effect is the parquet append to ``target_path``.
    """
    # Parse the raw JSON payload into a struct column.
    parsed_df = batch_df.withColumn("json_parsed", from_json(col("json_data"), json_schema))

    # Flatten the fields we need and normalise event_time to a timestamp.
    flat_df = parsed_df.select(
        "id", "name", "matchType", "event_time",
        col("json_parsed.status").alias("status"),
        col("json_parsed.venue").alias("venue"),
        col("json_parsed.teams").alias("teams"),
        col("json_parsed.score").alias("score"),
        col("json_parsed.matchStarted").alias("matchStarted"),
        col("json_parsed.matchEnded").alias("matchEnded")
    ).withColumn("event_time_ts", col("event_time").cast("timestamp"))

    # Latest update per match *within this batch*. This is a plain
    # aggregate-and-join, not a streaming watermark. Rows tied on the maximum
    # timestamp are all kept by the join; the de-duplication below handles them.
    max_times = (
        flat_df.groupBy("id")
        .agg(Fmax("event_time_ts").alias("max_ts"))
        .withColumnRenamed("id", "max_id")
    )
    latest_df = flat_df.join(
        max_times,
        (flat_df.id == max_times.max_id) & (flat_df.event_time_ts == max_times.max_ts),
        "inner"
    ).drop("max_id", "max_ts")

    # Classify each match. Null flags make a condition null, which when()
    # treats as not matched, so such rows fall through to "Unknown".
    started = col("matchStarted")
    ended = col("matchEnded")
    score_count = size(col("score"))
    latest_df = latest_df.withColumn(
        "match_status",
        when(started & ~ended & (score_count > 0), "Live")
        .when(started & ~ended & (score_count == 0), "Upcoming")
        .when(ended, "Completed")
        .otherwise("Unknown")
    )

    # Only live matches are written out.
    live_df = latest_df.filter(col("match_status") == "Live")

    # One output row per entry of the score array (i.e. per inning).
    exploded_df = live_df.select(
        "id", "name", "matchType", "event_time_ts", "status", "venue", "teams", "match_status",
        explode(col("score")).alias("score_entry")
    )

    # Final flattened score view.
    final_df = exploded_df.select(
        col("id").alias("match_id"),
        "name",
        "matchType",
        "event_time_ts",
        "status",
        "venue",
        "teams",
        "match_status",
        col("score_entry.inning").alias("inning"),
        col("score_entry.r").alias("runs"),
        col("score_entry.w").alias("wickets"),
        col("score_entry.o").alias("overs")
    )

    # Drop repeated (match, inning, timestamp) rows before writing.
    dedup_df = final_df.dropDuplicates(["match_id", "inning", "event_time_ts"])

    # Write the de-duplicated batch output to S3 (append mode). The original
    # wrote final_df here, which silently discarded the de-duplication step.
    dedup_df.write.mode("append").parquet(target_path)


In [3]:
 from datetime import datetime

# Step 1: Build the S3 input glob for the current UTC year and month.
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive
# datetime; an aware UTC "now" yields the same year/month values.
from datetime import timezone

now = datetime.now(timezone.utc)
# The two trailing wildcards match every sub-directory two levels below the
# month folder.
s3_path = f"s3a://aws-glue-assets-cricket/raw_cricket_data/year={now.year}/month={now.strftime('%m')}/*/*/"

# Step 2: Read the parquet files as a one-off batch (not streaming) to inspect
# what the raw data looks like.
df = spark.read.format("parquet").load(s3_path)

# Step 3: Print the schema and show a few sample rows, untruncated.
df.printSchema()
df.show(5, truncate=False)


root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- matchType: string (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- ingested_at: string (nullable = true)
 |-- json_data: string (nullable = true)

+------------------------------------+---------------------------------------------------------------+---------+--------------------------+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Start the stream, calling `process_batch` (defined above) for each micro-batch

In [None]:
from datetime import datetime, timezone
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

import pyspark.sql.functions as F

# Step 1: Build the S3 input glob for the current UTC year and month.
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now" yields
# the same year/month values.
# NOTE(review): the path is computed once at start-up, so the running stream
# presumably keeps watching only the month in which it was started -- confirm
# this is acceptable across month boundaries.
now = datetime.now(timezone.utc)
s3_path = f"s3a://aws-glue-assets-cricket/raw_cricket_data/year={now.year}/month={now.strftime('%m')}/*/*/"

# Schema of the raw parquet files. It mirrors what the batch read printed:
# event_time is a timestamp, everything else (including the JSON payload in
# json_data) is a string.
schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("matchType", StringType(), True),
    StructField("event_time", TimestampType(), True),
    StructField("ingested_at", StringType(), True),
    StructField("json_data", StringType(), True)
])


# Step 2: Read the raw parquet files as a stream. File streams need an explicit
# schema; maxFilesPerTrigger=1 makes each micro-batch pick up one new file.
df = (
    spark.readStream
    .format("parquet")
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .load(s3_path)
)

# Step 3: Start the structured streaming job, handing every micro-batch to
# process_batch and tracking progress in the checkpoint directory.
query = (
    df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", checkpoint_path)
    .start()
)

print("Structured Streaming started...")

# Step 4: Block this cell until the query stops or fails.
query.awaitTermination()


Structured Streaming started...
