## 0. Create a Spark Session

In [1]:
from pyspark.sql import SparkSession

# Connector jars (Delta Lake + S3A) that live next to the local Spark install.
# The directory is a raw string, so each backslash is written exactly once
# (inside a raw string "\\" would stay as two characters).
# NOTE(review): Delta Lake 3.x publishes its main artifact as "delta-spark";
# confirm that a file named delta-core_2.12-3.1.0.jar really exists in this folder.
jars_dir = r"C:\spark\spark-3.5.5-bin-hadoop3\jars"
jar_names = [
    "delta-core_2.12-3.1.0.jar",
    "delta-storage-3.1.0.jar",
    "hadoop-aws-3.3.4.jar",
    "aws-java-sdk-bundle-1.12.430.jar",
]
# spark.jars expects one comma-separated string of jar locations.
extra_jars = ",".join(f"{jars_dir}\\{jar_name}" for jar_name in jar_names)

# Session with the Delta SQL extension/catalog enabled and S3A configured to
# resolve credentials through the default AWS provider chain.
spark = (
    SparkSession.builder
    .appName("CrickeScoree")
    .config("spark.jars", extra_jars)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)

## Process the data in micro-batches

In [None]:
from pyspark.sql.functions import col, from_json, when, size, explode, max as Fmax, year, month, dayofmonth
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, ArrayType, IntegerType, FloatType
from datetime import datetime

# Schema used to parse the JSON payload carried in the stream's `json_data` column.

# One element of the "teamInfo" array.
_team_info_entry = StructType([
    StructField("name", StringType()),
    StructField("img", StringType()),
])

# One element of the "score" array: r, w, o and the inning label.
_score_entry = StructType([
    StructField("r", IntegerType()),
    StructField("w", IntegerType()),
    StructField("o", FloatType()),
    StructField("inning", StringType()),
])

json_schema = (
    StructType()
    .add("status", StringType())
    .add("venue", StringType())
    .add("date", StringType())
    .add("dateTimeGMT", StringType())
    .add("teams", ArrayType(StringType()))
    .add("teamInfo", ArrayType(_team_info_entry))
    .add("score", ArrayType(_score_entry))
    .add("series_id", StringType())
    .add("fantasyEnabled", BooleanType())
    .add("bbbEnabled", BooleanType())
    .add("hasSquad", BooleanType())
    .add("matchStarted", BooleanType())
    .add("matchEnded", BooleanType())
)

# S3 locations, all under the same "live" output prefix.
_live_root = "s3a://aws-glue-assets-cricket/output_cricket/live"

# Where process_batch appends the partitioned score data.
target_path = _live_root + "/score_data"
# Streaming checkpoint location.
# NOTE(review): this directory sits inside target_path; confirm that readers of
# target_path are not affected by the extra "checkpoints" folder.
checkpoint_path = target_path + "/checkpoints"
# Root of the static match metadata (partitioned by year/month/day).
base_static_path = _live_root + "/cricket_data"

# Load static metadata for one day's partition (today in UTC unless told otherwise)
def load_static_match_data(spark, base_path=None, today=None):
    """Read the static match metadata stored under a single day partition.

    Args:
        spark: SparkSession used to read the parquet files.
        base_path: Root of the partitioned static dataset. Defaults to the
            module-level ``base_static_path``.
        today: ``datetime.date`` selecting the ``year=/month=/day=`` partition
            (month and day are not zero-padded in the path). Defaults to the
            current UTC date.

    Returns:
        DataFrame read from that partition with one row kept per ``id``.
    """
    # Local import keeps this function self-contained; datetime.utcnow() is
    # deprecated since Python 3.12, so an aware "now" in UTC is used instead.
    from datetime import datetime, timezone

    if base_path is None:
        base_path = base_static_path
    if today is None:
        today = datetime.now(timezone.utc).date()

    path = f"{base_path}/year={today.year}/month={today.month}/day={today.day}"
    # basePath tells Spark where partition discovery starts, so the
    # year/month/day directories are exposed as columns of the result.
    static_df = spark.read.option("basePath", base_path).parquet(path)
    return static_df.dropDuplicates(["id"])

# Process streaming micro-batch
def process_batch(batch_df, batch_id):
    """Append the latest live scores found in one micro-batch to ``target_path``.

    Pipeline:
      1. Parse ``json_data`` with ``json_schema`` and flatten the needed fields.
      2. Keep, per ``id``, only the rows carrying the greatest ``event_time_ts``
         seen in this batch.
      3. Label each row with ``match_status`` and keep the "Live" ones.
      4. Explode ``score`` into one row per score entry and de-duplicate on
         (match_id, inning, event_time_ts).
      5. Left-join the static match metadata on ``match_id``.
      6. Derive year/month/day from ``event_time_ts`` and append the result as
         partitioned parquet.

    Args:
        batch_df: Micro-batch DataFrame; must expose the columns ``id``,
            ``name``, ``matchType``, ``event_time`` and ``json_data``.
        batch_id: Identifier passed in by ``foreachBatch``; not used here.
            NOTE(review): the write is a plain append, so a re-executed batch
            would presumably be written twice — confirm whether that matters.
    """
    static_df = load_static_match_data(batch_df.sparkSession)

    parsed_df = batch_df.withColumn("json_parsed", from_json(col("json_data"), json_schema))

    flat_df = parsed_df.select(
        "id", "name", "matchType", "event_time",
        col("json_parsed.status").alias("status"),
        col("json_parsed.venue").alias("venue"),
        col("json_parsed.teams").alias("teams"),
        col("json_parsed.score").alias("score"),
        col("json_parsed.matchStarted").alias("matchStarted"),
        col("json_parsed.matchEnded").alias("matchEnded")
    ).withColumn("event_time_ts", col("event_time").cast("timestamp"))

    # Greatest event timestamp per match within this batch; the id column is
    # renamed so the join condition below is unambiguous.
    max_times = flat_df.groupBy("id").agg(Fmax("event_time_ts").alias("max_ts")) \
                       .withColumnRenamed("id", "max_id")

    latest_df = flat_df.join(
        max_times,
        (flat_df.id == max_times.max_id) & (flat_df.event_time_ts == max_times.max_ts),
        "inner"
    ).drop("max_id", "max_ts")

    # The branches are evaluated in order, so "Live" (started, not ended, at
    # least one score entry) takes precedence over "Upcoming".
    latest_df = latest_df.withColumn(
        "match_status",
        when((col("matchStarted") == True) & (col("matchEnded") == False) & (size(col("score")) > 0), "Live")
        .when((col("matchStarted") == True) & (col("matchEnded") == False), "Upcoming")
        .when(col("matchEnded") == True, "Completed")
        .otherwise("Unknown")
    )

    live_df = latest_df.filter(col("match_status") == "Live")

    # One output row per element of the score array.
    exploded_df = live_df.select(
        "id", "name", "matchType", "event_time_ts", "status", "venue", "teams", "match_status",
        explode(col("score")).alias("score_entry")
    )

    final_df = exploded_df.select(
        col("id").alias("match_id"),
        "name",
        "matchType",
        "event_time_ts",
        "status",
        "venue",
        "teams",
        "match_status",
        col("score_entry.inning").alias("inning"),
        col("score_entry.r").alias("runs"),
        col("score_entry.w").alias("wickets"),
        col("score_entry.o").alias("overs")
    ).dropDuplicates(["match_id", "inning", "event_time_ts"])

    # Stream values win over static ones: besides the four known names, drop
    # every static column that shares a name with a column of final_df (the
    # join key excepted). Otherwise the joined frame would contain duplicate
    # column names, which Spark refuses to write.
    conflicting_cols = {"matchType", "name", "match_status", "venue"} | (set(final_df.columns) - {"match_id"})
    overlapping = [c for c in static_df.columns if c in conflicting_cols]
    if overlapping:
        static_df = static_df.drop(*overlapping)

    static_df = static_df.withColumnRenamed("id", "match_id")

    enriched_df = final_df.join(static_df, on="match_id", how="left")

    # Partition columns come from the event timestamp; withColumn replaces any
    # same-named column already present in the joined frame.
    enriched_df = enriched_df.withColumn("year", year("event_time_ts")) \
                             .withColumn("month", month("event_time_ts")) \
                             .withColumn("day", dayofmonth("event_time_ts"))

    enriched_df.write \
        .partitionBy("year", "month", "day") \
        .mode("append") \
        .parquet(target_path)

# Example streaming job start (you already know how to plug in your Kafka stream here)
# query = streaming_df.writeStream.foreachBatch(process_batch).option("checkpointLocation", checkpoint_path).start()
# query.awaitTermination()


## This is just for testing purposes

In [None]:
#  from datetime import datetime

# # Step 1: Get UTC time and build S3 input path
# now = datetime.utcnow()
# s3_path = f"s3a://aws-glue-assets-cricket/raw_cricket_data/year={now.year}/month={now.strftime('%m')}/*/*/"
# s3_path_score_live =f"s3a://aws-glue-assets-cricket/output_cricket/live/score_data/"  
# # Step 2: Read parquet batch (not streaming)
# df = spark.read.format("parquet").load(s3_path_score_live)

# # Step 3: Print schema and show some sample rows
# df.printSchema()
# #Full output
# # df.show(5, truncate=False)
# df.show(30, truncate=False)



root
 |-- match_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- matchType: string (nullable = true)
 |-- event_time_ts: timestamp (nullable = true)
 |-- status: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- teams: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- match_status: string (nullable = true)
 |-- inning: string (nullable = true)
 |-- runs: integer (nullable = true)
 |-- wickets: integer (nullable = true)
 |-- overs: float (nullable = true)

+------------------------------------+---------------------------------------------------------------+---------+--------------------------+-------------------------------------------------------+--------------------------------------------+--------------------------------------------------+------------+----------------------------------+----+-------+-----+
|match_id                            |name                                                           |matchType|even

## Start the stream by calling `process_batch` (defined above)

In [None]:
from datetime import datetime, timezone

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Step 1: Get the current UTC time and build the S3 input glob for this month's raw data.
# datetime.utcnow() is deprecated since Python 3.12; this expression yields the
# same naive UTC datetime.
# NOTE(review): year and month are fixed when this cell runs; a stream left
# running across a month boundary presumably will not pick up the next month's
# directory — confirm.
now = datetime.now(timezone.utc).replace(tzinfo=None)
s3_path = f"s3a://aws-glue-assets-cricket/raw_cricket_data/year={now.year}/month={now.strftime('%m')}/*/*/"

from pyspark.sql.types import StructType, StructField, StringType

# Layout of the raw parquet records: every field is nullable, and all are
# strings except event_time, which is read as a timestamp.
_raw_field_types = [
    ("id", StringType()),
    ("name", StringType()),
    ("matchType", StringType()),
    ("event_time", TimestampType()),
    ("ingested_at", StringType()),
    ("json_data", StringType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in _raw_field_types
])


# Step 2: Stream the raw parquet files from S3 using the explicit 'schema'
# (raw records with 'json_data' and 'event_time'); at most one new file is
# picked up per trigger.
_reader = spark.readStream.format("parquet").schema(schema).option("maxFilesPerTrigger", 1)
df = _reader.load(s3_path)

# Step 3: Hand every micro-batch to process_batch, tracking progress in the checkpoint location
_writer = df.writeStream.foreachBatch(process_batch).option("checkpointLocation", checkpoint_path)
query = _writer.start()

print("Structured Streaming started...")

# Step 4: Block this cell until the streaming query terminates
query.awaitTermination()


Structured Streaming started...
