In [1]:
# Imports shared by the streaming cells further down the notebook.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    current_timestamp,
    date_format,
    from_json,
    to_timestamp,
)
from pyspark.sql.types import StringType, StructField, StructType

# Simple marker showing that the kernel is alive and the imports resolved.
print("Starting...")

Starting...


In [5]:
from pyspark.sql import SparkSession

# Smoke test: create (or reuse) a session named "test", report success,
# then stop it so the following cells can build their own session.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)
print("Spark session started")
spark.stop()


Spark session started


In [2]:
from pyspark.sql import SparkSession

# Build the session used for the Kinesis -> S3 streaming job.
# - spark.jars: the Kinesis connector jar, given as a relative path.
# - fs.s3a.*: route s3a:// URIs through S3AFileSystem and resolve S3
#   credentials through the AWS default provider chain.
spark = (
    SparkSession.builder
    .appName("cricket-kinesis-to-s3-stream")
    .config("spark.jars", "spark-sql-kinesis_2.12-1.2.0_spark-3.0.jar")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    .getOrCreate()
)


In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, date_format, current_timestamp, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# --- CONFIG ---
job_name = "cricket-kinesis-to-s3-stream"
region = "eu-west-1"
kinesis_stream_name = "cricket-bball-stream"
s3_output_path = "s3a://aws-glue-assets-cricket"

# Directory holding the connector jars. Defaults to the original local
# install location; set SPARK_JARS_DIR to run the notebook on another machine.
spark_jars_dir = os.environ.get("SPARK_JARS_DIR", r"C:\spark\spark-3.5.5-bin-hadoop3\jars")
spark_jar_names = [
    "spark-sql-kinesis_2.12-1.2.0_spark-3.0.jar",
    "aws-java-sdk-bundle-1.12.430.jar",
    "hadoop-aws-3.3.4.jar",
]
# NOTE(review): the Kinesis connector jar name says "spark-3.0" while the
# default install directory is Spark 3.5.5 - confirm the two are compatible.

# AWS credentials are never written into the notebook: they are read from the
# standard environment variables so they cannot leak through version control
# or shared notebook files.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# --- Spark session ---
spark = SparkSession.builder \
    .appName(job_name) \
    .config("spark.jars", ",".join(
        os.path.join(spark_jars_dir, jar_name) for jar_name in spark_jar_names
    )) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .getOrCreate()

print("✅ Spark session started")

# --- Schema for incoming JSON ---
# Only these four fields are extracted; the full payload is kept in json_data.
partial_schema = StructType([
    StructField("match_id", StringType(), True),
    StructField("match_name", StringType(), True),
    StructField("match_type", StringType(), True),
    StructField("event_time", StringType(), True)
])

# --- Read from Kinesis ---
# TRIM_HORIZON is requested as the starting position for the stream.
kinesis_reader = spark.readStream.format("kinesis") \
    .option("streamName", kinesis_stream_name) \
    .option("region", region) \
    .option("endpointUrl", f"https://kinesis.{region}.amazonaws.com") \
    .option("startingPosition", "TRIM_HORIZON")

# Pass explicit keys only when both are present in the environment; otherwise
# no credential options are set and the connector resolves credentials itself.
# NOTE(review): confirm which credential source this connector version uses
# when no keys are supplied.
if aws_access_key_id and aws_secret_access_key:
    kinesis_reader = kinesis_reader \
        .option("awsAccessKeyId", aws_access_key_id) \
        .option("awsSecretKey", aws_secret_access_key)

kinesis_df = kinesis_reader.load()

# --- Transform and enrich ---
# Decode the record payload, stamp it with the ingest time, parse the JSON,
# and derive year/month/day/hour partition columns from the ingest time.
processed_df = kinesis_df.selectExpr("CAST(data AS STRING) as json_data") \
    .withColumn("ingest_ts", current_timestamp()) \
    .withColumn("parsed", from_json(col("json_data"), partial_schema)) \
    .select(
        col("parsed.match_id").alias("id"),
        col("parsed.match_name").alias("name"),
        col("parsed.match_type").alias("matchType"),
        to_timestamp(col("parsed.event_time")).alias("event_time"),
        col("ingest_ts").cast("string").alias("ingested_at"),
        col("json_data"),
        date_format(col("ingest_ts"), "yyyy").alias("year"),
        date_format(col("ingest_ts"), "MM").alias("month"),
        date_format(col("ingest_ts"), "dd").alias("day"),
        date_format(col("ingest_ts"), "HH").alias("hour")
    )

# --- Write to S3 in partitioned Parquet format ---
output_path = f"{s3_output_path}/raw_cricket_data"
checkpoint_path = f"{s3_output_path}/checkpoints"

query = processed_df.writeStream \
    .format("parquet") \
    .option("path", output_path) \
    .option("checkpointLocation", checkpoint_path) \
    .partitionBy("year", "month", "day", "hour") \
    .outputMode("append") \
    .start()

print("🚀 Streaming started. Waiting for records...")
# Blocks this cell until the streaming query stops or fails.
query.awaitTermination()


✅ Spark session started
🚀 Streaming started. Waiting for records...


## This job reads records from the Kinesis stream and writes them to the partitioned `raw_cricket_data` prefix of the S3 bucket

In [3]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, current_timestamp, to_timestamp, date_format
from pyspark.sql.types import StructType, StructField, StringType

# --- Config ---
job_name = "cricket-kinesis-to-s3-stream"
region = "eu-west-1"
kinesis_stream_name = "cricket-bball-stream"
s3_output_path = "s3a://aws-glue-assets-cricket/raw_cricket_data"
checkpoint_path = "s3a://aws-glue-assets-cricket/checkpoints/raw_kinesis"

# Credentials for the Kinesis source come from the standard AWS environment
# variables; they are never written into the notebook.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# --- Spark Session ---
# The fs.s3a.* settings below configure the S3A filesystem (the Parquet sink);
# they are Hadoop filesystem options, not options of the Kinesis source.
spark = SparkSession.builder \
    .appName(job_name) \
    .config("spark.jars", "spark-sql-kinesis_2.12-1.2.0_spark-3.0.jar") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .getOrCreate()

print("✅ Spark session started")

# --- Schema ---
schema = StructType([
    StructField("match_id", StringType()),
    StructField("match_name", StringType()),
    StructField("match_type", StringType()),
    StructField("event_time", StringType())  # API timestamp
])

# --- Kinesis Source ---
# LATEST is requested as the starting position for the stream.
kinesis_reader = spark.readStream.format("kinesis") \
    .option("streamName", kinesis_stream_name) \
    .option("region", region) \
    .option("endpointUrl", f"https://kinesis.{region}.amazonaws.com") \
    .option("startingPosition", "LATEST")

# Hand the keys to the source when both are available. Without them the source
# receives no credential options at all.
# NOTE(review): the recorded run of this cell failed with "Error while List
# shards"; missing source credentials are the presumed cause - confirm against
# the full stack trace.
if aws_access_key_id and aws_secret_access_key:
    kinesis_reader = kinesis_reader \
        .option("awsAccessKeyId", aws_access_key_id) \
        .option("awsSecretKey", aws_secret_access_key)

kinesis_df = kinesis_reader.load()

# --- Processing ---
# Decode the payload, stamp the ingest time, parse the JSON, and derive the
# year/month/day/hour partition columns from the ingest time.
processed_df = kinesis_df.selectExpr("CAST(data AS STRING) as json_data") \
    .withColumn("ingest_ts", current_timestamp()) \
    .withColumn("parsed", from_json(col("json_data"), schema)) \
    .select(
        col("parsed.match_id").alias("id"),
        col("parsed.match_name").alias("name"),
        col("parsed.match_type").alias("matchType"),
        to_timestamp(col("parsed.event_time")).alias("event_time"),
        col("json_data"),
        date_format(col("ingest_ts"), "yyyy").alias("year"),
        date_format(col("ingest_ts"), "MM").alias("month"),
        date_format(col("ingest_ts"), "dd").alias("day"),
        date_format(col("ingest_ts"), "HH").alias("hour")
    )

# --- Write to S3 ---
query = processed_df.writeStream \
    .format("parquet") \
    .option("path", s3_output_path) \
    .option("checkpointLocation", checkpoint_path) \
    .partitionBy("year", "month", "day", "hour") \
    .outputMode("append") \
    .start()

print("🚀 Streaming started. Waiting for events...")
# Blocks this cell until the streaming query stops or fails.
query.awaitTermination()


✅ Spark session started
🚀 Streaming started. Waiting for events...


StreamingQueryException: [STREAM_FAILED] Query [id = 0fe4a5bf-9233-4206-a87a-16e7ec3e3f09, runId = b43e0a9a-96d1-4462-91ff-52a66e28a187] terminated with exception: Error while List shards