# Load AWS access key, secret key and region

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType
import urllib

In [0]:
# Fetch the AWS access key id, secret access key and region from the
# "aws" Databricks secret scope, in that order.
ACCESS_KEY, SECRET_KEY, REGION = (
    dbutils.secrets.get(scope="aws", key=secret_name)
    for secret_name in ("access_key_id", "secret_access_key", "region")
)

# Stream transformations

In [0]:
# Delta sink locations for the cleaned user data: the table itself and
# the streaming checkpoint directory
checkpointPath = '/mnt/pinterest_data/test_streaming_delta_tables/checkpoints/user'
outputPath = '/mnt/pinterest_data/test_streaming_delta_tables/0e3bbd435bfb_user_table'

# partition key used further down to select the records to process
partition_key = "test_user"

# Kinesis connection settings, reusing the credentials and region
# defined earlier in the notebook
kinesisStreamName = "streaming-0e3bbd435bfb-user"
kinesisRegion = REGION
awsAccessKeyId = ACCESS_KEY
awsSecretKey = SECRET_KEY

# streaming DataFrame over the Kinesis stream, starting at the LATEST position
df = (
    spark.readStream
    .format("kinesis")
    .options(
        streamName=kinesisStreamName,
        region=kinesisRegion,
        initialPosition="LATEST",
        format="json",
        awsAccessKey=awsAccessKeyId,
        awsSecretKey=awsSecretKey,
        inferSchema="true",
        minFetchPeriod="200ms",
    )
    .load()
)

# Schema applied to the JSON payload of each user record:
# (field name, Spark type) pairs in column order
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("ind", LongType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("age", LongType()),
        ("date_joined", StringType()),
    )
])

# transformations: turn raw Kinesis records into cleaned user rows with the
# columns ind, age, date_joined (timestamp) and user_name
transformed_df = (
    # keep only the records whose Kinesis partition key matches partition_key
    df.filter(F.col("partitionKey") == partition_key)

        # base64-decode the data column and read the result as a string
        # NOTE(review): assumes the payload in `data` is base64 text -- confirm
        # against the producer
        .withColumn("decoded_data", F.unbase64(F.col("data")).cast("string"))

        # parse the JSON string in decoded_data using the declared schema
        .withColumn("parsed_data", F.from_json(F.col("decoded_data"), schema))

        # promote the individual fields of parsed_data to top-level columns
        .select(
            F.col("parsed_data.ind"),
            F.col("parsed_data.first_name"),
            F.col("parsed_data.last_name"),
            F.col("parsed_data.age"),
            F.col("parsed_data.date_joined"),
        )

        # build 'user_name' by concatenating 'first_name' and 'last_name'
        # NOTE(review): the two names are joined with no separator -- confirm
        # this is the intended format
        .withColumn("user_name", F.concat(F.col("first_name"), F.col("last_name")))

        # the separate name columns are no longer needed
        .drop("first_name", "last_name")

        # convert date_joined from a string to a timestamp; to_timestamp already
        # yields a timestamp column, so no additional cast is required
        .withColumn("date_joined", F.to_timestamp(F.col("date_joined")))
)

# Start appending the transformed stream to the Delta table at outputPath,
# tracking progress in checkpointPath
query = transformed_df.writeStream.start(
    path=outputPath,
    format="delta",
    outputMode="append",
    checkpointLocation=checkpointPath,
)

# block here so the stream keeps running until the query terminates
query.awaitTermination()