# Load AWS access key and secret key

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType
import urllib

In [None]:
# Reader settings for the uploaded credentials file: it is a comma-separated
# CSV whose first row holds the column names.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Load the credentials file from the Databricks FileStore into a Spark DataFrame.
# NOTE(review): this keeps AWS credentials in a plain CSV; consider a secret
# scope instead if one is available in this workspace.
aws_keys_df = (
    spark.read.format(file_type)
    .options(header=first_row_is_header, sep=delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [None]:
# `urllib.parse` is a submodule: a bare `import urllib` does not guarantee it
# is loaded, so import it explicitly before calling `urllib.parse.quote`.
import urllib.parse

# Fetch the credentials row for the "databricks-user" entry with a single
# Spark job (one filter + one fetch instead of collecting twice).
credentials_row = (
    aws_keys_df
    .where(F.col("User name") == "databricks-user")
    .select("Access key ID", "Secret access key")
    .first()
)

# `first()` returns None when no row matches; fail with an explicit message
# rather than an opaque IndexError from indexing an empty list.
if credentials_row is None:
    raise ValueError(
        'No credentials found for user "databricks-user" in '
        "authentication_credentials.csv"
    )

# Get the AWS access key and secret key from the matched row
ACCESS_KEY = credentials_row["Access key ID"]
SECRET_KEY = credentials_row["Secret access key"]

# Percent-encode the secret key; safe="" means "/" is escaped as well
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

# Stream transformations

In [None]:
# Delta table location and streaming checkpoint location for the geo stream
outputPath = '/mnt/pinterest_data/test_streaming_delta_tables/0e3bbd435bfb_geo_table'
checkpointPath = '/mnt/pinterest_data/test_streaming_delta_tables/checkpoints/geo'

# Only records published with this partition key are processed downstream
partition_key = "test_geo"

# AWS credentials and the Kinesis stream to read from
awsAccessKeyId = ACCESS_KEY
awsSecretKey = SECRET_KEY
kinesisStreamName = "streaming-0e3bbd435bfb-geo"
kinesisRegion = "us-east-1"

# All reader options gathered in one place.
# NOTE(review): "format" and "inferSchema" do not appear to be Kinesis source
# options; they are passed through unchanged here. Confirm whether they are needed.
kinesis_options = {
    "streamName": kinesisStreamName,
    "region": kinesisRegion,
    "initialPosition": "LATEST",
    "format": "json",
    "awsAccessKey": awsAccessKeyId,
    "awsSecretKey": awsSecretKey,
    "inferSchema": "true",
    "minFetchPeriod": "200ms",
}

# Streaming DataFrame backed by the Kinesis stream
df = spark.readStream.format("kinesis").options(**kinesis_options).load()

# Schema of the JSON payload carried by each geo record:
# (field name, Spark type) pairs in the order they are declared.
geo_fields = [
    ("ind", LongType()),
    ("timestamp", StringType()),
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("country", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in geo_fields]
)

# transformations
#
# Turn the raw Kinesis records into the cleaned geo table layout:
#   ind | country | coordinates (struct of latitude, longitude) | timestamp

# Base64-decode the record's "data" column and read the result as a string,
# which is then parsed as JSON below.
# NOTE(review): this assumes the payload arrives base64-encoded as text in
# "data"; confirm against the producer, since a raw JSON payload would not
# survive unbase64 and would parse to nulls.
decoded_json = F.unbase64(F.col("data")).cast("string")

transformed_df = (
    # keep only the records published with the required partition key
    df.filter(F.col("partitionKey") == partition_key)

    # parse the JSON string with the declared schema; fields that cannot be
    # parsed come back as null
    .select(F.from_json(decoded_json, schema).alias("parsed_data"))

    # flatten the parsed struct into top-level columns
    .select("parsed_data.*")

    # build the final columns in their output order in a single projection
    .select(
        "ind",
        "country",
        # combine latitude and longitude into one "coordinates" struct
        F.struct("latitude", "longitude").alias("coordinates"),
        # to_timestamp already yields a timestamp column, so no further cast
        F.to_timestamp("timestamp").alias("timestamp"),
    )
)

# Configure the sink: append each micro-batch to a Delta table, tracking
# progress in the checkpoint directory.
delta_writer = (
    transformed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpointPath)
)

# Start the streaming query against the output path
query = delta_writer.start(outputPath)

# Block this cell until the query terminates so the stream keeps running
query.awaitTermination()