In [None]:
# Create the Spark Session
from pyspark.sql import SparkSession

# Collect the session-level Spark settings in one place, then apply them to the builder.
session_options = {
    "spark.streaming.stopGracefullyOnShutdown": True,
    # Maven coordinate of the Azure Cosmos DB Spark connector package.
    "spark.jars.packages": "com.azure.cosmos.spark:azure-cosmos-spark_3-3_2-12:4.15.0",
    "spark.sql.shuffle.partitions": 8,
}

session_builder = (
    SparkSession.builder
    .appName("Read and Write using Cosmos DB")
    .master("local[*]")
)
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuse an already-running session if one exists, otherwise start a new one.
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the cell displays it.
spark

In [None]:
# Set configuration settings to connect to Cosmos DB.
# The account endpoint and key are read from environment variables so that real
# credentials never have to be pasted into (and saved with) the notebook.
# The original placeholders are kept as fallbacks when the variables are unset.
import os

config = {
  "spark.cosmos.accountEndpoint": os.environ.get("COSMOS_ACCOUNT_ENDPOINT", "<cosmos-db-endpoint>"),
  "spark.cosmos.accountKey": os.environ.get("COSMOS_ACCOUNT_KEY", "<secret-key>"),
  "spark.cosmos.database": "easewithdata",
  "spark.cosmos.container": "device-data"
}

In [None]:
# Read data from Cosmos DB

# Build a reader for the Cosmos DB OLTP source using the connection settings,
# then load the container with schema inference switched on.
cosmos_reader = spark.read.format("cosmos.oltp").options(**config)
df = cosmos_reader.option("spark.cosmos.read.inferSchema.enabled", "true").load()

In [None]:
# Print the schema of the DataFrame loaded from Cosmos DB.
df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- temperature: integer (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- customerId: string (nullable = true)
 |-- id: string (nullable = false)
 |-- eventPublisher: string (nullable = true)
 |-- eventOffset: integer (nullable = true)
 |-- eventTime: string (nullable = true)



In [None]:
# Preview the rows loaded from Cosmos DB.
df.show()

+--------------------+--------------------+----------+--------------------+--------------+-----------+--------------------+
|                data|             eventId|customerId|                  id|eventPublisher|eventOffset|           eventTime|
+--------------------+--------------------+----------+--------------------+--------------+-----------+--------------------+
|{[{D001, 15, C, E...|e3cb26d3-41b2-49a...|   CI00103|3277242f-bee6-409...|        device|      10001|2023-01-05 11:13:...|
|                {[]}|8c202190-bc24-448...|   CI00104|ca104bd4-8328-4ac...|        device|      10002|2023-01-05 11:13:...|
+--------------------+--------------------+----------+--------------------+--------------+-----------+--------------------+



In [None]:
# Read the device JSON file that a later cell writes to Cosmos DB
df_read = (
    spark.read
    .format("json")
    .load("datasets/devices/device_03.json")
)

In [None]:
# Preview the records parsed from the JSON file.
df_read.show()

In [None]:
# Write data to Cosmos DB
from pyspark.sql.functions import col

# Cosmos DB items need an `id` field, so the event's eventId is copied into it.
# "ItemOverwrite" upserts each row into the container. The "ItemDelete" strategy
# would instead delete the items with matching ids, which is not a write.
(
    df_read.withColumn("id", col("eventId"))
    .write
    .format("cosmos.oltp")
    .options(**config)
    .option("spark.cosmos.write.strategy", "ItemOverwrite")
    .mode("APPEND")
    .save()
)