In [0]:
%pip install azure-cosmos

In [0]:
import math
import os

from pyspark.sql import functions as f
from pyspark.sql.functions import lit
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when

from azure.cosmos import exceptions, CosmosClient, PartitionKey

# Cosmos DB connection settings.
# The endpoint and key come from the `endPoint` and `key` environment variables.
# (`process.env.*` is JavaScript syntax and raises NameError in Python.) A missing
# variable fails fast with KeyError rather than passing an empty credential on.
cosmosEndpoint = os.environ["endPoint"]
cosmosMasterKey = os.environ["key"]
cosmosDatabaseName = "bhoojal_outlets"
cosmosContainerName = "outlet"

# City to process, supplied through the `city` notebook widget (default: '').
dbutils.widgets.text('city', '')
query_city = dbutils.widgets.get('city')
print("Starting notebook for city", query_city)

# Configure Catalog Api to be used
spark.conf.set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog")
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint)
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey)

# The widget value is spliced into the query text, so escape backslashes and
# single quotes; otherwise a city containing a quote breaks (or rewrites) the query.
city_literal = query_city.replace("\\", "\\\\").replace("'", "\\'")

# Read every outlet of the requested city and expose it as a temp view.
all_outlets_cfg = {
  "spark.cosmos.accountEndpoint" : cosmosEndpoint,
  "spark.cosmos.accountKey" : cosmosMasterKey,
  "spark.cosmos.database" : cosmosDatabaseName,
  "spark.cosmos.container" : cosmosContainerName,
  "spark.cosmos.read.customQuery" : "SELECT * FROM c WHERE c.city = '" + city_literal + "'"
}

df = (
  spark.read.format("cosmos.oltp")
  .options(**all_outlets_cfg)
  .option("spark.cosmos.read.inferSchema.enabled", "true")
  .load()
)

df.createOrReplaceTempView("CityOutlets")

# Keep both a Spark frame (`outlets_df`) and a pandas copy (`city_outlets`) of the
# city's outlets; the following cells iterate over the pandas copy row by row.
outlets_df = spark.sql("SELECT * FROM CityOutlets")
city_outlets = outlets_df.toPandas()
city_outlets

In [0]:
# Score every outlet from the water quality of its neighbourhood.
#
# For each outlet in `city_outlets`, the container is queried for the outlets of the
# same city whose ST_DISTANCE to the outlet's location is below NEIGHBOUR_RADIUS.
# The outlet's score is skewness(quality_ph) + skewness(quality_hardness) over that set.
# All scores are gathered first and applied to `outlets_df` in a single projection.
NEIGHBOUR_RADIUS = 5000  # threshold compared against ST_DISTANCE in the neighbour query

# Escape backslashes and single quotes so the widget value stays inside the
# string literal of the query text.
city_literal = query_city.replace("\\", "\\\\").replace("'", "\\'")

new_scores = {}
for i in city_outlets.index:
  outlet_id = city_outlets.at[i, 'id']
  coordinates = city_outlets.at[i, 'location']['coordinates']
  query = (
    "SELECT * FROM c WHERE c.city = '" + city_literal + "'"
    + " AND ST_DISTANCE(c.location,{\"type\":\"Point\",\"coordinates\": ["
    + str(coordinates[0]) + ", " + str(coordinates[1])
    + "]}) < " + str(NEIGHBOUR_RADIUS)
  )
  neighbour_outlet_cfg = {
    "spark.cosmos.accountEndpoint" : cosmosEndpoint,
    "spark.cosmos.accountKey" : cosmosMasterKey,
    "spark.cosmos.database" : cosmosDatabaseName,
    "spark.cosmos.container" : cosmosContainerName,
    "spark.cosmos.read.customQuery" : query
  }

  neighbours_df = (
    spark.read.format("cosmos.oltp")
    .options(**neighbour_outlet_cfg)
    .option("spark.cosmos.read.inferSchema.enabled", "true")
    .load()
  )

  # A global aggregate yields exactly one row; collect it once so the Cosmos
  # query is not executed a second time just to read the other column.
  stats = neighbours_df.select(
    f.skewness("quality_ph").alias("ph_score"),
    f.skewness("quality_hardness").alias("hardness_score"),
  ).collect()[0]
  ps = stats["ph_score"]
  hs = stats["hardness_score"]

  # Skewness comes back as NULL/NaN when it is undefined for the neighbourhood;
  # adding None raises TypeError and NaN is not a valid JSON number, so such
  # outlets keep whatever score they already had.
  if ps is None or hs is None or math.isnan(ps) or math.isnan(hs):
    print("Skipping outlet", outlet_id, "- skewness is undefined for its neighbourhood")
    continue
  new_scores[outlet_id] = hs + ps

# Items that were never scored may lack the column in the inferred schema;
# add it so the fallback branch below can always resolve it.
if "quality_score" not in outlets_df.columns:
  outlets_df = outlets_df.withColumn("quality_score", lit(None).cast("double"))

# Build one CASE WHEN chain instead of nesting one withColumn per outlet.
if new_scores:
  score_expr = None
  for outlet_id, score in new_scores.items():
    matches_outlet = col("id") == outlet_id
    if score_expr is None:
      score_expr = when(matches_outlet, lit(score))
    else:
      score_expr = score_expr.when(matches_outlet, lit(score))
  outlets_df = outlets_df.withColumn(
    "quality_score",
    score_expr.otherwise(col("quality_score"))
  )

outlets_df.show()

In [0]:
# Write the computed scores back with the Cosmos DB Python SDK.
# (Original note: the SDK is used because Spark doesn't support batch truncate.)
client = CosmosClient(cosmosEndpoint, cosmosMasterKey)
container_name = 'outlet'  # not used in this cell; retained in case later cells reference it

# This cell only updates existing items, so bind to the existing database and
# container instead of create_*_if_not_exists, which could silently provision
# new resources if a name were misconfigured.
database = client.get_database_client(cosmosDatabaseName)
container = database.get_container_client(cosmosContainerName)

# Collect the scores once and key them by outlet id. Calling collect() inside the
# loop re-runs the whole Spark plan per outlet, and indexing its result by position
# assumes a row order that a re-executed query does not guarantee.
score_by_outlet_id = {}
if not city_outlets.empty:
  score_by_outlet_id = {
    row["id"]: row["quality_score"]
    for row in outlets_df.select("id", "quality_score").collect()
  }

for i in city_outlets.index:
  outlet_id = city_outlets.at[i, 'id']
  score = score_by_outlet_id.get(outlet_id)

  # Nothing to write for outlets without a usable score (NaN is not valid JSON).
  if score is None or (isinstance(score, float) and math.isnan(score)):
    print("No quality_score available for outlet", outlet_id, "- item left unchanged")
    continue

  # Read-modify-write: fetch the full item from the city partition, set the
  # score, and upsert the whole document back.
  outlet_item = container.read_item(item=outlet_id, partition_key=query_city)
  outlet_item['quality_score'] = score
  response = container.upsert_item(body=outlet_item)
  print(response)