In [0]:
#In this cell we are reading the recipe data from a multiline JSON file stored in DBFS and keeping the first 100 rows as a working sample

#from pyspark.sql.types import *
#schema = StructType([
# StructField("cookTime",StringType(),True),
# StructField("datePublished",DateType(),True),
# StructField("description",StringType(),True),
# StructField("image",StringType(),True),
# StructField("ingredients",StringType(),True),
# StructField("name",StringType(),True),
# StructField("prepTime",StringType(),True),
# StructField("recipeYield",StringType(),True),
# StructField("url",StringType(),True)
 # ])
#df_with_schema = spark.read.schema(schema).option("multiline","true").json("dbfs:/FileStore/tables/json/JsonTest2.json")

# Location of the raw recipe JSON export in DBFS.
source_path = "dbfs:/FileStore/tables/json/JsonTest2.json"

# The file holds JSON records that span several lines, so multi-line parsing is enabled.
df = spark.read.json(source_path, multiLine=True)

# Work on a 100-row sample and preview it.
df1 = df.limit(100)
df1.show()


+--------+-------------+--------------------+--------------------+--------------------+--------------------+--------+-----------+--------------------+
|cookTime|datePublished|         description|               image|         ingredients|                name|prepTime|recipeYield|                 url|
+--------+-------------+--------------------+--------------------+--------------------+--------------------+--------+-----------+--------------------+
|   PT45M|   2010-10-14|I have a good, ba...|http://static.the...|4-1/2 cups Water\...|Creamy Cheese Gri...|    PT5M|          8|http://thepioneer...|
|   PT20M|   2010-10-20|There are few thi...|http://static.the...|2 whole Rib-eye O...|     Big Steak Salad|    PT1M|          4|http://thepioneer...|
|   PT15M|   2010-10-26|It's time.     It...|http://static.the...|3 cups Apple Juic...|My Favorite Turke...|   PT10M|         18|http://thepioneer...|
|    PT1H|   2010-10-27|In the next post,...|http://static.the...|2 whole Medium Sp...|Spaghet

In [0]:
#In this cell we define the data quality checks and standards and write the functions for the validation rules
#TODO: this cell currently contains no code; the checks are applied inline in the next cell


In [0]:
#In this cell we filter out all the bad records that are rejected by the data quality checks and validation rules
#The bad records are meant to be separated from the main flow and stored in a separate table in the database
#NOTE: the code below only drops the rejected rows; it does not write them to a separate table

# Values that disqualify a row when they appear in the respective duration column.
rejected_cook_times = ["PT", "", "null"]
rejected_prep_times = ["PT", "PT100M", "PT900M", "null"]

# Keep only rows whose cookTime and prepTime are both outside the rejected sets.
has_valid_cook_time = ~df1.cookTime.isin(rejected_cook_times)
has_valid_prep_time = ~df1.prepTime.isin(rejected_prep_times)
df1 = df1.filter(has_valid_cook_time & has_valid_prep_time)

# Number of rows that survive the checks.
df1.count()

Out[40]: 89

In [0]:
#In this cell we are doing some basic data conversion needed for further processing of the data

from pyspark.sql import functions as F
import pandas as pd

@F.pandas_udf("int")
def parse_iso8601_duration(str_duration: pd.Series) -> pd.Series:
    """Convert ISO 8601 duration strings (e.g. ``"PT45M"``) to whole minutes.

    Args:
        str_duration: Series of duration strings, each parsed with ``pd.Timedelta``.

    Returns:
        Series holding the full length of each duration in minutes, rounded
        down to a whole minute so the values fit the UDF's declared ``int``
        return type.

    Raises:
        ValueError: if a value cannot be parsed by ``pd.Timedelta``.
    """
    # total_seconds() covers the whole duration; the .seconds attribute is only
    # the seconds component (0-86399) and silently drops any whole days.
    # Floor division avoids handing fractional minutes to an integer column.
    return str_duration.apply(
        lambda duration: pd.Timedelta(duration).total_seconds() // 60
    )
  
# Derived column name -> source duration column it is computed from.
minute_columns = {
    "cookTime_in_minutes": "cookTime",
    "prepTime_in_minutes": "prepTime",
}

# Add one minutes column per source duration column, in the order listed above.
for target_name, source_name in minute_columns.items():
    df1 = df1.withColumn(target_name, parse_iso8601_duration(F.col(source_name)))

In [0]:
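#In this cell we are persisting (writing) the final dataset to a Delta Lake table
#The intent is to perform SCD type 2 on all the new updates for the recipes
#NOTE: the write below is a plain append; no SCD type 2 merge logic is implemented in this cell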

# Configure a Delta writer that appends to the target table, capping each
# output file at 10000 records.
delta_writer = df1.write.format("delta")
delta_writer = delta_writer.mode("append")
delta_writer = delta_writer.option("maxRecordsPerFile", "10000")

# Append the processed rows to the managed table.
delta_writer.saveAsTable("TestJsonFinal")
