**CELL DESCRIPTION**  

**Cell 1** : Read data from the JSON files into a dataframe  
**Cell 2** : Define data quality checks and validation rules  
**Cell 3** : Filter out bad/corrupted records from the dataset and redirect them to a separate table for reprocessing  
**Cell 4** : Data cleaning & deriving new columns for further processing  
**Cell 5** : Performing SCD Type 2 and persisting (writing) data to a Delta table

In [0]:
#In this cell we are reading the json files from the input folder into a dataframe

from pyspark.sql.types import *
from pyspark.sql.functions import *
from isoduration import parse_duration
from pyspark.sql import functions as F
import pandas as pd

# Explicit input schema: every field is nullable; all are read as strings
# except "datePublished", which is read as a date.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("cookTime", StringType()),
        ("datePublished", DateType()),
        ("description", StringType()),
        ("image", StringType()),
        ("ingredients", StringType()),
        ("name", StringType()),
        ("prepTime", StringType()),
        ("recipeYield", StringType()),
        ("url", StringType()),
    ]
])

# Load every JSON file in the input folder with the schema above; the
# "multiline" option lets a single JSON document span several lines.
df_input = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json("dbfs:/FileStore/Input/*.json")
)
#df = spark.read.option("multiline","true").json("dbfs:/FileStore/Input/recipes_000.json")
#df_input = df_input.limit(200) 

In [0]:
# Data quality rules: a duration column is rejected when it holds the bare
# prefix "PT" or an empty string.
#   rule1 / rule2 -> cookTime is not "PT" / not ""
#   rule3 / rule4 -> prepTime is not "PT" / not ""

rule1, rule2 = (F.col("cookTime") != rejected for rejected in ("PT", ""))
rule3, rule4 = (F.col("prepTime") != rejected for rejected in ("PT", ""))


In [0]:
# Split the input into good and bad records using the data quality rules
# defined in the previous cell.

# A record is good only when all four rules hold; duplicates are dropped.
all_rules_pass = rule1 & rule2 & rule3 & rule4
df_goodRecords = df_input.filter(all_rules_pass).distinct()

# Everything in the input that is not a good record is treated as bad.
df_badRecords = df_input.subtract(df_goodRecords)

# Bad records leave the main flow here: they are written to their own Delta
# table, replacing whatever that table held before.
df_badRecords.write.saveAsTable("BadRecords1", format="delta", mode="overwrite")

In [0]:
# Pandas UDF that converts ISO 8601 duration strings into whole minutes

@F.pandas_udf("int")
def parse_iso_duration(str_duration: pd.Series) -> pd.Series:
    """Convert a series of ISO 8601 duration strings to minutes.

    Only the hour and minute components of the time part are used
    (hours * 60 + minutes); seconds and any date components are not included.

    Args:
        str_duration: Series of ISO 8601 duration strings, e.g. "PT1H30M".

    Returns:
        Series of integers holding the duration in minutes.

    Raises:
        Whatever ``parse_duration`` raises for a value it cannot parse.
    """
    def _to_minutes(duration):
        # Parse once per value instead of once per component.
        time_part = parse_duration(duration).time
        # isoduration exposes the components as Decimal values (per its docs);
        # cast so the result matches the UDF's declared "int" return type.
        return int(time_part.hours * 60 + time_part.minutes)

    return str_duration.apply(_to_minutes)
  
  
  
# Derive "cookTime_in_minutes" and "prepTime_in_minutes" by applying the
# "parse_iso_duration" UDF to the matching ISO duration columns, then keep
# only the columns needed downstream.
#
# There is deliberately no try/except around this: Spark builds the plan
# lazily, so a malformed duration cannot raise here (it would surface when an
# action such as the final write runs). Swallowing exceptions at this point
# would only hide plan errors and leave df_final undefined for the next cell.

df_final = (
    df_goodRecords
    .withColumn("cookTime_in_minutes", parse_iso_duration(F.col("cookTime")))
    .withColumn("prepTime_in_minutes", parse_iso_duration(F.col("prepTime")))
    .select(
        "name",
        "description",
        "ingredients",
        "recipeYield",
        "datePublished",
        "cookTime",
        "prepTime",
        "cookTime_in_minutes",
        "prepTime_in_minutes",
    )
)
  

In [0]:
# Persist (write) the final dataset to the Delta table "Recipies".
# NOTE(review): this cell was described as performing SCD Type 2 on recipe
# updates, but the write below uses "overwrite" mode, which replaces the
# table contents; no history tracking is visible here - TODO confirm intent.

df_final.write.saveAsTable("Recipies", format="delta", mode="overwrite")
