In [None]:

## Importing libraries and creating the Spark session

import pyspark
from pyspark.sql import SparkSession as ss
from pyspark.sql.functions import regexp_extract, regexp_replace,avg, udf
from pyspark.sql.functions import round as Round
from pyspark.sql.functions import col, when
from pyspark.sql.functions import to_date
from pyspark.sql.functions import lower
from pyspark.sql.types import IntegerType
import re
from statistics import mean


# Build the Spark session named 'ADDO-EXAM' (getOrCreate reuses an existing one) and keep a handle on its SparkContext.
spark = ss.builder.appName('ADDO-EXAM').getOrCreate()
sc= spark.sparkContext


For this task I uploaded the recipe files to my Azure storage account and read them from Azure Blob Storage. In a real-world environment the data is usually stored in cloud storage, so it makes sense to keep the files in the cloud and run the Spark tasks against them there.

In [None]:
# Authentication to access the files on storage.
# The account key is a secret and must never be committed with the notebook, so
# it is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable (on
# Databricks this can be set in the cluster configuration, or replaced with a
# secret-scope lookup). Any key that was previously committed to this notebook
# should be considered leaked and rotated.
import os
import warnings

storage_account_name = "sparkexamstorage"
storage_account_access_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")
if not storage_account_access_key:
    # Do not fail at definition time; the storage reads further down will fail
    # with an authentication error until the key is supplied.
    warnings.warn(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; access to the storage account will fail."
    )

# Containers on Azure Storage
inp_container = "data"  # Container that holds the input recipe files
inp_file_type = "json"

out_container = "output"  # Container that receives the final result files
out_file_type = "csv"



In [None]:
# Register the storage account key with the Spark session so that paths on
# this account's blob endpoint can be read and written.
account_key_setting = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(account_key_setting, storage_account_access_key)

In [None]:
# List the entries at the root of the input container; the processing loop
# further down iterates over this listing.
input_loc = dbutils.fs.ls(f"wasbs://{inp_container}@{storage_account_name}.blob.core.windows.net/")

# Root URL of the output container. Note that it already ends with a slash.
output_container_path = f"wasbs://{out_container}@{storage_account_name}.blob.core.windows.net/"

In [None]:

# All the columns are read as strings and need to be corrected according to the requirements:
# > Times are given in ISO duration format; the time in minutes needs to be extracted.
# > The number of servings per dish needs to be extracted from the recipeYield column.
# > Null records need to be filled or removed.
# > Extra spaces/punctuation need to be removed from the description and ingredients columns.
# 
# Therefore, to complete the tasks successfully, the following helper functions are defined:


###########################################################
#### Function to extract the Time per dish in minutes #####
###########################################################

def get_time(ISOtime_):
    """Convert an ISO-style duration string such as ``PT1H30M`` to minutes.

    The hour count is the number directly before an ``H`` and the minute count
    is the number directly before an ``M``; a component that is absent counts
    as zero.

    Args:
        ISOtime_: The duration string, or ``None`` for a missing value.

    Returns:
        The total number of minutes as an ``int``, ``0`` when the string holds
        neither component, or ``None`` when ``ISOtime_`` is ``None``.
    """
    # A missing value cannot be parsed. Returning None (instead of letting
    # re.search raise TypeError) lets the caller treat it as a null and fill it.
    if ISOtime_ is None:
        return None

    hours_match = re.search(r'(\d+)H', ISOtime_)
    minutes_match = re.search(r'(\d+)M', ISOtime_)

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(minutes_match.group(1)) if minutes_match else 0

    return hours * 60 + minutes

###################################################
#### Function to extract the serving per dish #####
###################################################

def extract_serving_value(s):
    """Extract the number of servings from a ``recipeYield`` value.

    Every run of digits in the text is read as a number and the numbers are
    averaged, so a range such as ``"4-6"`` yields ``5``. The average is rounded
    half-up to a whole number so that the result is always an ``int``.

    Args:
        s: The yield as text, an ``int`` that is already a serving count, or
            ``None`` for a missing value.

    Returns:
        The serving count as an ``int``; ``1`` when the value is missing or
        contains no digits.
    """
    # An integer is already a serving count. (The previous check called len()
    # on it, which raises TypeError for ints.)
    if isinstance(s, int):
        return s
    # A missing yield falls back to the same default as text without digits.
    if s is None:
        return 1

    nums = [int(num) for num in re.findall(r'\d+', s)]
    if not nums:
        return 1
    # mean() may return a float (e.g. 4.5 for "4-5"); all numbers are
    # non-negative, so adding 0.5 and truncating rounds half-up.
    return int(mean(nums) + 0.5)

  

### TASK 1

In [None]:

###########################################################
##### TASK 1 ##############################################
########################################################### 

def _clean_text_column(column_name):
    """Build a Column holding the lower-cased, punctuation-free text of ``column_name``."""
    # Local import: `trim` is not part of the notebook's import cell.
    from pyspark.sql.functions import trim

    # Each step works on the result of the previous one. Punctuation becomes a
    # space (so "rib-eye" stays two words instead of fusing into "ribeye"),
    # then every run of whitespace, including newlines, collapses to one space.
    # NOTE(review): quantities such as "1/2" become "1 2"; confirm this is
    # acceptable for the ingredients text.
    without_punctuation = regexp_replace(col(column_name), r'[^\w\s]+', ' ')
    single_spaced = regexp_replace(without_punctuation, r'\s+', ' ')
    return lower(trim(single_spaced))


def _minutes_or_null(iso_duration):
    """Null-safe wrapper around ``get_time`` for use as a Spark UDF."""
    return None if iso_duration is None else get_time(iso_duration)


def _servings_as_int(recipe_yield):
    """Wrap ``extract_serving_value`` so the UDF always returns a Python int."""
    if recipe_yield is None:
        return 1  # same default as a yield without any digits
    # str() covers a yield column that was read as a number; adding 0.5 and
    # truncating rounds a fractional average half-up, because a float returned
    # from an IntegerType UDF would be stored as null.
    return int(extract_serving_value(str(recipe_yield)) + 0.5)


def PrePocessing(df):
    """Task 1: turn the raw recipe frame into typed, cleaned columns.

    Args:
        df: Raw recipe DataFrame. It must contain the columns
            ``datePublished``, ``name``, ``cookTime``, ``ingredients``,
            ``prepTime`` and ``recipeYield``.

    Returns:
        A DataFrame with exactly these columns, in this order:
        ``Date Published`` (date parsed from ``datePublished``),
        ``Dish_Name`` and ``Ingridients`` (cleaned, lower-cased text),
        ``Cook_Time`` and ``Prep_Time`` (integer minutes; nulls are replaced by
        the rounded mean of the non-null values of the same column) and
        ``Serving`` (integer serving count).
    """
    minutes_udf = udf(_minutes_or_null, IntegerType())
    serving_udf = udf(_servings_as_int, IntegerType())

    # Selecting the derived expressions directly yields the final column set,
    # so the source columns never have to be dropped afterwards.
    prepared = df.select(
        to_date(col("datePublished")).alias("Date Published"),
        _clean_text_column("name").alias("Dish_Name"),
        _clean_text_column("ingredients").alias("Ingridients"),
        minutes_udf(col("cookTime")).alias("Cook_Time"),
        minutes_udf(col("prepTime")).alias("Prep_Time"),
        serving_udf(col("recipeYield")).alias("Serving"),
    )

    # Compute both means in a single pass over the data.
    averages = prepared.agg(
        avg("Cook_Time").alias("Cook_Time"),
        avg("Prep_Time").alias("Prep_Time"),
    ).first()

    # A mean is None when the whole column is null; fillna rejects None, so
    # such a column is simply left untouched.
    fill_values = {
        time_column: int(averages[time_column] + 0.5)
        for time_column in ("Cook_Time", "Prep_Time")
        if averages[time_column] is not None
    }
    if fill_values:
        prepared = prepared.fillna(fill_values)

    return prepared






+--------------+--------------------+--------------------+---------+---------+-------+
|Date Published|           Dish_Name|         Ingridients|Cook_Time|Prep_Time|Serving|
+--------------+--------------------+--------------------+---------+---------+-------+
|    2010-10-14|creamy cheese gri...|4-1/2 cups water\...|       45|        5|      8|
|    2010-10-20|     big steak salad|2 whole rib-eye o...|       20|        1|      4|
|    2010-10-26|my favorite turke...|3 cups apple juic...|       15|       10|     18|
|    2010-10-27|spaghetti squash ...|2 whole medium sp...|       60|       10|      8|
|    2010-11-01|pear clafouti, th...|2 whole pears\n2 ...|       45|       15|     12|
|    2010-10-29|soul sweet ‘tater...|4 whole medium sw...|       30|       45|     10|
|    2010-11-04|cranberry-pomegra...|1 bag (about 12 t...|       15|        2|     12|
|    2010-11-08|green bean casserole|2 pounds fresh gr...|       30|       20|      8|
|    2010-11-15|    dreamy apple pie|1 whol

### TASK 2

In [None]:

###########################################################
##### TASK 2 ##############################################
########################################################### 

def TASK2DF(df2):
    """Task 2: add the total time and a difficulty label to a recipe frame.

    Args:
        df2: DataFrame with the numeric columns ``Cook_Time`` and ``Prep_Time``.

    Returns:
        The frame with two extra columns: ``Total_cook_time`` (the sum of
        ``Cook_Time`` and ``Prep_Time``) and ``Difficulty``, which is ``Easy``
        below 30, ``Medium`` from 30 to 60 inclusive and ``Hard`` above 60.
        No label is assigned when the total is null.
    """
    with_total = df2.withColumn('Total_cook_time', col('Cook_Time') + col('Prep_Time'))

    total_time = col('Total_cook_time')
    difficulty_label = (
        when(total_time < 30, 'Easy')
        .when(total_time.between(30, 60), 'Medium')  # between() is inclusive on both ends
        .when(total_time > 60, 'Hard')
    )

    return with_total.withColumn('Difficulty', difficulty_label)


  

In [None]:
# Processed frame per input file, keyed by the file name without its extension
# (e.g. "recipes-000").
df = {}

# Iterate over all the files saved on the cloud storage and run both tasks on each of them.
for file_info in input_loc:
    file_location = file_info.path
    # "recipes-000.json" -> "recipes-000". Derived once from the file name
    # instead of slicing the full path at fixed offsets in several places.
    dataset_key = file_info.name.rsplit(".", 1)[0]
    output_folder = f"wasbs://{out_container}@{storage_account_name}.blob.core.windows.net/Output-{dataset_key}/"

    # ##### TASK 1
    df2 = PrePocessing(spark.read.format(inp_file_type).load(file_location))
    # Persist the dataframe: it is reused by the queries and the dashboard below.
    df2 = df2.persist()

    # ##### TASK 2
    df2 = TASK2DF(df2)
    df[dataset_key] = df2

    # Beef recipe extraction
    # NOTE(review): beef_recipe is built here, but the CSV written below is
    # aggregated from the full frame, not from beef_recipe. Confirm which of
    # the two is the intended output.
    df2.createOrReplaceTempView('EXAM')
    beef_recipe = spark.sql("select * from EXAM where (Ingridients like '%beef%') or (Dish_Name like '%beef%')")

    # Average total cooking time per difficulty level for this file.
    df2.createOrReplaceTempView('AvgCookDiff')
    Avg_T_D = spark.sql("select Difficulty , ROUND(AVG(Total_cook_time),0) as AVGCook_Time from AvgCookDiff group by Difficulty")

    ##### Writing the final output to the Azure cloud storage
    # coalesce(1) makes Spark write a single part file into the output folder.
    (Avg_T_D.coalesce(1).write.mode("overwrite").option("header", "true").format(out_file_type).save(output_folder))

    # Locate the part file that was just written to the output folder.
    files = dbutils.fs.ls(output_folder)
    output_file = [x for x in files if x.name.startswith("part-")]
    if not output_file:
        raise FileNotFoundError(f"No part file was written to {output_folder}")

    # Give the result a short, predictable name. output_folder already ends
    # with a slash, so the destination no longer contains a double slash.
    dbutils.fs.mv(output_file[0].path, f"{output_folder}Final-Recipee-{dataset_key[-3:]}.csv")




### CREATING DASHBOARD

In [None]:
# Combine the three processed frames stored in `df` by the processing loop into
# a single frame for the dashboard. The three keys are hard-coded, so all three
# input files must have been processed.
finalDF = df['recipes-000'].union(df['recipes-001']).union(df['recipes-002'])


In [None]:
# Render the combined frame; the dashboard visualisations are built from this output.
display(finalDF)

Date Published,Dish_Name,Ingridients,Cook_Time,Prep_Time,Serving,Total_cook_time,Difficulty
2010-10-14,creamy cheese grits with chilies,"4-1/2 cups water 1/2 teaspoon salt 1 cup grits (quick or regular) 1/2 can (10 ounce can) rotel (tomatoes and chilies) 1 can (4 ounce can) chopped green chilies 8 ounces, weight monterey jack cheese, grated 4 ounces, weight cream cheese, cut into cubes 1/4 teaspoon cayenne pepper 1/4 teaspoon paprika  black pepper to taste 1 whole egg beaten",45,5,8.0,50,Medium
2010-10-20,big steak salad,"2 whole rib-eye or strip steaks, extra fat trimmed  _____  for the dressing/marinade: 3/4 cups canola oil 3 tablespoons red wine vinegar 1 tablespoon balsamic vinegar 1 tablespoon worcestershire sauce 2 tablespoons soy sauce 1 teaspoon (additional) soy sauce 2 tablespoons lime juice 2 tablespoons sugar 3 cloves garlic, peeled 1 tablespoon minced fresh ginger 1/2 teaspoon hot chili oil 1 teaspoon kosher salt  lots of freshly ground black pepper  _____  for the onion strings: 2 whole onions, sliced as thin as possible 2 cups buttermilk 2 cups flour 1 tablespoon salt 1/2 teaspoon cayenne pepper 1 quart canola oil  black pepper to taste  _____  for the candied pecan bits: 1/2 cup pecans, chopped 1 cup sugar 2 tablespoons water  _____  for the salad:  lettuce mix: romaine, arugula, watercress, raddiccio, etc.  small grape tomatoes 3/4 cups crumbled blue cheese",20,1,4.0,21,Easy
2010-10-26,my favorite turkey brine,"3 cups apple juice or apple cider 2 gallons cold water 4 tablespoons fresh rosemary leaves 5 cloves garlic, minced 1-1/2 cup kosher salt 2 cups brown sugar 3 tablespoons peppercorns 5 whole bay leaves  peel of three large oranges",15,10,18.0,25,Easy
2010-10-27,spaghetti squash with maple syrup and shallots,"2 whole medium spaghetti squash 3 tablespoons butter 2 whole shallots, finely minced 1/4 cup maple syrup  dash of salt  dash of nutmeg",60,10,8.0,70,Hard
2010-11-01,"pear clafouti, three ways","2 whole pears 2 teaspoons minced or grated ginger 2 tablespoons sugar 2 teaspoons brandy, optional 3/4 cups all-purpose flour 1/2 teaspoon salt 1/2 cup sugar 3 whole eggs, beaten with a fork 2 cups whole milk (substitute half with cream if you desire) 1 teaspoon vanilla",45,15,12.0,60,Medium
2010-10-29,soul sweet ‘taters (step-by-step!),4 whole medium sweet potatoes 1 cup sugar 1 cup milk 2 whole eggs 1 teaspoon vanilla extract 1 teaspoon salt 1 cup brown sugar 1 cup pecans 1/2 cup flour 3/4 sticks butter,30,45,10.0,75,Hard
2010-11-04,cranberry-pomegranate sauce,"1 bag (about 12 to 16 oz) fresh cranberries 16 ounces, fluid pomegranate juice 3/4 cups sugar, more or less to taste",15,2,12.0,17,Easy
2010-11-08,green bean casserole,"2 pounds fresh green beans, ends cut off 4 slices bacon, cut into 1/4 inch pieces 3 cloves garlic, minced 1/2 whole large onion, chopped 4 tablespoons butter 4 tablespoons all-purpose flour 2-1/2 cups whole milk 1/2 cup half-and-half 1-1/2 teaspoon salt, more to taste  freshly ground black pepper, to taste 1/8 teaspoon cayenne pepper 1 cup grated sharp cheddar cheese 1 jar (4 ounce) sliced pimentoes, drained  extra milk for thinning if necessary 1 cup panko bread crumbs",30,20,8.0,50,Medium
2010-11-15,dreamy apple pie,"1 whole unbaked pie crust  filling 3 whole large (4 or 5 small) granny smith apples, peeled, cored, and sliced thin 1/2 cup brown sugar 1/2 cup sugar 1 tablespoon all-purpose flour 1 cup heavy cream 2 teaspoons vanilla extract 1/8 teaspoon cinnamon  topping 7 tablespoons butter 3/4 cups all-purpose flour 1/2 cup brown sugar 1/4 cup pecans (more to taste)  dash of salt",60,20,12.0,80,Hard
2010-11-14,hard sauce,"1 stick (1/2 cup) softened (not room temperature) butter 1-1/2 cup powdered sugar 2 tablespoons whiskey, more or less to taste",0,5,8.0,5,Easy


Output can only be rendered in Databricks

Output can only be rendered in Databricks

Output can only be rendered in Databricks

Output can only be rendered in Databricks

Output can only be rendered in Databricks

Output can only be rendered in Databricks

Output can only be rendered in Databricks