In [113]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,udf,lit,lower,unix_timestamp,count,coalesce, regexp_extract,year,when,lit,avg
from pyspark.sql.window import Window
import os 

In [2]:
# Create (or reuse) the single-node Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .master("local")
    .appName("HelloFreshTest")
    .getOrCreate()
)

21/12/04 15:00:22 WARN Utils: Your hostname, Sannis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.191 instead (on interface en0)
21/12/04 15:00:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/12/04 15:00:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [197]:
# Input location handed to spark.read.json below. It is a relative path, so it
# resolves against the kernel's current working directory.
path = 'cooking_etl/src/input/'

In [205]:
# Read the JSON input with an inferred schema and discard the three columns
# that are not used in the exploration that follows.
df = (
    spark.read.json(path)
    .drop("image", "url", "description")
)

In [206]:
# Per-column summary statistics. Every column is a string at this point, so in
# the recorded output mean/stddev/percentiles are null for all columns except
# recipeYield.
df.summary().show()

[Stage 226:>                                                        (0 + 1) / 1]

+-------+--------+-------------+--------------------+--------------------+--------+-----------------+
|summary|cookTime|datePublished|         ingredients|                name|prepTime|      recipeYield|
+-------+--------+-------------+--------------------+--------------------+--------+-----------------+
|  count|    1042|         1042|                1042|                1042|    1042|             1042|
|   mean|    null|         null|                null|                null|    null|9.738404452690167|
| stddev|    null|         null|                null|                null|    null|6.529901914334942|
|    min|        |   2003-05-27| (Quantities Depe...|                    |        |                 |
|    25%|    null|         null|                null|                null|    null|              6.0|
|    50%|    null|         null|                null|                null|    null|              8.0|
|    75%|    null|         null|                null|                null|    null

                                                                                

In [130]:
# Print the column names and types inferred by the JSON reader.
# NOTE(review): the output recorded below still lists description/image/url,
# which the load cell drops — it appears to come from an earlier run (the
# execution counts are out of order); re-run the notebook top to bottom to refresh.
df.printSchema()

root
 |-- cookTime: string (nullable = true)
 |-- datePublished: string (nullable = true)
 |-- description: string (nullable = true)
 |-- image: string (nullable = true)
 |-- ingredients: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prepTime: string (nullable = true)
 |-- recipeYield: string (nullable = true)
 |-- url: string (nullable = true)



In [131]:
# Row count per distinct prepTime value (the values look like ISO-8601
# durations such as PT15M, plus a bare "PT" in the recorded output).
(
    df.groupBy("prepTime")
    .agg(count("*"))
    .show()
)

+--------+--------+
|prepTime|count(1)|
+--------+--------+
|    PT2M|       7|
|   PT15M|     137|
|    PT1H|      11|
|   PT24H|       3|
|  PT950M|       1|
|    PT5M|     108|
|    PT6M|       1|
|    PT3M|       2|
|      PT|       3|
|    PT1M|       5|
|   PT40M|       5|
|   PT20M|      99|
| PT1H15M|       1|
|    PT6H|       3|
|   PT60M|       7|
|    PT4H|       4|
|   PT18H|       1|
|  PT900M|       1|
|   PT65M|       1|
|   PT35M|       3|
+--------+--------+
only showing top 20 rows





In [132]:
# Row count per distinct recipeYield value; truncation is disabled so the
# free-text yields are shown in full.
(
    df.groupBy("recipeYield")
    .agg(count("*"))
    .show(truncate=False)
)

+--------------------------------------------+--------+
|recipeYield                                 |count(1)|
+--------------------------------------------+--------+
|Makes four sandwiches.                      |1       |
|Serves about 4 - 6 as a side.               |1       |
|Makes two mega scones.                      |1       |
|Makes about 1 1/2 to 2 cups of puree.       |1       |
|7                                           |1       |
|Serves 2 - 4.                               |4       |
|Makes ~1 1/2 cups of spread.                |1       |
|Makes 18 - 24 medium cookies.               |1       |
|Makes a big pot - enough for 8 -10 servings.|1       |
|Serves 12 - 16 modest slices.               |1       |
|Makes 2 dozen wide-cut fries.               |1       |
|Serves about 12.                            |2       |
|Makes one generous, family-style platter.   |1       |
|Makes one carrot cake.                      |1       |
|Serves 6 or more.                           |1 

In [133]:
# Total number of rows in df.
df.count()

1042

In [134]:
# Keep the rows whose ingredients text contains "beef", compared
# case-insensitively. This is a plain substring match, so any word that
# merely contains "beef" also qualifies.
beefDF = df.where(
    lower(df["ingredients"]).contains("beef")
)

In [135]:
# Number of rows that passed the "beef" ingredient filter.
beefDF.count()

47

In [136]:
# Data-quality check: show rows whose ingredients are missing.
# "Missing" means null, the empty string, or whitespace-only text. The regex
# ^\s*$ covers the last two cases; a plain == "" comparison would let a value
# such as "  " slip through. rlike yields null for null input, hence the
# explicit isNull() branch.
df.where(
    col("ingredients").isNull() | col("ingredients").rlike(r"^\s*$")
).show()

+--------+-------------+-----------+-----+-----------+----+--------+-----------+---+
|cookTime|datePublished|description|image|ingredients|name|prepTime|recipeYield|url|
+--------+-------------+-----------+-----+-----------+----+--------+-----------+---+
+--------+-------------+-----------+-----+-----------+----+--------+-----------+---+



In [137]:
from pyspark.sql.types import DateType, StringType, StructField, StructType


class RecipeSchema:
    """Explicit Spark schema for the recipe JSON input."""

    # name and ingredients are declared non-nullable; the remaining fields keep
    # the default nullability. datePublished is the only non-string column.
    # NOTE(review): the printSchema() output recorded after loading with this
    # schema reports name/ingredients as "nullable = true", so the JSON reader
    # does not appear to enforce these flags — confirm before relying on them.
    RECIPE = (
        StructType()
        .add("name", StringType(), nullable=False)
        .add("ingredients", StringType(), nullable=False)
        .add("url", StringType())
        .add("image", StringType())
        .add("cookTime", StringType())
        .add("recipeYield", StringType())
        .add("datePublished", DateType())
        .add("prepTime", StringType())
        .add("description", StringType())
    )

In [138]:
# Re-read the input, this time with the explicit recipe schema, and derive a
# `year` column from the publication date.
df = (
    spark.read
    .schema(RecipeSchema.RECIPE)
    .json(path)
    .withColumn("year", year(col("datePublished")))
)

In [139]:
# Check the reloaded frame: in the recorded output datePublished is a date and
# the derived integer `year` column is present.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- ingredients: string (nullable = true)
 |-- url: string (nullable = true)
 |-- image: string (nullable = true)
 |-- cookTime: string (nullable = true)
 |-- recipeYield: string (nullable = true)
 |-- datePublished: date (nullable = true)
 |-- prepTime: string (nullable = true)
 |-- description: string (nullable = true)
 |-- year: integer (nullable = true)



In [141]:
# Number of recipes published in each year.
(
    df.groupBy("year")
    .agg(count("*"))
    .show()
)

+----+--------+
|year|count(1)|
+----+--------+
|2003|       4|
|2007|      94|
|2006|      35|
|2013|      36|
|2004|      12|
|2012|     137|
|2009|     172|
|2005|      44|
|2010|     170|
|2011|     157|
|2008|     181|
+----+--------+

