
# DAY 2 â€“ Apache Spark Fundamentals

## Learnings
- Spark architecture (Driver, Executors, DAG)
- DataFrames vs RDDs
- Lazy evaluation
- Databricks magic commands


In [0]:

events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/",
    header=True,
    inferSchema=True
)
events.show(10)


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.s

In [0]:

events.select("event_type", "product_id", "price").show(10)


+----------+----------+------+
|event_type|product_id| price|
+----------+----------+------+
|      view|   1003461|489.07|
|      view|   5000088|293.65|
|      view|  17302664| 28.31|
|      view|   3601530|712.87|
|      view|   1004775|183.27|
|      view|   1306894|360.09|
|      view|   1306421|514.56|
|      view|  15900065| 30.86|
|      view|  12708937| 72.72|
|      view|   1004258|732.07|
+----------+----------+------+
only showing top 10 rows


In [0]:

events.filter("price > 100").count()


72300800

In [0]:

events.groupBy("event_type").count().show()


+----------+---------+
|event_type|    count|
+----------+---------+
|  purchase|  1659788|
|      cart|  3955446|
|      view|104335509|
+----------+---------+



In [0]:

top_brands = (
    events.groupBy("brand")
          .count()
          .orderBy("count", ascending=False)
          .limit(5)
)
top_brands.show()


+-------+--------+
|  brand|   count|
+-------+--------+
|   NULL|15331243|
|samsung|13172020|
|  apple|10381933|
| xiaomi| 7721825|
| huawei| 2521331|
+-------+--------+



In [0]:

top_brands.write.mode("overwrite").parquet("/Volumes/workspace/ecommerce/ecommerce_data/output/")
