In [24]:
# Stop any SparkSession left over from an earlier run so that the builder in the
# next cell creates a fresh session instead of reusing the old one via getOrCreate().
# The guard keeps this cell from raising NameError on a fresh kernel, where
# `spark` has not been defined yet.
if "spark" in globals():
    spark.stop()

In [25]:
from pyspark.sql import SparkSession
import getpass

# The current OS user name is used to build a per-user warehouse path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession on YARN.
# NOTE(review): spark.ui.port=0 presumably lets Spark pick any free UI port — confirm.
spark = (
    SparkSession.builder
    .appName("cache")
    .master("yarn")
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)


In [26]:
# Display the session object to confirm it was created.
spark

In [27]:
# DDL-style schema string for the orders file: one "name type" entry per column.
orders_schema = ", ".join([
    "order_id long",
    "order_date date",
    "customer_id long",
    "order_status string",
])

In [28]:
# Read the orders CSV, applying the explicit orders_schema to the columns.
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)


In [29]:
# Print a sample of rows to check that the file parsed into the expected columns.
orders_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [9]:
# Confirm the column names and types match orders_schema.
orders_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [10]:
# Count all rows. Read top to bottom, no cache() call precedes this cell,
# so it serves as the uncached baseline for the timings below.
orders_df.count()

25831125

In [12]:
# cache() only marks the DataFrame for caching; nothing is stored until an
# action (such as count()) runs on it.
# NOTE(review): PySpark's cache() returns the DataFrame it was called on, so
# orders_cached and orders_df should refer to the same object — confirm.
orders_cached = orders_df.cache()

The cell below takes longer because Spark first has to cache the data and then calculate the count.

In [13]:
# First action after cache(): reads the data, fills the cache, then counts.
orders_cached.count()

25831125

The cell below is fast because the data is already cached, so Spark reads it from memory.

In [None]:
# Second count on the cached DataFrame.
# NOTE(review): this cell has no execution count or output and duplicates the
# cell that follows — consider removing one of them.
orders_cached.count()

In [15]:
# Repeat the count; this run is expected to be served from the cache.
orders_cached.count()

25831125

In [6]:
# Mark orders_df itself for caching. The call is the cell's last expression,
# so the returned DataFrame is rendered as a preview.
# NOTE(review): execution count 6 is out of sequence — this cell appears to have
# run before the session was rebuilt in In [25]; confirm it still applies to the
# current orders_df when the notebook is run top to bottom.
orders_df.cache()

order_id,order_date,customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,4530,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,5657,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT


In [30]:
# Count on the full orders_df.
# NOTE(review): whether this is served from a cache depends on the order the
# cells were actually executed in — verify in the Spark UI Storage tab.
orders_df.count()

25831125

In [31]:
# Request caching of a narrower DataFrame: two columns, CLOSED orders only.
# The result is not bound to a name; as the cell's last expression it is
# simply rendered as a preview.
(
    orders_df
    .select('order_id', 'order_status')
    .filter("order_status = 'CLOSED'")
    .cache()
)

order_id,order_status
1,CLOSED
4,CLOSED
12,CLOSED
18,CLOSED
24,CLOSED
25,CLOSED
37,CLOSED
51,CLOSED
57,CLOSED
61,CLOSED


In [32]:
# Count the full orders_df again, after the filtered projection above was
# marked for caching. NOTE(review): presumably run to compare timing — confirm.
orders_df.count()

25831125

In [39]:
# Keep a handle on the cached projection of CLOSED orders (order_id and
# order_status only) so later cells can run actions on it.
cached_df = (
    orders_df
    .select('order_id', 'order_status')
    .filter("order_status == 'CLOSED'")
    .cache()
)

In [40]:
# Run an action on cached_df; this counts the CLOSED orders.
cached_df.count()

2833500

In [41]:
# Request caching of the order_id column for CLOSED orders. The filter refers to
# order_status although only order_id is selected. The result is not bound to
# a name; it is only rendered as the cell's output.
(
    orders_df
    .select('order_id')
    .filter("order_status == 'CLOSED'")
    .cache()
)

order_id
1
4
12
18
24
25
37
51
57
61


In [42]:
# Same projection and filter as the previous cell (written with `=` rather
# than `==`), followed by count().
# NOTE(review): presumably checks whether the plan cached above is reused for
# an equivalent query — confirm in the Spark UI.
(
    orders_df
    .select('order_id')
    .filter("order_status = 'CLOSED'")
    .count()
)

2833500