In [38]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession.builder \
    .appName("cache_pyspark2") \
    .master("yarn") \
    .config("spark.ui.port", "0") \
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()



In [39]:
spark

In [5]:
cust_schema = "customer_id long, purchase_date date, product_id long, amount double"
cust_df = spark.read \
.format("csv") \
.schema(cust_schema) \
.load("/public/trendytech/datasets/cust_transf.csv")

In [70]:
start_date = "2023-05-01"
end_date = "2023-06-08"
filtered_df = cust_df.filter((cust_df.purchase_date >= start_date) & (cust_df.purchase_date <= end_date))

In [30]:
top_product = filtered_df.groupBy('product_id').sum('amount').withColumnRenamed("sum(amount)", 'amount')
top_products_no_cache  = top_product.sort("amount", ascending = False).limit(10).show()

+----------+--------------------+
|product_id|              amount|
+----------+--------------------+
|      1003| 5.725592243903786E8|
|      1001|  5.56682641192824E8|
|      1002|4.2938362439486486E8|
|      1004| 2.862080244027619E8|
|      1005| 2.782856412021384E8|
|      1015|  12537.909999999963|
|      1014|  11492.909999999963|
|      1013|  10447.909999999963|
|      1012|   9402.909999999965|
|      1011|   8357.909999999967|
+----------+--------------------+



with cache()

In [6]:
filtered_cached_df = cust_df.filter((cust_df.purchase_date >= start_date) & (cust_df.purchase_date <= end_date)).cache()

In [7]:
top_product_cache = filtered_cached_df.groupBy('product_id').sum('amount').withColumnRenamed("sum(amount)", 'amount')
top_products_no_cache  = top_product_cache.sort("amount", ascending = False).limit(10).show()

+----------+--------------------+
|product_id|              amount|
+----------+--------------------+
|      1003| 5.725592243903786E8|
|      1001| 5.566826411928239E8|
|      1002| 4.293836243948648E8|
|      1004|2.8620802440276194E8|
|      1005| 2.782856412021384E8|
|      1015|  12537.909999999963|
|      1014|  11492.909999999963|
|      1013|  10447.909999999963|
|      1012|   9402.909999999965|
|      1011|   8357.909999999967|
+----------+--------------------+



In [8]:
top_products_no_cache  = top_product_cache.sort("amount", ascending = False).limit(10).show()

+----------+--------------------+
|product_id|              amount|
+----------+--------------------+
|      1003| 5.725592243903786E8|
|      1001|  5.56682641192824E8|
|      1002| 4.293836243948648E8|
|      1004|2.8620802440276194E8|
|      1005| 2.782856412021384E8|
|      1015|  12537.909999999963|
|      1014|  11492.909999999963|
|      1013|  10447.909999999963|
|      1012|   9402.909999999965|
|      1011|   8357.909999999967|
+----------+--------------------+



In [41]:
cust_df.show(5)

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
+-----------+-------------+----------+------+
only showing top 5 rows



In [42]:
top_cust = filtered_df.groupBY('customer_id').sum('amount').withColumnRenamed("sum(amount)",'transactio_amt')
top_cust_df = top_cust.sort('customer_id',ascending=False).limit(10).show()

AttributeError: 'DataFrame' object has no attribute 'groupBY'

In [45]:
spark.sql("drop table itv023333.customers")

In [46]:
spark.sql("drop database itv023333")

In [47]:
spark.sql("create database itv023333")

In [48]:
cust_df.write.format("csv").saveAsTable("itv023333.customers")

In [49]:
spark.sql("select * from itv023333.customers limit 5")

customer_id,purchase_date,product_id,amount
1012,2023-06-12,1004,19.99
1013,2023-06-13,1005,24.99
1014,2023-06-14,1001,49.99
1015,2023-06-15,1002,29.99
1001,2023-05-15,1001,49.99


In [31]:
spark.sql("select product_id, sum(amount) as amount from itv023333.customers where purchase_date between '2023-05-01' and '2023-06-08' group by product_id order by amount desc limit 10")

product_id,amount
1003,572559224.3903785
1001,556682641.1928239
1002,429383624.3948648
1004,286208024.40276194
1005,278285641.2021384
1015,12537.909999999963
1014,11492.909999999963
1013,10447.909999999963
1012,9402.909999999963
1011,8357.909999999967


In [32]:
spark.sql("select customer_id, sum(amount) as amount from itv023333.customers where purchase_date between '2023-05-01' and '2023-06-08' group by customer_id order by amount desc limit 10")

customer_id,amount
1001,318088458.0005336
1004,310134258.0008686
1005,262409058.0015123
1003,214683858.00145328
1002,206729658.0014408
1011,127243741.11049214
1006,127238516.11049213
1012,111336386.11046082
1007,111331161.11046082
1013,95429031.11041903


after caching table itv023333.customers

In [50]:
spark.sql("cache table itv023333.customers")

In [51]:
spark.sql("select product_id, sum(amount) as amount from itv023333.customers where purchase_date between '2023-05-01' and '2023-06-08' group by product_id order by amount desc limit 10")

product_id,amount
1003,572559224.3903786
1001,556682641.1928239
1002,429383624.39486486
1004,286208024.40276194
1005,278285641.2021384
1015,12537.909999999963
1014,11492.909999999963
1013,10447.909999999963
1012,9402.909999999963
1011,8357.909999999967


In [52]:
spark.sql("select customer_id, sum(amount) as amount from itv023333.customers where purchase_date between '2023-05-01' and '2023-06-08' group by customer_id order by amount desc limit 10")

customer_id,amount
1001,318088458.0005336
1004,310134258.0008687
1005,262409058.0015123
1003,214683858.00145325
1002,206729658.0014408
1011,127243741.11049214
1006,127238516.11049213
1012,111336386.1104608
1007,111331161.11046082
1013,95429031.11041903


In [57]:
cust_df.show(5)

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
+-----------+-------------+----------+------+
only showing top 5 rows



In [61]:
cust_df.select('customer_id','purchase_date').distinct().show()

+-----------+-------------+
|customer_id|purchase_date|
+-----------+-------------+
|       1008|   2023-05-24|
|       1002|   2023-05-26|
|       1002|   2023-06-05|
|       1013|   2023-06-13|
|       1009|   2023-06-14|
|       1006|   2023-06-01|
|       1011|   2023-06-06|
|       1013|   2023-05-29|
|       1004|   2023-06-07|
|       1007|   2023-06-07|
|       1004|   2023-05-18|
|       1001|   2023-05-30|
|       1002|   2023-05-31|
|       1007|   2023-06-12|
|       1001|   2023-06-04|
|       1009|   2023-06-04|
|       1005|   2023-06-08|
|       1003|   2023-05-22|
|       1012|   2023-06-12|
|       1005|   2023-06-03|
+-----------+-------------+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import *

In [8]:
new_cust_df = cust_df.withColumn("purchase_year",year("purchase_date")).withColumn("purchase_month",month("purchase_date"))

In [9]:
cut_month_count = new_cust_df.groupBy('customer_id','purchase_date','purchase_month') \
.agg(countDistinct('purchase_month')) \
.alias("distinct_months")

In [10]:
regular_customers = cut_month_count.filter("distinct_months = 1") \
.groupBy("customer_id").count() \
.orderBy("count", ascending=False).limit(10)
regular_customers.show()

AnalysisException: "cannot resolve '`distinct_months`' given input columns: [distinct_months.customer_id, distinct_months.purchase_date, distinct_months.purchase_month, distinct_months.count(DISTINCT purchase_month)]; line 1 pos 0;\n'Filter ('distinct_months = 1)\n+- SubqueryAlias `distinct_months`\n   +- Aggregate [customer_id#0L, purchase_date#1, purchase_month#27], [customer_id#0L, purchase_date#1, purchase_month#27, count(distinct purchase_month#27) AS count(DISTINCT purchase_month)#41L]\n      +- Project [customer_id#0L, purchase_date#1, product_id#2L, amount#3, purchase_year#21, month(purchase_date#1) AS purchase_month#27]\n         +- Project [customer_id#0L, purchase_date#1, product_id#2L, amount#3, year(purchase_date#1) AS purchase_year#21]\n            +- Relation[customer_id#0L,purchase_date#1,product_id#2L,amount#3] csv\n"

In [12]:
cust_df.show()

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
|       1001|   2023-05-20|      1002| 29.99|
|       1002|   2023-05-21|      1003| 39.99|
|       1003|   2023-05-22|      1004| 19.99|
|       1004|   2023-05-23|      1005| 24.99|
|       1005|   2023-05-24|      1001| 49.99|
|       1001|   2023-05-25|      1003| 39.99|
|       1002|   2023-05-26|      1004| 19.99|
|       1003|   2023-05-27|      1005| 24.99|
|       1004|   2023-05-28|      1001| 49.99|
|       1005|   2023-05-29|      1002| 29.99|
|       1001|   2023-05-30|      1003| 39.99|
|       1002|   2023-05-31|      1004| 19.99|
|       1003|   2023-06-01|      1005| 24.99|
|       1004|   2023-06-02|      1

In [14]:
cust_df.count()

87498290

persist dataframe

In [13]:
from pyspark.storagelevel import StorageLevel 


In [15]:
persist_cust_df = cust_df.persist()

In [16]:
persist_cust_df.count()

87498290

In [17]:
persist_cust_df.count()

87498290

In [20]:
persist_cust_df.unpersist()

customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


In [21]:
persist_cust_df = cust_df.persist(StorageLevel.MEMORY_AND_DISK_SER)

In [22]:
persist_cust_df.count()

87498290

In [23]:
persist_cust_df.unpersist()

customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


In [24]:
persist_cust_df = cust_df.persist(StorageLevel(True,False,False,False,1))
persist_cust_df.count()

87498290

In [25]:
persist_cust_df.unpersist()

customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


In [26]:
persist_cust_df = cust_df.persist(StorageLevel(False,True,False,True,1))
persist_cust_df.count()

87498290

In [28]:
persist_cust_df.unpersist() 


customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


In [30]:
spark.sql("uncache table itv023333.customers") 

In [33]:
spark.sql("create table itv023333.hotel_df(booking_id int,guest_name string, checkin_date date, checkout_date date, room_type string,total_price double) using CSV location '/public/trendytech/datasets/hotel_data.csv'")

In [34]:
spark.sql("select * from itv023333.hotel_df limit 5")

booking_id,guest_name,checkin_date,checkout_date,room_type,total_price
1,John Doe,2023-05-01,2023-05-05,Standard,400.0
2,Jane Smith,2023-05-02,2023-05-06,Deluxe,600.0
3,Mark Johnson,2023-05-03,2023-05-08,Standard,450.0
4,Sarah Wilson,2023-05-04,2023-05-07,Executive,750.0
5,Emily Brown,2023-05-06,2023-05-09,Deluxe,550.0


In [40]:
spark.sql("select count(booking_id) from itv023333.hotel_df")

count(booking_id)
107


In [43]:
spark.sql("cache table itv023333.hotel_df")

In [44]:
spark.sql("select count(booking_id) from itv023333.hotel_df")

count(booking_id)
107


In [45]:
spark.sql("uncache table itv023333.hotel_df")

In [47]:
spark.sql("select room_type,avg(total_price) as avg_price from itv023333.hotel_df group by room_type").show(100)

+---------+-----------------+
|room_type|        avg_price|
+---------+-----------------+
|Executive|            750.0|
|   Deluxe|575.5813953488372|
| Standard|            425.0|
+---------+-----------------+



In [48]:
spark.sql("cache table itv023333.hotel_df")

In [49]:
spark.sql("select room_type,avg(total_price) as avg_price from itv023333.hotel_df group by room_type").show(100)

+---------+-----------------+
|room_type|        avg_price|
+---------+-----------------+
|Executive|            750.0|
|   Deluxe|575.5813953488372|
| Standard|            425.0|
+---------+-----------------+



In [50]:
spark.sql("uncache table itv023333.hotel_df")