In [49]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
appName("Sneha Spark Session").\
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [50]:
spark

In [3]:
order_schema = 'order_id long, order_date date, customer_id long, order_status string'

In [4]:
order_df = spark.read.\
format("csv").\
schema(order_schema).\
load("/public/trendytech/orders/orders_1gb.csv")

In [5]:
order_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [6]:
order_df.write.format("csv").saveAsTable("itv017244_cachingdemo_db.itv017244_order1")

AnalysisException: Table `itv017244_cachingdemo_db`.`itv017244_order1` already exists.

In [7]:
spark.sql("Describe extended itv017244_cachingdemo_db.itv017244_order1").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|              bigint|   null|
|          order_date|                date|   null|
|         customer_id|              bigint|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|itv017244_caching...|       |
|               Table|    itv017244_order1|       |
|               Owner|           itv017244|       |
|        Created Time|Mon Feb 24 13:41:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.1.2|       |
|                Type|             MANAGED|       |
|            Provider|                 csv|       |
|          Statistics|     840836656 bytes|       |
|            Location|hdfs://m01.itvers...|       |
|       Serd

In [8]:
spark.sql("select count(*) from  itv017244_cachingdemo_db.itv017244_order1").show()

+--------+
|count(1)|
+--------+
|25831126|
+--------+



In [9]:
spark.sql("cache table itv017244_cachingdemo_db.itv017244_order1")

In [10]:
spark.sql("select count(*) from  itv017244_cachingdemo_db.itv017244_order1").show()

+--------+
|count(1)|
+--------+
|25831126|
+--------+



In [11]:
spark.sql("select distinct(order_status) from itv017244_cachingdemo_db.itv017244_order1").show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|         BOOKED|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



In [12]:
spark.sql("select distinct(order_status) from itv017244_cachingdemo_db.itv017244_order1").count()

10

In [13]:
spark.sql("uncache table itv017244_cachingdemo_db.itv017244_order1")

In [14]:
spark.sql("cache lazy table itv017244_cachingdemo_db.itv017244_order1")

In [15]:
spark.sql("select count(distinct(order_status)) from itv017244_cachingdemo_db.itv017244_order1")

count(DISTINCT order_status)
10


In [16]:
spark.sql("select order_status, count(*) from from itv017244_cachingdemo_db.itv017244_order1 group by order_status").show()

+---------------+-------+
|   order_status|   from|
+---------------+-------+
|PENDING_PAYMENT|5636250|
|       COMPLETE|8587125|
|        ON_HOLD|1424250|
| PAYMENT_REVIEW| 273375|
|         BOOKED|      1|
|     PROCESSING|3103125|
|         CLOSED|2833500|
|SUSPECTED_FRAUD| 584250|
|        PENDING|2853750|
|       CANCELED| 535500|
+---------------+-------+



In [17]:
spark.sql("insert into itv017244_cachingdemo_db.itv017244_order1 values(11111, CAST('2023-05-29' AS DATE), 222222, 'BOOKED')")

In [18]:
spark.sql("select count(distinct(order_status)) from itv017244_cachingdemo_db.itv017244_order1")

count(DISTINCT order_status)
10


In [19]:
spark.sql("clear cache")

In [20]:
spark.sql("cache table itv017244_cachingdemo_db.itv017244_order1")

In [21]:
spark.catalog.currentDatabase()

'default'

In [22]:
spark.sql("use itv017244_cachingdemo_db")

In [24]:
spark.catalog.isCached("itv017244_cachingdemo_db.itv017244_order1")

True

In [27]:
spark.catalog.uncacheTable("itv017244_cachingdemo_db.itv017244_order1")

In [28]:
spark.catalog.isCached("itv017244_cachingdemo_db.itv017244_order1")

False

In [30]:
spark.catalog.clearCache()

In [31]:
spark.catalog.isCached("itv017244_cachingdemo_db.itv017244_order1")

False

In [33]:
spark.sql("create database itv017244_cachingdemo_ext")

In [77]:
spark.sql("drop table itv017244_cachingdemo_ext.itv017244_order_ext")

In [78]:
spark.sql("create table itv017244_cachingdemo_ext.itv017244_order_ext(order_id long, order_date string, customer_id long, order_status string)\
using csv location '/user/itv017244/order/'")

In [79]:
spark.sql("select * from  itv017244_cachingdemo_ext.itv017244_order_ext limit 10")

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


In [80]:
spark.sql("select count(*) from  itv017244_cachingdemo_ext.itv017244_order_ext")

count(1)
68883


In [81]:
spark.sql("describe extended itv017244_cachingdemo_ext.itv017244_order_ext").show(30,False)

+----------------------------+---------------------------------------------------------+-------+
|col_name                    |data_type                                                |comment|
+----------------------------+---------------------------------------------------------+-------+
|order_id                    |bigint                                                   |null   |
|order_date                  |string                                                   |null   |
|customer_id                 |bigint                                                   |null   |
|order_status                |string                                                   |null   |
|                            |                                                         |       |
|# Detailed Table Information|                                                         |       |
|Database                    |itv017244_cachingdemo_ext                                |       |
|Table                       |

In [82]:
spark.sql("cache table itv017244_cachingdemo_ext.itv017244_order_ext")

In [83]:
spark.sql("insert into itv017244_cachingdemo_ext.itv017244_order_ext values(11111, CAST('2023-05-29' AS DATE), 222222, 'BOOKED')")

In [84]:
spark.sql("select count(*) from itv017244_cachingdemo_ext.itv017244_order_ext")

count(1)
68884
