In [10]:
import getpass

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
# The OS login name is used to build a per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config("spark.dynamicAllocation.enabled", "True")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [9]:
# Display the active SparkSession object as the cell output.
spark

In [43]:
# Read back the dynamic-allocation flag from the session's runtime configuration.
spark.conf.get("spark.dynamicAllocation.enabled")

'True'

In [4]:
# DDL schema for the orders file; order_date is kept as a plain string here.
order_schema = ", ".join([
    'order_id long',
    'order_date string ',
    'customer_id long',
    'order_status string',
])

In [5]:
# List the orders.csv file on HDFS with human-readable sizes (-h).
! hadoop fs  -ls -h /user/itv017244/week11Assignment/orders.csv 

-rw-r--r--   3 itv017244 supergroup      3.5 G 2025-03-17 05:53 /user/itv017244/week11Assignment/orders.csv


In [6]:
# Load the orders CSV using the explicit order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/user/itv017244/week11Assignment/orders.csv")
)

In [7]:
# Register (or replace) the temp view "order" so order_df can be queried through spark.sql.
order_df.createOrReplaceTempView("order")

In [26]:
# Count orders per (order_status, month number). order_date is a string in this
# run, so the month is derived by parsing its first 10 characters as a date.
status_by_month_sql = """
SELECT order_status, 
       date_format(to_date(substring(order_date, 1, 10), 'yyyy-MM-dd'), 'MM') AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status, date_format(to_date(substring(order_date, 1, 10), 'yyyy-MM-dd'), 'MM')
"""
spark.sql(status_by_month_sql).show()

+---------------+--------+---------+
|   order_status|MonthNum|NoOfOrder|
+---------------+--------+---------+
|        ON_HOLD|      05|   386250|
|       COMPLETE|      07|  2417500|
|         CLOSED|      09|   845000|
|         CLOSED|      01|   791250|
|SUSPECTED_FRAUD|      09|   185000|
|       CANCELED|      02|   156250|
|SUSPECTED_FRAUD|      06|   163750|
|        PENDING|      07|   847500|
|       COMPLETE|      08|  2350000|
|       CANCELED|      08|   156250|
|SUSPECTED_FRAUD|      10|   135000|
|        ON_HOLD|      04|   416250|
| PAYMENT_REVIEW|      07|    91250|
|SUSPECTED_FRAUD|      07|   167500|
|SUSPECTED_FRAUD|      01|   163750|
|        ON_HOLD|      03|   403750|
|       CANCELED|      06|   130000|
|       COMPLETE|      06|  2246250|
|PENDING_PAYMENT|      07|  1665000|
| PAYMENT_REVIEW|      08|    68750|
+---------------+--------+---------+
only showing top 20 rows



In [27]:
# Print every plan stage (explain(True)) for the per-status, per-month count
# when order_date has to be parsed from a string.
status_by_month_sql = """
SELECT order_status, 
       date_format(to_date(substring(order_date, 1, 10), 'yyyy-MM-dd'), 'MM') AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status, date_format(to_date(substring(order_date, 1, 10), 'yyyy-MM-dd'), 'MM')
"""
spark.sql(status_by_month_sql).explain(True)

== Parsed Logical Plan ==
'Aggregate ['order_status, 'date_format('to_date('substring('order_date, 1, 10), yyyy-MM-dd), MM)], ['order_status, 'date_format('to_date('substring('order_date, 1, 10), yyyy-MM-dd), MM) AS MonthNum#93, 'count('order_id) AS NoOfOrder#94]
+- 'UnresolvedRelation [order], [], false

== Analyzed Logical Plan ==
order_status: string, MonthNum: string, NoOfOrder: bigint
Aggregate [order_status#3, date_format(cast(to_date(substring(order_date#1, 1, 10), Some(yyyy-MM-dd)) as timestamp), MM, Some(America/Toronto))], [order_status#3, date_format(cast(to_date(substring(order_date#1, 1, 10), Some(yyyy-MM-dd)) as timestamp), MM, Some(America/Toronto)) AS MonthNum#93, count(order_id#0L) AS NoOfOrder#94L]
+- SubqueryAlias order
   +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Optimized Logical Plan ==
Aggregate [order_status#3, date_format(cast(cast(gettimestamp(substring(order_date#1, 1, 10), yyyy-MM-dd, Some(America/Toronto), false) as date) a

In [44]:
# DDL schema for the orders file with order_date typed as a date.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
    'customer_id long',
    'order_status string',
])

In [45]:
# Re-read the orders CSV with whatever order_schema is currently defined.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/user/itv017244/week11Assignment/orders.csv")
)

In [46]:
# Point the temp view "order" at the current order_df, replacing any earlier registration.
order_df.createOrReplaceTempView("order")

In [38]:
# Count orders per (order_status, month number), formatting order_date directly
# without any substring/to_date parsing.
status_by_month_date_sql = """
SELECT order_status, 
       date_format(order_date,'MM') AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status, date_format(order_date, 'MM')
"""
spark.sql(status_by_month_date_sql).show()

+---------------+--------+---------+
|   order_status|MonthNum|NoOfOrder|
+---------------+--------+---------+
|        ON_HOLD|      05|   386250|
|       COMPLETE|      07|  2417500|
|         CLOSED|      09|   845000|
|         CLOSED|      01|   791250|
|SUSPECTED_FRAUD|      09|   185000|
|       CANCELED|      02|   156250|
|SUSPECTED_FRAUD|      06|   163750|
|        PENDING|      07|   847500|
|       COMPLETE|      08|  2350000|
|       CANCELED|      08|   156250|
|SUSPECTED_FRAUD|      10|   135000|
|        ON_HOLD|      04|   416250|
| PAYMENT_REVIEW|      07|    91250|
|SUSPECTED_FRAUD|      07|   167500|
|SUSPECTED_FRAUD|      01|   163750|
|        ON_HOLD|      03|   403750|
|       CANCELED|      06|   130000|
|       COMPLETE|      06|  2246250|
|PENDING_PAYMENT|      07|  1665000|
| PAYMENT_REVIEW|      08|    68750|
+---------------+--------+---------+
only showing top 20 rows



In [39]:
# Print every plan stage (explain(True)) for the per-status, per-month count
# when date_format is applied to order_date directly.
status_by_month_date_sql = """
SELECT order_status, 
       date_format(order_date,'MM') AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status, date_format(order_date, 'MM')
"""
spark.sql(status_by_month_date_sql).explain(True)

== Parsed Logical Plan ==
'Aggregate ['order_status, 'date_format('order_date, MM)], ['order_status, 'date_format('order_date, MM) AS MonthNum#169, 'count('order_id) AS NoOfOrder#170]
+- 'UnresolvedRelation [order], [], false

== Analyzed Logical Plan ==
order_status: string, MonthNum: string, NoOfOrder: bigint
Aggregate [order_status#139, date_format(cast(order_date#137 as timestamp), MM, Some(America/Toronto))], [order_status#139, date_format(cast(order_date#137 as timestamp), MM, Some(America/Toronto)) AS MonthNum#169, count(order_id#136L) AS NoOfOrder#170L]
+- SubqueryAlias order
   +- Relation[order_id#136L,order_date#137,customer_id#138L,order_status#139] csv

== Optimized Logical Plan ==
Aggregate [order_status#139, date_format(cast(order_date#137 as timestamp), MM, Some(America/Toronto))], [order_status#139, date_format(cast(order_date#137 as timestamp), MM, Some(America/Toronto)) AS MonthNum#169, count(order_id#136L) AS NoOfOrder#170L]
+- Project [order_id#136L, order_date#137

In [47]:
# Aggregate per order_status, take one month number (cast to int) per group, sort
# by it, and send the result to the "noop" format with overwrite mode.
# NOTE(review): presumably "noop" is used to execute the query without keeping
# any output (timing run) — confirm.
# NOTE(review): first() has no ordering inside the group, so which month is
# reported per status is not guaranteed to be stable between runs.
spark.sql("""
SELECT order_status, 
       first(int(date_format(order_date,'MM'))) AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status order by  MonthNum
""").write.format("noop").mode("overwrite").save()

In [48]:
# Switch back to the schema variant in which order_date is a plain string.
order_schema = ", ".join([
    'order_id long',
    'order_date string ',
    'customer_id long',
    'order_status string',
])

In [49]:
# Read the orders CSV again so order_df reflects the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/user/itv017244/week11Assignment/orders.csv")
)

In [50]:
# Re-register the temp view "order" so SQL queries see the freshly loaded order_df.
order_df.createOrReplaceTempView("order")

In [51]:
# Same per-status aggregation as a string month (no int cast), sorted by that
# month and sent to the "noop" format with overwrite mode.
# NOTE(review): presumably "noop" is used to execute the query without keeping
# any output (timing run) — confirm.
# NOTE(review): first() has no ordering inside the group, so which month is
# reported per status is not guaranteed to be stable between runs.
spark.sql("""
SELECT order_status, 
       first(date_format(order_date,'MM')) AS MonthNum, 
       count(order_id) AS NoOfOrder
FROM order 
GROUP BY order_status order by  MonthNum
""").write.format("noop").mode("overwrite").save()

In [52]:
# Print every plan stage for an outer filter applied on top of an already
# filtered subquery, to see how the two predicates end up in the optimized plan.
# NOTE(review): order_status cannot equal both 'COMPLETED' and 'OPEN', so this
# query can never return rows; it is only useful for inspecting the plan.
# NOTE(review): sample outputs in this notebook show the status value 'COMPLETE',
# not 'COMPLETED' — confirm the literal if the query is ever meant to return data.
spark.sql("""
SELECT order_status, customer_id 
FROM (select order_status, customer_id  from order where order_status =='COMPLETED')
where order_status = 'OPEN'
""").explain(True)

== Parsed Logical Plan ==
'Project ['order_status, 'customer_id]
+- 'Filter ('order_status = OPEN)
   +- 'SubqueryAlias __auto_generated_subquery_name
      +- 'Project ['order_status, 'customer_id]
         +- 'Filter ('order_status = COMPLETED)
            +- 'UnresolvedRelation [order], [], false

== Analyzed Logical Plan ==
order_status: string, customer_id: bigint
Project [order_status#179, customer_id#178L]
+- Filter (order_status#179 = OPEN)
   +- SubqueryAlias __auto_generated_subquery_name
      +- Project [order_status#179, customer_id#178L]
         +- Filter (order_status#179 = COMPLETED)
            +- SubqueryAlias order
               +- Relation[order_id#176L,order_date#177,customer_id#178L,order_status#179] csv

== Optimized Logical Plan ==
Project [order_status#179, customer_id#178L]
+- Filter ((isnotnull(order_status#179) AND (order_status#179 = COMPLETED)) AND (order_status#179 = OPEN))
   +- Relation[order_id#176L,order_date#177,customer_id#178L,order_status#179] c

In [61]:
# Three-column schema (no order_status) used for orders2.csv.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
    'customer_id long',
])

In [62]:
# Load orders2.csv from the schema-evolution demo dataset with the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders2.csv")
)

In [63]:
# Append order_df to the table Week11_assignment1_order1, stored at the given path,
# with the mergeSchema option set on the writer.
# NOTE(review): append mode adds the same rows again every time this cell is re-run.
# NOTE(review): the table path is the week11Assignment directory itself, which other
# cells in this notebook also use for orders.csv and the order1 dataset — confirm
# that sharing this location is intended.
order_df.write.mode("append").option("mergeSchema",True).option("path","/user/itv017244/week11Assignment/").saveAsTable("Week11_assignment1_order1")

In [64]:
# Display the current contents of the Week11_assignment1_order1 table.
spark.sql("select * from Week11_assignment1_order1").show()


+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
+--------+----------+-----------+



In [65]:
# Four-column schema (adds order_status) used for orders3.csv.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
    'customer_id long',
    'order_status string',
])

In [68]:
# Load orders3.csv from the schema-evolution demo dataset with the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders3.csv")
)

In [69]:
# Try to append the wider (4-column) frame to the existing 3-column table.
# The recorded run of this cell raised AnalysisException (column count
# mismatch) even though mergeSchema is set on the writer. That failure is the
# point of the cell, so it is caught and printed instead of aborting a
# Restart & Run All.
try:
    (
        order_df.write
        .mode("append")
        .option("mergeSchema", True)
        .option("path", "/user/itv017244/week11Assignment/")
        .saveAsTable("Week11_assignment1_order1")
    )
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: The column number of the existing table default.Week11_assignment1_order1(struct<order_id:bigint,order_date:date,customer_id:bigint>) doesn't match the data schema(struct<order_id:bigint,order_date:date,order_status:string,customer_id:bigint>)

In [70]:
# Two-column schema (id and date only) used for orders1.csv.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
])

In [71]:
# Load orders1.csv from the schema-evolution demo dataset with the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders1.csv")
)

In [72]:
# Try to append the narrower (2-column) frame to the existing 3-column table.
# The recorded run of this cell raised AnalysisException (column count
# mismatch) even though mergeSchema is set on the writer. That failure is the
# point of the cell, so it is caught and printed instead of aborting a
# Restart & Run All.
try:
    (
        order_df.write
        .mode("append")
        .option("mergeSchema", True)
        .option("path", "/user/itv017244/week11Assignment/")
        .saveAsTable("Week11_assignment1_order1")
    )
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: The column number of the existing table default.Week11_assignment1_order1(struct<order_id:bigint,order_date:date,customer_id:bigint>) doesn't match the data schema(struct<order_id:bigint,order_date:date>)

In [74]:
# Append the current order_df to the order1 directory as Parquet files.
# The format is stated explicitly rather than left to the session default
# (spark.sql.sources.default), because this directory is read back with
# format("parquet") later in the notebook.
# NOTE: append mode adds the same rows again every time this cell is re-run.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/week11Assignment/order1")
    .save()
)

In [76]:
# Read the order1 directory as Parquet with the mergeSchema read option enabled.
merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/week11Assignment/order1")
)

In [77]:
# Display the rows and columns currently visible through merged_df.
merged_df.show()

+--------+----------+
|order_id|order_date|
+--------+----------+
|       1|2013-07-25|
|       2|2013-07-25|
+--------+----------+



In [78]:
# Four-column schema (id, date, customer, status) for the next file to load.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
    'customer_id long',
    'order_status string',
])

In [79]:
# Load orders3.csv (demo dataset) into order_df using the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders3.csv")
)

In [82]:
# Preview the frame that was just loaded, before it is written out.
order_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
+--------+----------+-----------+------------+



In [85]:
# Append the current order_df to the order1 directory as Parquet files.
# The format is stated explicitly rather than left to the session default
# (spark.sql.sources.default), so every file written to this directory has
# the format that the explicit Parquet read elsewhere in the notebook expects.
# NOTE: append mode adds the same rows again every time this cell is re-run.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/week11Assignment/order1")
    .save()
)

In [86]:
# Read the order1 directory back with schema merging enabled.
# The Parquet format is named explicitly, matching the first read of this
# directory in the notebook, instead of depending on the session default
# source (spark.sql.sources.default).
merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/week11Assignment/order1")
)

In [87]:
# Display the merged view of the order1 directory after the latest append.
merged_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+



In [88]:
# Three-column schema (id, date, customer) for the next file to load.
order_schema = ", ".join([
    'order_id long',
    'order_date date ',
    'customer_id long',
])

In [89]:
# Load orders2.csv (demo dataset) into order_df using the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/datasets/parquet-schema-evol-demo/csv/orders2.csv")
)

In [93]:
# Preview the current contents of order_df.
order_df.show()

+--------+----------+-----------+
|order_id|order_date|customer_id|
+--------+----------+-----------+
|       3|2013-07-25|      12111|
|       4|2013-07-25|       8827|
+--------+----------+-----------+



In [90]:
# Append the current order_df to the order1 directory as Parquet files.
# The format is stated explicitly rather than left to the session default
# (spark.sql.sources.default), keeping every file in this directory in the
# format that the explicit Parquet read elsewhere in the notebook expects.
# NOTE: append mode adds the same rows again every time this cell is re-run.
(
    order_df.write
    .format("parquet")
    .mode("append")
    .option("path", "/user/itv017244/week11Assignment/order1")
    .save()
)

In [91]:
# Re-read the order1 directory with schema merging enabled.
# The Parquet format is named explicitly, matching the first read of this
# directory in the notebook, instead of depending on the session default
# source (spark.sql.sources.default).
merged_df = (
    spark.read
    .format("parquet")
    .option("mergeSchema", True)
    .load("/user/itv017244/week11Assignment/order1")
)

In [92]:
# Display the merged view of the order1 directory after the most recent append.
merged_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|       5|2013-07-25|      11318|    COMPLETE|
|       6|2013-07-25|       7130|    COMPLETE|
|       3|2013-07-25|      12111|        null|
|       4|2013-07-25|       8827|        null|
|       1|2013-07-25|       null|        null|
|       2|2013-07-25|       null|        null|
+--------+----------+-----------+------------+



In [11]:
# Schema for the 1 GB orders file used in the compression comparison.
order_schema = ", ".join([
    'order_id long',
    'order_date date',
    'customer_id long',
    'order_status string',
])

In [14]:
# Load orders_1gb.csv into order_df using the current order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [15]:
# Write order_df as ORC with the "lzo" compression option, coalesced to a single
# partition and overwriting whatever is already at the target path.
(
    order_df.coalesce(1)
    .write
    .format("orc")
    .option("compression", "lzo")
    .mode("overwrite")
    .option("path", "/user/itv017244/datasets/compression-techniques-demo/orc_lzo")
    .save()
)

In [16]:
# Write order_df as ORC with the "snappy" compression option, coalesced to a
# single partition and overwriting whatever is already at the target path.
(
    order_df.coalesce(1)
    .write
    .format("orc")
    .option("compression", "snappy")
    .mode("overwrite")
    .option("path", "/user/itv017244/datasets/compression-techniques-demo/orc_snappy")
    .save()
)