In [1]:
from pyspark.sql import SparkSession
import getpass
# The OS login name is used to build a per-user Hive warehouse directory.
username = getpass.getuser()

# Build a Hive-enabled session on YARN, or reuse one that already exists
# (getOrCreate). The builder calls are kept in their original order.
spark = (
    SparkSession.builder
    .appName("Sneha Spark Session")
    .config("spark.shuffle.useOldFetchProtocol", 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook displays the session object's repr as this
# cell's output, which is a quick check that the session was created.
spark

In [3]:
# DDL-style schema string for the orders file: one "name type" entry per column,
# joined with ", " into the single string that DataFrameReader.schema() receives.
order_schema = ", ".join(
    [
        "order_id long",
        "order_date date",
        "customer_id long",
        "order_status string",
    ]
)

In [4]:
# Location of the orders CSV on the cluster file system.
orders_csv_path = "/public/trendytech/orders/orders_1gb.csv"

# Read the file as CSV, applying the explicit column names and types
# declared in order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load(orders_csv_path)
)

In [5]:
# Print the leading rows of order_df as a text table (show() with its default
# arguments) to confirm the schema was applied to the columns.
order_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [6]:
# Register order_df as the temporary view "order" so the spark.sql(...) cells
# in this notebook can refer to it by name.
# NOTE(review): `order` is also a SQL keyword; the queries here resolve it as a
# view name, but confirm that before enabling stricter (ANSI) parsing.
order_df.createOrReplaceTempView("order")

In [7]:
# Select every column of the "order" temp view; the resulting DataFrame is the
# cell's value, so the notebook renders it as the output.
spark.sql("select * from order")

order_id,order_date,customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE
6,2013-07-25,7130,COMPLETE
7,2013-07-25,4530,COMPLETE
8,2013-07-25,2911,PROCESSING
9,2013-07-25,5657,PENDING_PAYMENT
10,2013-07-25,5648,PENDING_PAYMENT


In [8]:
# Demonstration of a syntax error: "fra" is written instead of "from", so the
# statement is rejected by the SQL parser with a ParseException.
# The exception is caught and printed rather than left to propagate, so the
# error text is still shown but Restart & Run All does not stop at this cell.
from pyspark.sql.utils import ParseException

try:
    spark.sql("select * fra order")
except ParseException as parse_error:
    print(f"{type(parse_error).__name__}: {parse_error}")

ParseException: 
extraneous input 'order' expecting {<EOF>, ';'}(line 1, pos 13)

== SQL ==
select * fra order
-------------^^^


In [9]:
# Demonstration of an analysis error: the statement is syntactically valid, but
# no table or view named "orders" is registered (the view is called "order"),
# so Spark raises an AnalysisException.
# The exception is caught and printed rather than left to propagate, so the
# error text is still shown but Restart & Run All does not stop at this cell.
from pyspark.sql.utils import AnalysisException

try:
    spark.sql("select * from orders")
except AnalysisException as analysis_error:
    print(f"{type(analysis_error).__name__}: {analysis_error}")

AnalysisException: Table or view not found: orders; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders], [], false


In [10]:
# Nested query: the inner select keeps rows with order_id < 500 and the outer
# select narrows that result to order_id < 200, projecting two columns.
nested_filter_query = """
select order_id,order_status from (select order_id,customer_id
,order_status from order where order_id<500) where order_id<200
"""

nested_filter_df = spark.sql(nested_filter_query)
nested_filter_df.show()

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
|       3|       COMPLETE|
|       4|         CLOSED|
|       5|       COMPLETE|
|       6|       COMPLETE|
|       7|       COMPLETE|
|       8|     PROCESSING|
|       9|PENDING_PAYMENT|
|      10|PENDING_PAYMENT|
|      11| PAYMENT_REVIEW|
|      12|         CLOSED|
|      13|PENDING_PAYMENT|
|      14|     PROCESSING|
|      15|       COMPLETE|
|      16|PENDING_PAYMENT|
|      17|       COMPLETE|
|      18|         CLOSED|
|      19|PENDING_PAYMENT|
|      20|     PROCESSING|
+--------+---------------+
only showing top 20 rows



In [11]:
# Same nested query as the previous cell, this time inspected instead of run.
nested_filter_query = """
select order_id,order_status from (select order_id,customer_id
,order_status from order where order_id<500) where order_id<200
"""

# extended=True prints the parsed, analyzed and optimized logical plans along
# with the physical plan, which shows how the two filters are combined.
spark.sql(nested_filter_query).explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'order_status]
+- 'Filter ('order_id < 200)
   +- 'SubqueryAlias __auto_generated_subquery_name
      +- 'Project ['order_id, 'customer_id, 'order_status]
         +- 'Filter ('order_id < 500)
            +- 'UnresolvedRelation [order], [], false

== Analyzed Logical Plan ==
order_id: bigint, order_status: string
Project [order_id#0L, order_status#3]
+- Filter (order_id#0L < cast(200 as bigint))
   +- SubqueryAlias __auto_generated_subquery_name
      +- Project [order_id#0L, customer_id#2L, order_status#3]
         +- Filter (order_id#0L < cast(500 as bigint))
            +- SubqueryAlias order
               +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Optimized Logical Plan ==
Project [order_id#0L, order_status#3]
+- Filter ((isnotnull(order_id#0L) AND (order_id#0L < 500)) AND (order_id#0L < 200))
   +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv

== Physical Plan ==
*(1) Filter ((

In [12]:
# DDL-style schema string for the customers data. The adjacent literals are
# concatenated by Python into one string; the spacing after each comma is
# reproduced exactly as in the original single-line definition.
# NOTE(review): pincode is read as long, so any leading zeros present in the
# file would not be preserved - confirm this is intended.
customer_schema = (
    "customer_id long,"
    "customer_fname string, "
    "customer_lname string,"
    "username string, "
    "password string,"
    "address string, "
    "city string, "
    "state string, "
    "pincode long"
)

In [13]:
# Location of the customers data on the cluster file system.
customers_path = "/public/trendytech/retail_db/customers"

# Read the data as CSV, applying the explicit column names and types
# declared in customer_schema.
customer_df = (
    spark.read
    .format("csv")
    .schema(customer_schema)
    .load(customers_path)
)

In [14]:
# Register customer_df as the temporary view "customer" so it can be joined
# with the "order" view from spark.sql(...) queries.
customer_df.createOrReplaceTempView("customer")

In [15]:
# Inner join of the "order" and "customer" views on customer_id, keeping only
# orders whose status is CLOSED. `select *` returns the columns of both views,
# so customer_id appears twice in the result.
closed_orders_query = """ select * from order join customer on order.customer_id == customer.customer_id
where order_status = 'CLOSED'
"""

closed_orders_df = spark.sql(closed_orders_query)
closed_orders_df.show()

+--------+----------+-----------+------------+-----------+--------------+--------------+---------+---------+--------------------+------------+-----+-------+
|order_id|order_date|customer_id|order_status|customer_id|customer_fname|customer_lname| username| password|             address|        city|state|pincode|
+--------+----------+-----------+------------+-----------+--------------+--------------+---------+---------+--------------------+------------+-----+-------+
|       1|2013-07-25|      11599|      CLOSED|      11599|          Mary|        Malone|XXXXXXXXX|XXXXXXXXX|8708 Indian Horse...|     Hickory|   NC|  28601|
|       4|2013-07-25|       8827|      CLOSED|       8827|         Brian|        Wilson|XXXXXXXXX|XXXXXXXXX|   8396 High Corners| San Antonio|   TX|  78240|
|      12|2013-07-25|       1837|      CLOSED|       1837|          Mary|          Vega|XXXXXXXXX|XXXXXXXXX|  4312 Bright Corner|      Caguas|   PR|    725|
|      18|2013-07-25|       1205|      CLOSED|       1205|

In [16]:
# Same join query as the previous cell, this time inspected instead of run.
closed_orders_query = """ select * from order join customer on order.customer_id == customer.customer_id
where order_status = 'CLOSED'
"""

# extended=True prints the logical plans (parsed, analyzed, optimized) as well
# as the physical plan for the join.
spark.sql(closed_orders_query).explain(extended=True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('order_status = CLOSED)
   +- 'Join Inner, ('order.customer_id = 'customer.customer_id)
      :- 'UnresolvedRelation [order], [], false
      +- 'UnresolvedRelation [customer], [], false

== Analyzed Logical Plan ==
order_id: bigint, order_date: date, customer_id: bigint, order_status: string, customer_id: bigint, customer_fname: string, customer_lname: string, username: string, password: string, address: string, city: string, state: string, pincode: bigint
Project [order_id#0L, order_date#1, customer_id#2L, order_status#3, customer_id#90L, customer_fname#91, customer_lname#92, username#93, password#94, address#95, city#96, state#97, pincode#98L]
+- Filter (order_status#3 = CLOSED)
   +- Join Inner, (customer_id#2L = customer_id#90L)
      :- SubqueryAlias order
      :  +- Relation[order_id#0L,order_date#1,customer_id#2L,order_status#3] csv
      +- SubqueryAlias customer
         +- Relation[customer_id#90L,customer_fname#91,customer

In [18]:
# Count orders per customer with three successive restrictions on customer_id:
# an IN list inside the subquery, a narrower IN list outside it, and a HAVING
# clause that keeps only customer_id = 1.
customer_count_query = """ select customer_id ,count(1) from (select * from order
 where customer_id in (1,2,3,4,5)) where customer_id in (1,2,3) group by customer_id
 having customer_id = 1
"""

customer_count_df = spark.sql(customer_count_query)
customer_count_df.show()

+-----------+--------+
|customer_id|count(1)|
+-----------+--------+
|          1|     375|
+-----------+--------+



In [19]:
# Same aggregate query as the previous cell, this time inspected instead of run.
customer_count_query = """ select customer_id ,count(1) from (select * from order
 where customer_id in (1,2,3,4,5)) where customer_id in (1,2,3) group by customer_id
 having customer_id = 1
"""

# extended=True prints the logical plans (parsed, analyzed, optimized) as well
# as the physical plan, showing how the stacked customer_id conditions are handled.
spark.sql(customer_count_query).explain(extended=True)

== Parsed Logical Plan ==
'UnresolvedHaving ('customer_id = 1)
+- 'Aggregate ['customer_id], ['customer_id, unresolvedalias('count(1), None)]
   +- 'Filter 'customer_id IN (1,2,3)
      +- 'SubqueryAlias __auto_generated_subquery_name
         +- 'Project [*]
            +- 'Filter 'customer_id IN (1,2,3,4,5)
               +- 'UnresolvedRelation [order], [], false

== Analyzed Logical Plan ==
customer_id: bigint, count(1): bigint
Filter (customer_id#2L = cast(1 as bigint))
+- Aggregate [customer_id#2L], [customer_id#2L, count(1) AS count(1)#251L]
   +- Filter cast(customer_id#2L as bigint) IN (cast(1 as bigint),cast(2 as bigint),cast(3 as bigint))
      +- SubqueryAlias __auto_generated_subquery_name
         +- Project [order_id#0L, order_date#1, customer_id#2L, order_status#3]
            +- Filter cast(customer_id#2L as bigint) IN (cast(1 as bigint),cast(2 as bigint),cast(3 as bigint),cast(4 as bigint),cast(5 as bigint))
               +- SubqueryAlias order
                  +- Re