In [1]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
# The SQL warehouse directory is derived from the current OS user name,
# i.e. /user/<username>/warehouse.
username = getpass.getuser()
spark = (
    SparkSession.builder
    .appName("Sneha Spark Session")
    .config("spark.shuffle.useOldFetchProtocol", 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Echo the session object as the cell's output to confirm it was created.
spark

In [3]:
# DDL-style schema string for the orders CSV, assembled one column per entry.
order_schema = ", ".join([
    "order_id long",
    "order_date date",
    "customer_id long",
    "order_status string",
])

In [4]:
# Read the orders CSV with the explicit schema declared in order_schema
# (no schema inference is requested).
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [6]:
# Preview the first two rows to check that the CSV parsed against order_schema.
order_df.show(2)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
+--------+----------+-----------+---------------+
only showing top 2 rows



In [5]:
# Expose order_df to Spark SQL under the temp-view name "orders".
order_df.createOrReplaceTempView("orders")

In [7]:
# Query the "orders" temp view through Spark SQL; the bare expression is the
# value the cell displays.
spark.sql("select * from orders limit 2")

order_id,order_date,customer_id,order_status
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT


In [8]:
# DDL-style schema string for the customers data set. The pieces below are
# adjacent string literals, so they concatenate into one string unchanged.
customer_schema = (
    "customer_id long,customer_fname string, customer_lname string,"
    "username string, password string,"
    "address string, city string, state string, pincode long"
)

In [9]:
# Read the customers CSV data with the explicit schema from customer_schema.
customer_df = (
    spark.read
    .format("csv")
    .schema(customer_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [10]:
# Preview the first two customer rows to check the schema was applied.
customer_df.show(2)

+-----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|customer_id|customer_fname|customer_lname| username| password|             address|       city|state|pincode|
+-----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
|          1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|          2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
+-----------+--------------+--------------+---------+---------+--------------------+-----------+-----+-------+
only showing top 2 rows



In [11]:
# Expose customer_df to Spark SQL under the temp-view name "customers".
customer_df.createOrReplaceTempView("customers")

In [12]:
# Query the "customers" temp view through Spark SQL; the bare expression is
# the value the cell displays.
spark.sql("select * from customers limit 2")

customer_id,customer_fname,customer_lname,username,password,address,city,state,pincode
1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers...,Littleton,CO,80126


In [13]:
# Inner-join orders to customers on customer_id and send the result to the
# "noop" sink, so the join executes without any output being stored.
# Expected strategy with the session's current settings: broadcast hash join.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [14]:
# A threshold of -1 turns automatic broadcast joins off, so the joins that
# follow must use a shuffle-based strategy.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [15]:
# Same inner join on customer_id, executed against the "noop" sink.
# With auto-broadcast disabled, the expected strategy is a sort-merge join.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [16]:
# Same inner join on customer_id, but the customers side carries the
# "shuffle_hash" hint to request a shuffled hash join.
(
    order_df
    .join(
        customer_df.hint("shuffle_hash"),
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [17]:
# Read the current Adaptive Query Execution flag for this session.
spark.conf.get("spark.sql.adaptive.enabled")

'false'

In [18]:
# Switch Adaptive Query Execution on for the rest of the session.
spark.conf.set("spark.sql.adaptive.enabled","true")

In [19]:
# Read the flag back to confirm the change took effect.
spark.conf.get("spark.sql.adaptive.enabled")

'true'

In [20]:
# Re-declare the orders schema (same value as the earlier definition) before
# the DataFrames are rebuilt with adaptive execution enabled.
order_schema = (
    "order_id long, "
    "order_date date, "
    "customer_id long, "
    "order_status string"
)

In [21]:
# Rebuild order_df from the same CSV and schema as the earlier read.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [22]:
# Re-declare the customers schema (same value as the earlier definition).
# Adjacent string literals concatenate into a single unchanged string.
customer_schema = (
    "customer_id long,"
    "customer_fname string, customer_lname string,"
    "username string, password string,"
    "address string, city string, state string, pincode long"
)

In [23]:
# Rebuild customer_df from the same path and schema as the earlier read.
customer_df = (
    spark.read
    .format("csv")
    .schema(customer_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [24]:
# Keep automatic broadcast joins disabled (-1); this repeats the setting made
# earlier in the notebook so the following joins stay shuffle-based.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [25]:
# Inner join on customer_id against the "noop" sink, now with adaptive
# execution enabled. Expected strategy: sort-merge join.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [26]:
# Inner join on customer_id against the "noop" sink, with the "shuffle_hash"
# hint on the customers side to request a shuffled hash join.
(
    order_df
    .join(
        customer_df.hint("shuffle_hash"),
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [27]:
# The same inner join written in SQL over the "orders" and "customers" temp
# views, executed against the "noop" sink. Expected strategy: sort-merge join.
(
    spark.sql(
        "select * from orders inner join customers on (orders.customer_id ==customers.customer_id)"
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [28]:
# Sample employee rows, one tuple per employee. Field order follows
# employee_schema: id, name, doj, department id, gender, salary.
# Note that doj and the department id are stored as strings, and the third
# row has an empty-string gender.
employee_data = [
    (10, "Raj", "1999", "100", "M", 2000),
    (20, "Rahul", "2002", "200", "M", 2000),
    (30, "Raghav", "2010", "100", "", 2000),
    (40, "Reema", "2004", "100", "F", 2000),
    (50, "Rina", "2008", "400", "F", 2000),
    (60, "Rasul", "2014", "500", "M", 2000),
]

In [29]:
# Column names for the employee DataFrame, in tuple-field order.
employee_schema = [
    "employee_id",
    "name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

In [30]:
# Build the employee DataFrame from the in-memory rows. Only column names are
# supplied, so column types come from the Python values themselves.
employeeDf = spark.createDataFrame(employee_data, employee_schema)

In [31]:
# Preview the first two employee rows.
employeeDf.show(2)

+-----------+-----+----+----------------+------+------+
|employee_id| name| doj|employee_dept_id|gender|salary|
+-----------+-----+----+----------------+------+------+
|         10|  Raj|1999|             100|     M|  2000|
|         20|Rahul|2002|             200|     M|  2000|
+-----------+-----+----+----------------+------+------+
only showing top 2 rows



In [32]:
# Sample department rows as (dept_name, dept_id).
# Every row carries dept_id 100, so any employee in department 100 matches
# all four departments when joined.
# NOTE(review): distinct ids per department may have been intended - confirm.
department_data = [
    ("HR", 100),
    ("Supply", 100),
    ("Sales", 100),
    ("Stock", 100),
]

In [33]:
# Column names for the department rows, then the DataFrame built from them.
department_schema = [
    "dept_name",
    "dept_id",
]
departmentDf = spark.createDataFrame(department_data, department_schema)

# The table is tiny, so display it in full.
departmentDf.show()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|       HR|    100|
|   Supply|    100|
|    Sales|    100|
|    Stock|    100|
+---------+-------+



In [34]:
# Left outer join: keep every employee and attach department columns where
# employee_dept_id equals dept_id (nulls where nothing matches).
# The employee key is a string and the department key an integer, so the
# comparison relies on Spark's implicit type coercion.
df_left_join = employeeDf.join(
    departmentDf,
    on=employeeDf.employee_dept_id == departmentDf.dept_id,
    how="left_outer",
)

In [35]:
# Display the left outer join result; unmatched employees show null
# department columns.
df_left_join.show()

+-----------+------+----+----------------+------+------+---------+-------+
|employee_id|  name| doj|employee_dept_id|gender|salary|dept_name|dept_id|
+-----------+------+----+----------------+------+------+---------+-------+
|         60| Rasul|2014|             500|     M|  2000|     null|   null|
|         10|   Raj|1999|             100|     M|  2000|       HR|    100|
|         10|   Raj|1999|             100|     M|  2000|   Supply|    100|
|         10|   Raj|1999|             100|     M|  2000|    Sales|    100|
|         10|   Raj|1999|             100|     M|  2000|    Stock|    100|
|         30|Raghav|2010|             100|      |  2000|       HR|    100|
|         30|Raghav|2010|             100|      |  2000|   Supply|    100|
|         30|Raghav|2010|             100|      |  2000|    Sales|    100|
|         30|Raghav|2010|             100|      |  2000|    Stock|    100|
|         40| Reema|2004|             100|     F|  2000|       HR|    100|
|         40| Reema|2004|

In [37]:
# Semi join: keep only employees that have at least one matching department,
# returning the employee columns only and one row per employee.
df_semi_join = employeeDf.join(
    departmentDf,
    on=employeeDf.employee_dept_id == departmentDf.dept_id,
    how="semi",
)

In [38]:
# Display the semi join result (employee columns only).
df_semi_join.show()

+-----------+------+----+----------------+------+------+
|employee_id|  name| doj|employee_dept_id|gender|salary|
+-----------+------+----+----------------+------+------+
|         30|Raghav|2010|             100|      |  2000|
|         10|   Raj|1999|             100|     M|  2000|
|         40| Reema|2004|             100|     F|  2000|
+-----------+------+----+----------------+------+------+



In [39]:
# Register both DataFrames as temp views so the joins above can be expressed
# in Spark SQL.
departmentDf.createOrReplaceTempView("department_table")
employeeDf.createOrReplaceTempView("employee_table")

# Left outer join: every employee, with department columns where the ids match.
# It gets its own name and its own show() so the result is actually displayed
# instead of being overwritten by the next query.
left_outer_res = spark.sql("""
    SELECT *
    FROM employee_table
    LEFT OUTER JOIN department_table
      ON employee_table.employee_dept_id = department_table.dept_id
""")
left_outer_res.show()

# Left semi join: only employees with at least one matching department, and
# only the employee columns. `res` stays bound to this result, as before.
# The query uses the standard `=` comparison and the explicit LEFT SEMI JOIN
# spelling.
res = spark.sql("""
    SELECT *
    FROM employee_table
    LEFT SEMI JOIN department_table
      ON employee_table.employee_dept_id = department_table.dept_id
""")
res.show()