In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession.builder \
    .appName("itv023333") \
    .master("yarn") \
    .config("spark.ui.port", "0") \
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark

In [3]:
orders_schema = "customer_id long, order_date date, order_id long, order_status string"

In [4]:
orders_df = spark.read \
.format("csv") \
.schema(orders_schema) \
.load("/public/trendytech/orders/orders_1gb.csv")

In [5]:
orders_df.show()

+-----------+----------+--------+---------------+
|customer_id|order_date|order_id|   order_status|
+-----------+----------+--------+---------------+
|          1|2013-07-25|   11599|         CLOSED|
|          2|2013-07-25|     256|PENDING_PAYMENT|
|          3|2013-07-25|   12111|       COMPLETE|
|          4|2013-07-25|    8827|         CLOSED|
|          5|2013-07-25|   11318|       COMPLETE|
|          6|2013-07-25|    7130|       COMPLETE|
|          7|2013-07-25|    4530|       COMPLETE|
|          8|2013-07-25|    2911|     PROCESSING|
|          9|2013-07-25|    5657|PENDING_PAYMENT|
|         10|2013-07-25|    5648|PENDING_PAYMENT|
|         11|2013-07-25|     918| PAYMENT_REVIEW|
|         12|2013-07-25|    1837|         CLOSED|
|         13|2013-07-25|    9149|PENDING_PAYMENT|
|         14|2013-07-25|    9842|     PROCESSING|
|         15|2013-07-25|    2568|       COMPLETE|
|         16|2013-07-25|    7276|PENDING_PAYMENT|
|         17|2013-07-25|    2667|       COMPLETE|


In [6]:
orders_df.rdd.getNumPartitions()

9

lets apply a groupBy()

In [7]:
orders_df.groupBy("order_status").count().write.format("csv").mode("overwrite").save("groupBy")

lets perform joins and broadcast joins

In [8]:
left_data = [
    ("101",1,"closed"),
    ("102",2,"complete"),
    ("103",3,"pending"),
    ("104",4,"closed")
]

In [9]:
left_df= spark.createDataFrame(left_data,["order_id", "cust_id","status string"])

In [10]:
left_df.show()

+--------+-------+-------------+
|order_id|cust_id|status string|
+--------+-------+-------------+
|     101|      1|       closed|
|     102|      2|     complete|
|     103|      3|      pending|
|     104|      4|       closed|
+--------+-------+-------------+



In [11]:
left_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- status string: string (nullable = true)



In [12]:
right_data = [
    (1,"banglore"),
    (2,"pune"),
    (5,"mumbai")
]

In [13]:
right_df = spark.createDataFrame(right_data, ["cust_id","city"])

In [14]:
right_df.show()

+-------+--------+
|cust_id|    city|
+-------+--------+
|      1|banglore|
|      2|    pune|
|      5|  mumbai|
+-------+--------+



In [15]:
right_df.printSchema()

root
 |-- cust_id: long (nullable = true)
 |-- city: string (nullable = true)



In [16]:
left_df.join(right_df,left_df.cust_id == right_df.cust_id,"inner").show()

+--------+-------+-------------+-------+--------+
|order_id|cust_id|status string|cust_id|    city|
+--------+-------+-------------+-------+--------+
|     101|      1|       closed|      1|banglore|
|     102|      2|     complete|      2|    pune|
+--------+-------+-------------+-------+--------+



In [17]:
left_df.join(right_df,left_df.cust_id == right_df.cust_id,"left").show()

+--------+-------+-------------+-------+--------+
|order_id|cust_id|status string|cust_id|    city|
+--------+-------+-------------+-------+--------+
|     101|      1|       closed|      1|banglore|
|     103|      3|      pending|   null|    null|
|     102|      2|     complete|      2|    pune|
|     104|      4|       closed|   null|    null|
+--------+-------+-------------+-------+--------+



In [18]:
left_df.join(right_df,left_df.cust_id == right_df.cust_id,"right").show()

+--------+-------+-------------+-------+--------+
|order_id|cust_id|status string|cust_id|    city|
+--------+-------+-------------+-------+--------+
|    null|   null|         null|      5|  mumbai|
|     101|      1|       closed|      1|banglore|
|     102|      2|     complete|      2|    pune|
+--------+-------+-------------+-------+--------+



In [19]:
left_df.join(right_df,left_df.cust_id == right_df.cust_id,"full").show()

+--------+-------+-------------+-------+--------+
|order_id|cust_id|status string|cust_id|    city|
+--------+-------+-------------+-------+--------+
|    null|   null|         null|      5|  mumbai|
|     101|      1|       closed|      1|banglore|
|     103|      3|      pending|   null|    null|
|     102|      2|     complete|      2|    pune|
|     104|      4|       closed|   null|    null|
+--------+-------+-------------+-------+--------+



In [20]:
cust_schema = "customer_id long, fname string, lname string, username string, password string, address string, city string, state string, pincode long"

In [21]:
cust_df = spark.read \
.format("csv") \
.schema(cust_schema) \
.load("/public/trendytech/retail_db/customers")

In [22]:
cust_df.show()

+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|customer_id|      fname|    lname| username| password|             address|         city|state|pincode|
+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|          1|    Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|          2|       Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|          3|        Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|          4|       Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|          5|     Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|          6|       Mary|    Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ|   7055|
|          7|    Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|

In [23]:
orders_df.join(cust_df, orders_df.customer_id == cust_df.customer_id, "inner").show()

+-----------+----------+--------+---------------+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|customer_id|order_date|order_id|   order_status|customer_id|      fname|    lname| username| password|             address|         city|state|pincode|
+-----------+----------+--------+---------------+-----------+-----------+---------+---------+---------+--------------------+-------------+-----+-------+
|          1|2013-07-25|   11599|         CLOSED|          1|    Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|          2|2013-07-25|     256|PENDING_PAYMENT|          2|       Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|          3|2013-07-25|   12111|       COMPLETE|          3|        Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|          4|2013-07-25|    8827|         CLOSED|          4|       Mary|    Jones

In [24]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10 * 1024 * 1024)


In [25]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [26]:
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")


'-1'

In [None]:
orders_df.join(cust_df, orders_df.customer_id == cust_df.customer_id, "inner").write.format("noop").mode("overwrite").save()