In [1]:
from pyspark.sql import SparkSession
import getpass
# OS user running the kernel; used to build the per-user warehouse path below.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled session with master set to 'yarn'.
spark = (
    SparkSession.builder
    .appName("Sneha Spark Session")
    .config("spark.shuffle.useOldFetchProtocol", 'true')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Display the session object as the cell output to confirm it was created.
spark

In [3]:
# DDL-style schema string for the orders CSV: each entry is "<column> <type>".
order_schema = ", ".join([
    "order_id long",
    "order_date date",
    "customer_id long",
    "order_status string",
])

In [4]:
# Read the orders CSV with the explicit schema defined in order_schema.
order_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [5]:
# Number of partitions of the RDD underlying order_df.
order_df.rdd.getNumPartitions()

9

In [6]:
# Count orders per status and write the result as CSV to "output101",
# replacing any previous output at that path.
(
    order_df
    .groupBy("order_status")
    .count()
    .write
    .format("csv")
    .mode("overwrite")
    .save("output101")
)

In [7]:
# Same per-status count, written to the "noop" format with no output path.
# NOTE(review): presumably used to trigger the job without persisting results — confirm.
(
    order_df
    .groupBy("order_status")
    .count()
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [19]:
# Register order_df as the temp view "orders" so it can be queried through spark.sql.
order_df.createOrReplaceTempView("orders")

In [20]:
# Per-status order count expressed in SQL against the "orders" temp view.
spark.sql("select order_status, count(*) from orders group by order_status").show()

+---------------+--------+
|   order_status|count(1)|
+---------------+--------+
|PENDING_PAYMENT| 5636250|
|       COMPLETE| 8587125|
|        ON_HOLD| 1424250|
| PAYMENT_REVIEW|  273375|
|     PROCESSING| 3103125|
|         CLOSED| 2833500|
|SUSPECTED_FRAUD|  584250|
|        PENDING| 2853750|
|       CANCELED|  535500|
+---------------+--------+



In [21]:
# DDL-style schema string for the customers CSV, built from adjacent literals.
# Spacing after each comma is kept exactly as in the original string.
customer_schema = (
    "customer_id long,"
    "customer_fname string, "
    "customer_lname string,"
    "username string, "
    "password string,"
    "address string, "
    "city string, "
    "state string, "
    "pincode long"
)

In [22]:
# Read the customers CSV data with the explicit schema defined in customer_schema.
customer_df = (
    spark.read
    .format("csv")
    .schema(customer_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [23]:
# Preview the customers DataFrame; show() prints a limited number of rows
# and truncates long string values (see the "..." in the address column).
customer_df.show()

+-----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customer_id|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+-----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|          1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|          2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|          3|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|          4|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|          5|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|          6|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passai

In [24]:
# Preview the orders DataFrame.
order_df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [45]:
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

'-1'

In [44]:
spark.conf.set('spark.sql.autoBroadcastJoinThreshold','-1')

In [27]:
# Inner join of orders and customers on customer_id, written to the "noop" format.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [28]:
# Set the auto-broadcast threshold to 10485760 bytes (10 * 1024 * 1024).
# NOTE(review): presumably re-enables broadcast joins for the joins that follow — confirm.
spark.conf.set('spark.sql.autoBroadcastJoinThreshold','10485760b')

In [29]:
# Read the threshold back to confirm the value currently in effect.
spark.conf.get('spark.sql.autoBroadcastJoinThreshold')

'10485760b'

In [30]:
# Inner join of orders and customers on customer_id, written to the "noop" format.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="inner",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [31]:
# Left join: orders is the left side, customers the right; written to the "noop" format.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="left",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [32]:
# Right join: orders is the left side, customers the right; written to the "noop" format.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="right",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [33]:
# Full outer join of orders and customers on customer_id, written to the "noop" format.
(
    order_df
    .join(
        customer_df,
        on=order_df.customer_id == customer_df.customer_id,
        how="full",
    )
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [34]:
# NOTE(review): wildcard import. pyspark.sql.functions exports names such as
# sum, min, max and round, which shadow the Python builtins from this cell on.
# In the visible notebook only broadcast is referenced (in the commented-out
# cell below); consider importing just the names that are needed.
from pyspark.sql.functions import *

In [None]:
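## NOTE(review): disabled experiment kept for reference. It would broadcast order_df
## (loaded from orders_1gb.csv) — presumably why it is commented out; confirm before re-enabling.
## customer_df.join(broadcast(order_df),order_df.customer_id==customer_df.customer_id,"left").write.format("noop").mode("overwrite").save()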

In [36]:
# Register customer_df as the temp view "customers" so it can be queried through spark.sql.
customer_df.createOrReplaceTempView("customers")

In [37]:
# SQL inner join of the "orders" and "customers" temp views on customer_id,
# written to the "noop" format.
(
    spark
    .sql("select * from orders inner join customers on orders.customer_id == customers.customer_id")
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [38]:
# Read the "ordersnew" dataset, reusing the orders schema string.
order_new_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/retail_db/ordersnew")
)

In [39]:
# Per-status counts for the new orders dataset, collected to the driver as a list of Rows.
# In the recorded output COMPLETE (46008801 rows) is far larger than every other status.
order_new_df.groupBy("order_status").count().collect()

[Row(order_status='PENDING_PAYMENT', count=5636250),
 Row(order_status='COMPLETE', count=46008801),
 Row(order_status='ON_HOLD', count=1424250),
 Row(order_status='PAYMENT_REVIEW', count=273375),
 Row(order_status='PROCESSING', count=3103125),
 Row(order_status='CLOSED', count=2833500),
 Row(order_status='SUSPECTED_FRAUD', count=584250),
 Row(order_status='PENDING', count=2853750),
 Row(order_status='CANCELED', count=535500)]

In [40]:
# Preview the new orders DataFrame.
order_new_df.show()

+--------+----------+-----------+------------+
|order_id|order_date|customer_id|order_status|
+--------+----------+-----------+------------+
|    2480|2013-08-07|       3807|    COMPLETE|
|   30479|2014-01-30|       9265|    COMPLETE|
|    2481|2013-08-07|       2476|    COMPLETE|
|   30481|2014-01-30|       9240|    COMPLETE|
|    2483|2013-08-07|      10453|    COMPLETE|
|   30484|2014-01-30|       2876|    COMPLETE|
|    2484|2013-08-07|       9256|    COMPLETE|
|   30485|2014-01-30|       1069|    COMPLETE|
|    2488|2013-08-07|       1255|    COMPLETE|
|   30486|2014-01-30|       1151|    COMPLETE|
|    2491|2013-08-07|        247|    COMPLETE|
|   30487|2014-01-30|       6772|    COMPLETE|
|    2495|2013-08-07|       9011|    COMPLETE|
|   30489|2014-01-30|       5717|    COMPLETE|
|    2498|2013-08-07|       1966|    COMPLETE|
|   30490|2014-01-30|      12189|    COMPLETE|
|    2511|2013-08-07|       8544|    COMPLETE|
|   30492|2014-01-30|       3710|    COMPLETE|
|    2515|201

In [41]:
# DDL-style schema string for the status-to-code mapping file: a string status and an int code.
mapping_schema = "status string, code int"

In [42]:
# Read the pipe-delimited status-to-code mapping data with the explicit schema.
mapping_df = (
    spark.read
    .format("csv")
    .option("delimiter", "|")
    .schema(mapping_schema)
    .load("/public/trendytech/datasets/mapping_data")
)

In [43]:
# Display the mapping DataFrame (one row per order status in the recorded output).
mapping_df.show()

+---------------+----+
|         status|code|
+---------------+----+
|PENDING_PAYMENT|   1|
|       COMPLETE|   2|
|        ON_HOLD|   3|
| PAYMENT_REVIEW|   4|
|     PROCESSING|   5|
|         CLOSED|   6|
|SUSPECTED_FRAUD|   7|
|        PENDING|   8|
|       CANCELED|   9|
+---------------+----+



In [46]:
# Inner join of the new orders with the status-to-code mapping on the status string,
# written to the "noop" format.
# NOTE(review): this cell ran as In [46], right after In [44] set
# spark.sql.autoBroadcastJoinThreshold to '-1'. On a top-to-bottom run the value in
# effect here is the '10485760b' set by the later-positioned In [28] cell, so the
# join plan may differ from the recorded run. Set the threshold explicitly in this
# section if a specific strategy is intended.
order_new_df.join(mapping_df,order_new_df.order_status==mapping_df.status,"inner").write.format("noop").mode("overwrite").save()