In [27]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
appName("Sneha Spark Session").\
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [28]:
spark

In [29]:
groceries_df = spark.read.format('csv').option("header", "True").load('/public/trendytech/groceries.csv')

In [30]:
groceries_df.show()

+--------+---------+--------+----------+--------+
|order_id| location|    item|order_date|quantity|
+--------+---------+--------+----------+--------+
|      o1|  Seattle| Bananas|01/01/2017|       7|
|      o2|     Kent|  Apples|02/01/2017|      20|
|      o3| Bellevue| Flowers|02/01/2017|      10|
|      o4|  Redmond|    Meat|03/01/2017|      40|
|      o5|  Seattle|Potatoes|04/01/2017|       9|
|      o6| Bellevue|   Bread|04/01/2017|       5|
|      o7|  Redmond|   Bread|05/01/2017|       5|
|      o8| Issaquah|   Onion|05/01/2017|       4|
|      o9|  Redmond|  Cheese|05/01/2017|      15|
|     o10| Issaquah|   Onion|06/01/2017|       4|
|     o11|   Renton|   Bread|05/01/2017|       5|
|     o12| Issaquah|   Onion|07/01/2017|       4|
|     o13|Sammamish|   Bread|07/01/2017|       5|
|     o14| Issaquah|  Tomato|07/01/2017|       6|
|     o15| Issaquah|    Meat|08/01/2017|       3|
|     o16| Issaquah|    Meat|09/01/2017|       5|
|     o17| Issaquah|    Meat|10/01/2017|       6|


In [31]:
from pyspark.sql.functions import col,lead
from pyspark.sql import Window

In [32]:
mywindow = Window.partitionBy("location").orderBy("order_date")

In [46]:
groceries_df_new = groceries_df.withColumn("lead_quantity",lead("quantity").over(mywindow))

In [34]:
filter_null_df = groceries_df_new.filter(groceries_df_new["lead_quantity"].isNull())
filter_null_df.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|     o17| Issaquah|    Meat|10/01/2017|       6|         null|
|     o13|Sammamish|   Bread|07/01/2017|       5|         null|
|      o9|  Redmond|  Cheese|05/01/2017|      15|         null|
|      o5|  Seattle|Potatoes|04/01/2017|       9|         null|
|      o2|     Kent|  Apples|02/01/2017|      20|         null|
|     o21| Bellevue|   Bread|14/01/2017|      25|         null|
|     o11|   Renton|   Bread|05/01/2017|       5|         null|
+--------+---------+--------+----------+--------+-------------+



In [35]:
filter_not_null_df = groceries_df_new.filter(groceries_df_new["lead_quantity"].isNotNull())
filter_not_null_df.show()

+--------+--------+-------+----------+--------+-------------+
|order_id|location|   item|order_date|quantity|lead_quantity|
+--------+--------+-------+----------+--------+-------------+
|      o8|Issaquah|  Onion|05/01/2017|       4|            4|
|     o10|Issaquah|  Onion|06/01/2017|       4|            4|
|     o12|Issaquah|  Onion|07/01/2017|       4|            6|
|     o14|Issaquah| Tomato|07/01/2017|       6|            3|
|     o15|Issaquah|   Meat|08/01/2017|       3|            5|
|     o16|Issaquah|   Meat|09/01/2017|       5|            6|
|      o4| Redmond|   Meat|03/01/2017|      40|            5|
|      o7| Redmond|  Bread|05/01/2017|       5|           15|
|      o1| Seattle|Bananas|01/01/2017|       7|            9|
|      o3|Bellevue|Flowers|02/01/2017|      10|            5|
|      o6|Bellevue|  Bread|04/01/2017|       5|            7|
|     o18|Bellevue|  Bread|11/01/2017|       7|           54|
|     o19|Bellevue|  Bread|12/01/2017|      54|           34|
|     o2

In [36]:
#removing rows with null
filter_null_df.dropna().show()


+--------+--------+----+----------+--------+-------------+
|order_id|location|item|order_date|quantity|lead_quantity|
+--------+--------+----+----------+--------+-------------+
+--------+--------+----+----------+--------+-------------+



In [37]:
#removing rows where a quantity column is null.
result_not_null = groceries_df_new.dropna(subset = ["lead_quantity"]).show()

+--------+--------+-------+----------+--------+-------------+
|order_id|location|   item|order_date|quantity|lead_quantity|
+--------+--------+-------+----------+--------+-------------+
|      o8|Issaquah|  Onion|05/01/2017|       4|            4|
|     o10|Issaquah|  Onion|06/01/2017|       4|            4|
|     o12|Issaquah|  Onion|07/01/2017|       4|            6|
|     o14|Issaquah| Tomato|07/01/2017|       6|            3|
|     o15|Issaquah|   Meat|08/01/2017|       3|            5|
|     o16|Issaquah|   Meat|09/01/2017|       5|            6|
|      o4| Redmond|   Meat|03/01/2017|      40|            5|
|      o7| Redmond|  Bread|05/01/2017|       5|           15|
|      o1| Seattle|Bananas|01/01/2017|       7|            9|
|      o3|Bellevue|Flowers|02/01/2017|      10|            5|
|      o6|Bellevue|  Bread|04/01/2017|       5|            7|
|     o18|Bellevue|  Bread|11/01/2017|       7|           54|
|     o19|Bellevue|  Bread|12/01/2017|      54|           34|
|     o2

In [38]:
groceries_df_new.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|         null|
|     o13|Sammamish|   Bread|07/01/2017|       5|         null|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|         null|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [39]:
groceries_df_new.fillna("NA").show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|           NA|
|     o13|Sammamish|   Bread|07/01/2017|       5|           NA|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|           NA|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [40]:
groceries_df_new.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|         null|
|     o13|Sammamish|   Bread|07/01/2017|       5|         null|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|         null|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [41]:
groceries_df_new.fillna({"lead_quantity":"UNKNOWN"}).show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|      UNKNOWN|
|     o13|Sammamish|   Bread|07/01/2017|       5|      UNKNOWN|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|      UNKNOWN|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [42]:
# Use of Colease function
groceries_df_new.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|         null|
|     o13|Sammamish|   Bread|07/01/2017|       5|         null|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|         null|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [43]:
from pyspark.sql.functions import coalesce, lit

groceries_df_new = groceries_df_new.withColumn("lead_quantity", coalesce(groceries_df_new["lead_quantity"], lit(0)))
groceries_df_new.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|            0|
|     o13|Sammamish|   Bread|07/01/2017|       5|            0|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|            0|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [44]:
#Handling Nulls in Expressions
groceries_df_new.show()

+--------+---------+--------+----------+--------+-------------+
|order_id| location|    item|order_date|quantity|lead_quantity|
+--------+---------+--------+----------+--------+-------------+
|      o8| Issaquah|   Onion|05/01/2017|       4|            4|
|     o10| Issaquah|   Onion|06/01/2017|       4|            4|
|     o12| Issaquah|   Onion|07/01/2017|       4|            6|
|     o14| Issaquah|  Tomato|07/01/2017|       6|            3|
|     o15| Issaquah|    Meat|08/01/2017|       3|            5|
|     o16| Issaquah|    Meat|09/01/2017|       5|            6|
|     o17| Issaquah|    Meat|10/01/2017|       6|            0|
|     o13|Sammamish|   Bread|07/01/2017|       5|            0|
|      o4|  Redmond|    Meat|03/01/2017|      40|            5|
|      o7|  Redmond|   Bread|05/01/2017|       5|           15|
|      o9|  Redmond|  Cheese|05/01/2017|      15|            0|
|      o1|  Seattle| Bananas|01/01/2017|       7|            9|
|      o5|  Seattle|Potatoes|04/01/2017|

In [45]:
groceries_df_new.selectExpr("nvl(lead_quantity,0)as lead_quantity").show()

+-------------+
|lead_quantity|
+-------------+
|            4|
|            4|
|            6|
|            3|
|            5|
|            6|
|            0|
|            0|
|            5|
|           15|
|            0|
|            9|
|            0|
|            0|
|            5|
|            7|
|           54|
|           34|
|           25|
|            0|
+-------------+
only showing top 20 rows



In [48]:
groceries_df_new1 = groceries_df.withColumn("lead_quantity",lead("quantity").over(mywindow))

In [50]:
groceries_df_new1.selectExpr("sum(quantity)as total_quantity").show()

+--------------+
|total_quantity|
+--------------+
|         273.0|
+--------------+



In [52]:
groceries_df_new1.selectExpr("count(*)as total_quantity").show()

+--------------+
|total_quantity|
+--------------+
|            21|
+--------------+



In [53]:
groceries_df_new1.selectExpr("count(quantity)as total_quantity").show()

+--------------+
|total_quantity|
+--------------+
|            21|
+--------------+

