In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
import getpass
# The current OS user name is used to build the per-user warehouse path below.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled SparkSession that runs on YARN.
# spark.ui.port is set to '0' -- presumably so the UI binds to any free port
# on a shared gateway; confirm against the cluster setup.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Echo the session object as the cell result to confirm it was created.
spark

In [4]:
# Load the products CSV into a DataFrame. The first line is taken as the
# header and column types are inferred from the data rather than declared.
products_df = spark.read.csv(
    "/user/itv017244/products_folder/products_wh_data.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Peek at the first five rows to check the parsed columns.
# NOTE(review): in the recorded output the first row repeats the column names
# (with nulls in product_id, category and price) -- the CSV appears to contain
# a second header line; confirm against the source file.
products_df.show(5)

+----------+--------+--------------------+-----------+------+--------------------+
|product_id|category|        product_name|description| price|           image_url|
+----------+--------+--------------------+-----------+------+--------------------+
|      null|    null|        product_name|description|  null|           image_url|
|         1|       2|Quest Q64 10 FT. ...|       null| 59.98|http://images.acm...|
|         2|       2|Under Armour Men'...|       null|129.99|http://images.acm...|
|         3|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
|         4|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
+----------+--------+--------------------+-----------+------+--------------------+
only showing top 5 rows



In [7]:
total_products = products_df.count()

In [8]:
print(total_products)

1346


In [9]:
unique_categories = products_df.select("Category").distinct().count()

In [10]:
print(unique_categories)

56


In [12]:
# Five highest-priced products: keep the display columns, sort by price from
# highest to lowest, then take the first five rows.
expensive_products = (
    products_df
    .select(["product_name", "category", "image_url", "price"])
    .orderBy("price", ascending=False)
    .limit(5)
)

In [13]:
# Print the five most expensive products.
expensive_products.show()

+--------------------+--------+--------------------+-------+
|        product_name|category|           image_url|  price|
+--------------------+--------+--------------------+-------+
| SOLE E35 Elliptical|      10|http://images.acm...|1999.99|
|  SOLE F85 Treadmill|       4|http://images.acm...|1799.99|
|  SOLE F85 Treadmill|      10|http://images.acm...|1799.99|
|  SOLE F85 Treadmill|      22|http://images.acm...|1799.99|
|"Spalding Beast 6...|      47|http://images.acm...|1099.99|
+--------------------+--------+--------------------+-------+



In [16]:
# Per-category count of products priced above 100; the generated "count"
# column is renamed to "NumberOfProducts".
category_df = (
    products_df
    .where("price > 100")
    .groupBy("category")
    .count()
    .withColumnRenamed("count", "NumberOfProducts")
)

In [17]:
# Print the first ten category counts (no ordering is applied to the groups).
category_df.show(10)

+--------+----------------+
|category|NumberOfProducts|
+--------+----------------+
|      31|              17|
|      53|              16|
|      34|              15|
|      44|               9|
|      12|               3|
|      22|               4|
|      47|              10|
|      52|               5|
|      13|               1|
|       6|               5|
+--------+----------------+
only showing top 10 rows



In [20]:
# Products in category 5 that cost more than 200.
category_product = products_df.where(
    (products_df["price"] > 200) & (products_df["category"] == 5)
)

In [21]:
# Keep only the name and price of the matching products.
result = category_product.select(["product_name", "price"])

In [22]:
# Print the name and price of the selected products.
result.show()

+--------------------+------+
|        product_name| price|
+--------------------+------+
|"Goaliath 54"" In...|499.99|
|Fitness Gear 300 ...|209.99|
|Teeter Hang Ups N...|299.99|
+--------------------+------+



In [23]:
# Register the DataFrame as the temporary view "products" so the cells below
# can query it with spark.sql.
products_df.createOrReplaceTempView("products")

In [24]:
# SQL: total number of products, counting non-null product_id values.
result1 = spark.sql(
    "select count(product_id) as total_products from products"
)

In [25]:
# Print the SQL product count.
result1.show()

+--------------+
|total_products|
+--------------+
|          1345|
+--------------+



In [27]:
# SQL: number of distinct categories in the products view.
result2 = spark.sql(
    "select count(distinct(category)) as unique_category from products"
)
# Leave the DataFrame as the cell's last expression so the notebook renders it.
result2

unique_category
55


In [28]:
# SQL: the five highest-priced products (name, category and image URL only).
result3 = spark.sql(
    "select product_name, category,image_url from products order by price desc limit 5"
)
result3.show()

+--------------------+--------+--------------------+
|        product_name|category|           image_url|
+--------------------+--------+--------------------+
| SOLE E35 Elliptical|      10|http://images.acm...|
|  SOLE F85 Treadmill|       4|http://images.acm...|
|  SOLE F85 Treadmill|      10|http://images.acm...|
|  SOLE F85 Treadmill|      22|http://images.acm...|
|"Spalding Beast 6...|      47|http://images.acm...|
+--------------------+--------+--------------------+



In [29]:
# SQL: per-category count of products priced above 100.
result4 = spark.sql(
    "select category,count(product_id) as numberOfProduct from products where price >100 group by category "
)
result4.show()

+--------+---------------+
|category|numberOfProduct|
+--------+---------------+
|      31|             17|
|      53|             16|
|      34|             15|
|      44|              9|
|      12|              3|
|      22|              4|
|      47|             10|
|      52|              5|
|      13|              1|
|       6|              5|
|      16|             11|
|       3|              5|
|      20|              7|
|      57|              6|
|      54|              6|
|      48|             17|
|       5|             11|
|      19|             13|
|      41|             11|
|      43|             23|
+--------+---------------+
only showing top 20 rows



In [31]:
# SQL: name and price of category-5 products that cost more than 200.
result5 = spark.sql(
    "select product_name,price from products where price >200 and category = 5"
)
result5.show()

+--------------------+------+
|        product_name| price|
+--------------------+------+
|"Goaliath 54"" In...|499.99|
|Fitness Gear 300 ...|209.99|
|Teeter Hang Ups N...|299.99|
+--------------------+------+

