# Read Data into a Spark DataFrame

In [0]:
# "workspace.default.ecommerce_data" is a three-part catalog.schema.table identifier,
# not a file path, so it is loaded as a table rather than handed to the CSV reader
# (which would look for a file with that name). The CSV-only "header" and
# "inferSchema" options are dropped: they configure CSV parsing and have no role
# when reading a catalog table.
df = spark.table("workspace.default.ecommerce_data")


# Basic Spark Operations

In [0]:
# Bind the catalog table to `df`; the cells further down reuse this name.
table_name = "workspace.default.ecommerce_data"
df = spark.table(table_name)

# Print the leading rows as a text grid. The arguments spell out show()'s
# defaults (20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|order_id|order_date|customer_id|customer_name|product_category|        product_name|price|quantity|payment_method|     city|
+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|    1001|2024-01-05|       C001|  Amit Sharma|     Electronics|      Wireless Mouse|  799|       1|           UPI|Bangalore|
|    1002|2024-01-06|       C002|  Priya Verma|         Fashion|        Cotton Kurti| 1299|       2|   Credit Card|    Delhi|
|    1003|2024-01-06|       C003|  Rahul Mehta|     Electronics|Bluetooth Headphones| 2499|       1|    Debit Card|   Mumbai|
|    1004|2024-01-07|       C004| Ananya Singh|  Home & Kitchen|     Electric Kettle| 1999|       1|           UPI|Hyderabad|
|    1005|2024-01-08|       C001|  Amit Sharma|     Electronics|       USB-C Charger|  999|       2|           UPI|Ban

In [0]:
%sql
-- Project four columns from the orders table.
-- The table is referenced by its full catalog.schema.table name (the same
-- identifier the Python cells use) so the query does not depend on whichever
-- catalog/schema happens to be current in the session.
SELECT
  order_id,
  customer_name,
  product_name,
  price
FROM workspace.default.ecommerce_data

order_id,customer_name,product_name,price
1001,Amit Sharma,Wireless Mouse,799
1002,Priya Verma,Cotton Kurti,1299
1003,Rahul Mehta,Bluetooth Headphones,2499
1004,Ananya Singh,Electric Kettle,1999
1005,Amit Sharma,USB-C Charger,999
1006,Sneha Iyer,Face Serum,1599
1007,Vikas Gupta,Running Shoes,3499
1008,Priya Verma,Dinner Set,2999
1009,Neha Kapoor,Smart Watch,4999
1010,Rahul Mehta,Hair Dryer,1899


**Display DataFrame Rows and Schema**

In [0]:
# First print the leading rows (arguments are show()'s defaults, written out).
df.show(n=20, truncate=True)

# Then print the column names and types as a tree.
df.printSchema()

+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|order_id|order_date|customer_id|customer_name|product_category|        product_name|price|quantity|payment_method|     city|
+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|    1001|2024-01-05|       C001|  Amit Sharma|     Electronics|      Wireless Mouse|  799|       1|           UPI|Bangalore|
|    1002|2024-01-06|       C002|  Priya Verma|         Fashion|        Cotton Kurti| 1299|       2|   Credit Card|    Delhi|
|    1003|2024-01-06|       C003|  Rahul Mehta|     Electronics|Bluetooth Headphones| 2499|       1|    Debit Card|   Mumbai|
|    1004|2024-01-07|       C004| Ananya Singh|  Home & Kitchen|     Electric Kettle| 1999|       1|           UPI|Hyderabad|
|    1005|2024-01-08|       C001|  Amit Sharma|     Electronics|       USB-C Charger|  999|       2|           UPI|Ban

**SELECT**


In [0]:
%python
df.select("order_id", "customer_name", "product_name", "price").show()

+--------+-------------+--------------------+-----+
|order_id|customer_name|        product_name|price|
+--------+-------------+--------------------+-----+
|    1001|  Amit Sharma|      Wireless Mouse|  799|
|    1002|  Priya Verma|        Cotton Kurti| 1299|
|    1003|  Rahul Mehta|Bluetooth Headphones| 2499|
|    1004| Ananya Singh|     Electric Kettle| 1999|
|    1005|  Amit Sharma|       USB-C Charger|  999|
|    1006|   Sneha Iyer|          Face Serum| 1599|
|    1007|  Vikas Gupta|       Running Shoes| 3499|
|    1008|  Priya Verma|          Dinner Set| 2999|
|    1009|  Neha Kapoor|         Smart Watch| 4999|
|    1010|  Rahul Mehta|          Hair Dryer| 1899|
+--------+-------------+--------------------+-----+



**FILTER**

In [0]:
# Keep only the rows whose price is strictly greater than 2000, then print them.
is_above_threshold = df["price"] > 2000
df.where(is_above_threshold).show()

+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+------+
|order_id|order_date|customer_id|customer_name|product_category|        product_name|price|quantity|payment_method|  city|
+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+------+
|    1003|2024-01-06|       C003|  Rahul Mehta|     Electronics|Bluetooth Headphones| 2499|       1|    Debit Card|Mumbai|
|    1007|2024-01-10|       C006|  Vikas Gupta|         Fashion|       Running Shoes| 3499|       1|   Credit Card|  Pune|
|    1008|2024-01-10|       C002|  Priya Verma|  Home & Kitchen|          Dinner Set| 2999|       1|    Debit Card| Delhi|
|    1009|2024-01-11|       C007|  Neha Kapoor|     Electronics|         Smart Watch| 4999|       1|           UPI| Noida|
+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+------+



**GROUP BY**

In [0]:
# NOTE(review): this import rebinds the built-in `sum` to Spark's aggregate for
# the rest of the notebook session; plain-Python `sum(...)` calls in later cells
# would hit the Spark function instead.
from pyspark.sql.functions import sum

# One output row per product_category, holding the sum of `price` in that category.
# NOTE(review): `quantity` is not part of the aggregate — confirm that a plain
# price sum is what "total_sales" is meant to represent.
sales_by_category = (
    df.groupBy("product_category")
      .agg(sum("price").alias("total_sales"))
)
sales_by_category.show()

+----------------+-----------+
|product_category|total_sales|
+----------------+-----------+
|     Electronics|       9296|
|         Fashion|       4798|
|  Home & Kitchen|       4998|
|          Beauty|       3498|
+----------------+-----------+



**ORDER BY**

In [0]:
# Sort so the highest price comes first, then print the leading rows.
price_high_to_low = df["price"].desc()
df.orderBy(price_high_to_low).show()

+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|order_id|order_date|customer_id|customer_name|product_category|        product_name|price|quantity|payment_method|     city|
+--------+----------+-----------+-------------+----------------+--------------------+-----+--------+--------------+---------+
|    1009|2024-01-11|       C007|  Neha Kapoor|     Electronics|         Smart Watch| 4999|       1|           UPI|    Noida|
|    1007|2024-01-10|       C006|  Vikas Gupta|         Fashion|       Running Shoes| 3499|       1|   Credit Card|     Pune|
|    1008|2024-01-10|       C002|  Priya Verma|  Home & Kitchen|          Dinner Set| 2999|       1|    Debit Card|    Delhi|
|    1003|2024-01-06|       C003|  Rahul Mehta|     Electronics|Bluetooth Headphones| 2499|       1|    Debit Card|   Mumbai|
|    1004|2024-01-07|       C004| Ananya Singh|  Home & Kitchen|     Electric Kettle| 1999|       1|           UPI|Hyd