In [None]:
from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("ProductOrderAnalysis").getOrCreate()

# Product data
product_data = [
    (101, "Laptop", "Electronics", 55000),
    (102, "Mobile Phone", "Electronics", 25000),
    (103, "Chair", "Furniture", 5000),
    (104, "Book", "Stationery", 300),
    (105, "Headphones", "Electronics", 3000)
]

product_cols = ["product_id", "name", "category", "price"]
product_df = spark.createDataFrame(product_data, product_cols)

# Order data
order_data = [
    (201, 101, 2, "Rahul Sharma"),
    (202, 102, 1, "Priya Singh"),
    (203, 103, 4, "Aman Kumar"),
    (204, 104, 10, "Sneha Reddy"),
    (205, 101, 1, "Arjun Mehta"),
    (206, 105, 3, "Rahul Sharma"),
    (207, 106, 1, "Ghost Customer")  # Order with product not in catalog
]

order_cols = ["order_id", "product_id", "quantity", "customer"]
order_df = spark.createDataFrame(order_data, order_cols)

# Show both
product_df.show()
order_df.show()


product_df.select("name","price").show()

product_df.filter(product_df["price"]>10000).show()

product_df.orderBy(product_df["price"].desc()).show()

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
|       103|       Chair|  Furniture| 5000|
|       104|        Book| Stationery|  300|
|       105|  Headphones|Electronics| 3000|
+----------+------------+-----------+-----+

+--------+----------+--------+--------------+
|order_id|product_id|quantity|      customer|
+--------+----------+--------+--------------+
|     201|       101|       2|  Rahul Sharma|
|     202|       102|       1|   Priya Singh|
|     203|       103|       4|    Aman Kumar|
|     204|       104|      10|   Sneha Reddy|
|     205|       101|       1|   Arjun Mehta|
|     206|       105|       3|  Rahul Sharma|
|     207|       106|       1|Ghost Customer|
+--------+----------+--------+--------------+

+------------+-----+
|        name|price|
+------------+-----+
|      Laptop|55000|
|Mobile Phon

In [None]:
order_df.join(product_df,order_df.product_id==product_df.product_id,"inner").show()
order_df.join(product_df,order_df.product_id==product_df.product_id,"left").show()
order_df.join(product_df,order_df.product_id==product_df.product_id,"right").show()

+--------+----------+--------+------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|    customer|product_id|        name|   category|price|
+--------+----------+--------+------------+----------+------------+-----------+-----+
|     201|       101|       2|Rahul Sharma|       101|      Laptop|Electronics|55000|
|     205|       101|       1| Arjun Mehta|       101|      Laptop|Electronics|55000|
|     202|       102|       1| Priya Singh|       102|Mobile Phone|Electronics|25000|
|     203|       103|       4|  Aman Kumar|       103|       Chair|  Furniture| 5000|
|     204|       104|      10| Sneha Reddy|       104|        Book| Stationery|  300|
|     206|       105|       3|Rahul Sharma|       105|  Headphones|Electronics| 3000|
+--------+----------+--------+------------+----------+------------+-----------+-----+

+--------+----------+--------+--------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|      customer|produc

In [None]:
product_df.createOrReplaceTempView("products")
order_df.createOrReplaceTempView("orders")

spark.sql(""" SELECT o.product_id, p.name, SUM(o.quantity * p.price)
 AS total_revenue
  FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY o.product_id, p.name """). show()

spark.sql(""" SELECT customer, SUM(quantity) AS total_qty
  FROM orders
  GROUP BY customer
  ORDER BY total_qty DESC
  LIMIT 2 """). show()


+----------+------------+-------------+
|product_id|        name|total_revenue|
+----------+------------+-------------+
|       101|      Laptop|       165000|
|       102|Mobile Phone|        25000|
|       103|       Chair|        20000|
|       104|        Book|         3000|
|       105|  Headphones|         9000|
+----------+------------+-------------+

+------------+---------+
|    customer|total_qty|
+------------+---------+
| Sneha Reddy|       10|
|Rahul Sharma|        5|
+------------+---------+



In [None]:
!pip install pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RDD-Example").getOrCreate()

# Get SparkContext
sc = spark.sparkContext




---

# 📝 What is RDD?

**RDD** stands for **Resilient Distributed Dataset**.

It is the **core data structure of Apache Spark** — an **immutable distributed collection of objects** that can be processed in parallel across a cluster.

👉 In simple words:

* Think of RDD like a **giant list**, but instead of sitting in one machine, it’s **spread across multiple machines**.
* Spark can apply functions to this distributed data in parallel, making it fast and fault-tolerant.

---

# 📝 Key Properties of RDD

* **Resilient** → Fault-tolerant (can recover lost data automatically using lineage).
* **Distributed** → Data is split into partitions across cluster nodes.
* **Dataset** → Collection of elements (numbers, strings, objects, rows, etc.).
* **Immutable** → Once created, cannot be changed — only transformed into new RDDs.

---

In [7]:
data=[1,2,3,4,5,6,7,8,9]
rdd=sc.parallelize(data)
print("RDD elements:",rdd.collect())


squared_rdd=rdd.map(lambda x: x* x)

even_rdd= rdd.filter(lambda x: x % 2 == 0)
print("squared:", squared_rdd.collect())
print("Event", even_rdd.collect())
print("Count:", rdd.count())
print("Sum:", rdd.sum())
print("rax:", rdd.max())

RDD elements: [1, 2, 3, 4, 5, 6, 7, 8, 9]
squared: [1, 4, 9, 16, 25, 36, 49, 64, 81]
Event [2, 4, 6, 8]
Count: 9
Sum: 45
rax: 9
