In [14]:
!pip install pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Employee-Analysis").getOrCreate()



In [9]:
import io

# Small in-notebook fixture: one header row followed by six employee records.
csv_data = """id,name,department,salary
1,Rahul Sharma,IT,55000
2,Priya Singh,HR,60000
3,Aman Kumar,Finance,48000
4,Sneha Reddy,Marketing,52000
5,Arjun Mehta,IT,75000
6,Divya Nair,Finance,67000
"""

# Write the fixture to disk so it can be loaded from a file path later.
# The encoding is pinned to UTF-8 so the bytes on disk do not depend on the
# platform's default locale encoding (matters as soon as a name contains a
# non-ASCII character).
with open("employees.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

In [17]:
# Load the employee CSV: the first line supplies the column names and the
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("employees.csv")
df.show()

+---+------------+----------+------+
| id|        name|department|salary|
+---+------------+----------+------+
|  1|Rahul Sharma|        IT| 55000|
|  2| Priya Singh|        HR| 60000|
|  3|  Aman Kumar|   Finance| 48000|
|  4| Sneha Reddy| Marketing| 52000|
|  5| Arjun Mehta|        IT| 75000|
|  6|  Divya Nair|   Finance| 67000|
+---+------------+----------+------+



**Transformations**
---

# 📝 Key Points about Transformations

* **Lazy Execution**:
  Spark doesn't run transformations right away. Instead, it builds a **logical plan** (a DAG – Directed Acyclic Graph).
  The computation only runs when an **action** (like `.show()` or `.count()`) is called.

* **Return Type**:
  A transformation always returns a **new DataFrame or RDD**. It does **not modify the existing one**.

* **Two Types of Transformations**:

  1. **Narrow Transformations** → Each input partition contributes to only one output partition.
     (e.g., `map()`, `filter()`, `select()`)
  2. **Wide Transformations** → Data is shuffled across partitions.
     (e.g., `groupBy()`, `join()`)

---



In [19]:
# Projection: keep only the name and salary columns.
name_and_salary = df.select("name", "salary")
name_and_salary.show()

# Row filter: employees whose salary is strictly above 60000.
high_earners = df.filter(df.salary > 60000)
high_earners.show()

# Sort: highest salary first.
by_salary_desc = df.orderBy("salary", ascending=False)
by_salary_desc.show()

+------------+------+
|        name|salary|
+------------+------+
|Rahul Sharma| 55000|
| Priya Singh| 60000|
|  Aman Kumar| 48000|
| Sneha Reddy| 52000|
| Arjun Mehta| 75000|
|  Divya Nair| 67000|
+------------+------+

+---+-----------+----------+------+
| id|       name|department|salary|
+---+-----------+----------+------+
|  5|Arjun Mehta|        IT| 75000|
|  6| Divya Nair|   Finance| 67000|
+---+-----------+----------+------+

+---+------------+----------+------+
| id|        name|department|salary|
+---+------------+----------+------+
|  5| Arjun Mehta|        IT| 75000|
|  6|  Divya Nair|   Finance| 67000|
|  2| Priya Singh|        HR| 60000|
|  1|Rahul Sharma|        IT| 55000|
|  4| Sneha Reddy| Marketing| 52000|
|  3|  Aman Kumar|   Finance| 48000|
+---+------------+----------+------+



---

# 📝 What is Aggregation?

* An operation that **groups data** and applies a **summary function** (like sum, avg, count, min, max).

* Used to answer questions like:

  * *“What is the average salary per department?”*

  * *“How many employees are in each department?”*

  * *“What is the highest salary in Finance?”*

---


In [22]:
# Group once by department and reuse the grouped handle for each summary.
by_department = df.groupBy("department")

by_department.avg("salary").show()   # average salary per department
by_department.max("salary").show()   # highest salary per department
by_department.count().show()         # number of employees per department

+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|    60000.0|
|   Finance|    57500.0|
| Marketing|    52000.0|
|        IT|    65000.0|
+----------+-----------+

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|        HR|      60000|
|   Finance|      67000|
| Marketing|      52000|
|        IT|      75000|
+----------+-----------+

+----------+-----+
|department|count|
+----------+-----+
|        HR|    1|
|   Finance|    2|
| Marketing|    1|
|        IT|    2|
+----------+-----+



In [24]:
# Register the DataFrame under a table name so it can be queried with SQL.
df.createOrReplaceTempView("employees")

# Average salary per department, expressed as a SQL query.
avg_salary_query = "SELECT department, AVG(salary) as avg_salary FROM employees GROUP BY department"
avg_salary_by_department = spark.sql(avg_salary_query)
avg_salary_by_department.show()

+----------+----------+
|department|avg_salary|
+----------+----------+
|        HR|   60000.0|
|   Finance|   57500.0|
| Marketing|   52000.0|
|        IT|   65000.0|
+----------+----------+

