In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session (created on first use) that every query below runs against
spark = (
    SparkSession.builder
    .appName("FiveQueries")
    .getOrCreate()
)

# In-memory employee records, one (name, department, salary) tuple per row
data = [
    ("John", "Sales", 3000),
    ("Jane", "Marketing", 4000),
    ("Doe", "Sales", 3500),
    ("Smith", "HR", 3200),
    ("Brown", "Marketing", 3800),
]

# Turn the records into a DataFrame; only the column names are supplied here
df = spark.createDataFrame(data, schema=["Name", "Department", "Salary"])



In [2]:
# 1. Print the whole employee table
print("Query 1: Show all data")
# n and truncate are written out at their default values (20 rows, truncated cells)
df.show(n=20, truncate=True)



Query 1: Show all data
+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane| Marketing|  4000|
|  Doe|     Sales|  3500|
|Smith|        HR|  3200|
|Brown| Marketing|  3800|
+-----+----------+------+



In [3]:
# 2. Keep only the rows whose Department equals "Sales"
print("Query 2: Filter employees in Sales department")
is_sales = df["Department"] == "Sales"
df_sales = df.filter(is_sales)
df_sales.show()



Query 2: Filter employees in Sales department
+----+----------+------+
|Name|Department|Salary|
+----+----------+------+
|John|     Sales|  3000|
| Doe|     Sales|  3500|
+----+----------+------+



In [4]:
# 3. Mean salary per department (result column is named avg(Salary))
print("Query 3: Calculate average salary by department")
salary_by_department = df.groupBy("Department")
salary_by_department.avg("Salary").show()



Query 3: Calculate average salary by department
+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
| Marketing|     3900.0|
|        HR|     3200.0|
+----------+-----------+



In [5]:
# 4. Keep employees earning strictly more than 3500 (a salary of exactly 3500 is excluded)
print("Query 4: Find employees with salary above 3500")
earns_above_3500 = df["Salary"] > 3500
df_high_salary = df.filter(earns_above_3500)
df_high_salary.show()



Query 4: Find employees with salary above 3500
+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| Jane| Marketing|  4000|
|Brown| Marketing|  3800|
+-----+----------+------+



In [6]:
# 5. Order the employees from highest to lowest salary
print("Query 5: Sort employees by salary in descending order")
df_sorted = df.orderBy("Salary", ascending=False)
df_sorted.show()

Query 5: Sort employees by salary in descending order
+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| Jane| Marketing|  4000|
|Brown| Marketing|  3800|
|  Doe|     Sales|  3500|
|Smith|        HR|  3200|
| John|     Sales|  3000|
+-----+----------+------+



In [7]:
# Stop the Spark session held in `spark` (obtained via getOrCreate in the first cell).
# NOTE(review): `df`, `df_sales`, etc. are presumably unusable after this point — re-run the first cell before re-running any query.
spark.stop()