In [0]:
from pyspark.sql import SparkSession

# Build the session through the fluent builder; getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Azure Exercise")
    .getOrCreate()
)

# Bare trailing expression so the notebook renders the session summary.
spark

In [0]:
# Column names, in the same order as the fields of each tuple in `data`.
columns = ["Name", "Department", "Salary"]

# One (name, department, salary) record per employee.
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Column types are inferred from the Python values; only names are supplied.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [0]:
#Exercise Set 1: Basics
# 1. Display all records in the DataFrame.
df.show()

# 2. Print the schema (column names, types, nullability) of the DataFrame.
df.printSchema()

# 3. Count the total number of employees (one row per employee).
total_employees = df.count()
print(f"Total number of employees: {total_employees}")

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

Total number of employees: 7


In [0]:
#Exercise Set 2: Column Operations
# Fraction of Salary paid out as a bonus; named so the rate is visible and
# tunable in one place instead of being a bare literal inside the expression.
BONUS_RATE = 0.15

# 4. Add a new column Bonus which is 15% of Salary.
# withColumn replaces a same-named column, so re-running this cell is safe.
# `df` is rebound on purpose: later cells expect it to carry Bonus and NetPay.
df = df.withColumn("Bonus", df["Salary"] * BONUS_RATE)
df.show()

# 5. Add a new column NetPay = Salary + Bonus.
df = df.withColumn("NetPay", df["Salary"] + df["Bonus"])
df.show()

+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
#Exercise Set 3: Filtering and Conditions
# 6. Keep only the rows whose Department is "Engineering".
engineering_employees = df.where(df.Department == "Engineering")
engineering_employees.show()

# 7. Keep only the rows with a Salary strictly above 60000.
emp_by_salary = df.where(df.Salary > 60000)
emp_by_salary.show()

# 8. Keep every row whose Department is anything other than "Marketing".
non_marketing_employees = df.where(df.Department != "Marketing")
non_marketing_employees.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
#Exercise Set 4: Sorting and Limiting
from pyspark.sql.functions import col
# 9. Top 3 earners: order by Salary, highest first, then keep three rows.
top_3_employees = df.sort(col("Salary").desc()).limit(3)
top_3_employees.show()

# 10. Order by Department (A-Z), and within each department by Salary
#     (highest first). The ascending flags pair up with the column list.
sorted_df = df.sort(["Department", "Salary"], ascending=[True, False])
sorted_df.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
#Exercise Set 5: String and Case Logic
from pyspark.sql.functions import when, upper, col

#11: Derive a Level label from Salary.
#    above 60000 -> "Senior"; 50000..60000 inclusive -> "Mid"; else "Junior".
df_with_level = df.withColumn(
    "Level",
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")  # bounds are inclusive
    .otherwise("Junior"),
)
df_with_level.show()

#12: Replace Name with its upper-cased form (built on df_with_level, so the
#    Level column is carried along).
df_final = df_with_level.withColumn("Name", upper(col("Name")))

df_final.show()


+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|Ananya|         HR| 52000| 7800.0|59800.0|   Mid|
| Rahul|Engineering| 65000| 9750.0|74750.0|Senior|
| Priya|Engineering| 60000| 9000.0|69000.0|   Mid|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|Junior|
| Karan|         HR| 53000| 7950.0|60950.0|   Mid|
|Naveen|Engineering| 70000|10500.0|80500.0|Senior|
|Fatima|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|ANANYA|         HR| 52000| 7800.0|59800.0|   Mid|
| RAHUL|Engineering| 65000| 9750.0|74750.0|Senior|
| PRIYA|Engineering| 60000| 9000.0|69000.0|   Mid|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|Junior|
| KARAN|         HR| 53000| 7950.0|60950.0|   Mid|
|NAVEEN|Engineering| 70000|105