## Create Spark session and import requrired libraries


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("PySparkPractice").getOrCreate()


## Create Schema and data

In [2]:
# Employee Schema
emp_schema = StructType([
    StructField("EmpID", IntegerType(), True),
    StructField("EmpName", StringType(), True),
    StructField("DeptID", IntegerType(), True),
    StructField("Salary", IntegerType(), True),
    StructField("JoiningDate", StringType(), True),
    StructField("Gender", StringType(), True)
])

emp_data = [
    (1, "Amit", 101, 70000, "2020-01-10", "M"),
    (2, "Priya", 102, 80000, "2019-03-15", "F"),
    (3, "Rohit", 101, 90000, "2021-06-01", "M"),
    (4, "Sneha", 103, 60000, "2020-11-20", "F"),
    (5, "Ankit", 102, 75000, "2022-02-10", "M"),
    (6, "Riya", 101, 72000, "2023-01-05", "F"),
    (7, "Dev", 103, 67000, "2020-09-23", "M"),
    (8, "Simran", 102, 81000, "2021-05-25", "F"),
    (9, "Karan", 101, 95000, "2018-12-11", "M"),
    (10, "Neha", 103, 58000, "2023-04-30", "F")
]

df_emp = spark.createDataFrame(emp_data, schema=emp_schema)

# Department Schema
dept_schema = StructType([
    StructField("DeptID", IntegerType(), True),
    StructField("DeptName", StringType(), True),
    StructField("Location", StringType(), True)
])

dept_data = [
    (101, "IT", "Bangalore"),
    (102, "HR", "Hyderabad"),
    (103, "Finance", "Pune")
]

df_dept = spark.createDataFrame(dept_data, schema=dept_schema)

# Sales Data
sales_schema = StructType([
    StructField("SaleID", IntegerType(), True),
    StructField("EmpID", IntegerType(), True),
    StructField("SaleAmount", IntegerType(), True),
    StructField("SaleDate", StringType(), True)
])

sales_data = [
    (1, 1, 1000, "2024-01-10"),
    (2, 2, 2000, "2024-01-12"),
    (3, 3, 1500, "2024-02-01"),
    (4, 1, 2500, "2024-02-15"),
    (5, 4, 3000, "2024-03-20"),
    (6, 2, 1800, "2024-03-22"),
    (7, 6, 2200, "2024-04-10"),
    (8, 9, 4000, "2024-04-15")
]

df_sales = spark.createDataFrame(sales_data, schema=sales_schema)

## Beginner Level (1–15)

### 1️ Show first 5 records of employee dataset

In [3]:
df_emp.show(5)

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
+-----+-------+------+------+-----------+------+
only showing top 5 rows



### 2️ Display schema of employee DataFrame

In [4]:
df_emp.printSchema()

root
 |-- EmpID: integer (nullable = true)
 |-- EmpName: string (nullable = true)
 |-- DeptID: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- JoiningDate: string (nullable = true)
 |-- Gender: string (nullable = true)



### 3️ Select only EmpName and Salary columns

In [5]:
df_emp.select('EmpName','Salary').show()

+-------+------+
|EmpName|Salary|
+-------+------+
|   Amit| 70000|
|  Priya| 80000|
|  Rohit| 90000|
|  Sneha| 60000|
|  Ankit| 75000|
|   Riya| 72000|
|    Dev| 67000|
| Simran| 81000|
|  Karan| 95000|
|   Neha| 58000|
+-------+------+



### 4️ Filter employees with salary > 75000

In [6]:
df_emp.filter(df_emp.Salary >75000).show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
+-----+-------+------+------+-----------+------+



### 5️ Count total number of employees

In [7]:
df_emp.count()

10

### 6️ Find distinct department IDs

In [8]:
df_emp.select("DeptID").distinct().show()

+------+
|DeptID|
+------+
|   101|
|   103|
|   102|
+------+



### 7️ Sort employees by salary descending

In [9]:
df_emp.orderBy(col("Salary").desc()).show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 8️ Add a new column Bonus = 10% of Salary

In [10]:
df_emp.withColumn("Bonus", col("Salary")*0.1).show()

+-----+-------+------+------+-----------+------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender| Bonus|
+-----+-------+------+------+-----------+------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|7000.0|
|    2|  Priya|   102| 80000| 2019-03-15|     F|8000.0|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|9000.0|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|6000.0|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|7500.0|
|    6|   Riya|   101| 72000| 2023-01-05|     F|7200.0|
|    7|    Dev|   103| 67000| 2020-09-23|     M|6700.0|
|    8| Simran|   102| 81000| 2021-05-25|     F|8100.0|
|    9|  Karan|   101| 95000| 2018-12-11|     M|9500.0|
|   10|   Neha|   103| 58000| 2023-04-30|     F|5800.0|
+-----+-------+------+------+-----------+------+------+



### 9️ Rename column “EmpName” to “EmployeeName”

In [11]:
df_emp.withColumnRenamed("EmpName", "EmployeeName").show()

+-----+------------+------+------+-----------+------+
|EmpID|EmployeeName|DeptID|Salary|JoiningDate|Gender|
+-----+------------+------+------+-----------+------+
|    1|        Amit|   101| 70000| 2020-01-10|     M|
|    2|       Priya|   102| 80000| 2019-03-15|     F|
|    3|       Rohit|   101| 90000| 2021-06-01|     M|
|    4|       Sneha|   103| 60000| 2020-11-20|     F|
|    5|       Ankit|   102| 75000| 2022-02-10|     M|
|    6|        Riya|   101| 72000| 2023-01-05|     F|
|    7|         Dev|   103| 67000| 2020-09-23|     M|
|    8|      Simran|   102| 81000| 2021-05-25|     F|
|    9|       Karan|   101| 95000| 2018-12-11|     M|
|   10|        Neha|   103| 58000| 2023-04-30|     F|
+-----+------------+------+------+-----------+------+



### Find maximum & minimum salary

In [12]:
df_emp.agg(max("Salary").alias("Max_Salary")).show()

df_emp.agg(min("Salary").alias("Min_Salary")).show()

+----------+
|Max_Salary|
+----------+
|     95000|
+----------+

+----------+
|Min_Salary|
+----------+
|     58000|
+----------+



### 11️ Group by DeptID and find average salary

In [13]:
df_emp.groupBy('DeptID').agg(avg('Salary').alias('Avg_Salary')).show()

+------+------------------+
|DeptID|        Avg_Salary|
+------+------------------+
|   101|           81750.0|
|   103|61666.666666666664|
|   102| 78666.66666666667|
+------+------------------+



### 12️ Filter female employees

In [14]:
df_emp.filter(col('Gender') == 'F').show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 13️ Show number of employees by gender

In [15]:
df_emp.groupBy(col('Gender')).agg(count(col("EmpID")).alias("Num_Employees")).show()

+------+-------------+
|Gender|Num_Employees|
+------+-------------+
|     F|            5|
|     M|            5|
+------+-------------+



### 14️ Add column “YearsOfExperience” based on JoiningDate

In [16]:
df_emp.withColumn("YearsOfExperience", round(months_between(current_date(), to_date(col("JoiningDate")))/12, 2)).show()

+-----+-------+------+------+-----------+------+-----------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|YearsOfExperience|
+-----+-------+------+------+-----------+------+-----------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|             5.86|
|    2|  Priya|   102| 80000| 2019-03-15|     F|             6.68|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|             4.47|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|              5.0|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|             3.77|
|    6|   Riya|   101| 72000| 2023-01-05|     F|             2.87|
|    7|    Dev|   103| 67000| 2020-09-23|     M|             5.16|
|    8| Simran|   102| 81000| 2021-05-25|     F|             4.48|
|    9|  Karan|   101| 95000| 2018-12-11|     M|             6.94|
|   10|   Neha|   103| 58000| 2023-04-30|     F|             2.55|
+-----+-------+------+------+-----------+------+-----------------+



### 15️ Combine department name with employee data

In [17]:
df_emp.join(df_dept, on='DeptID', how='inner').show()

+------+-----+-------+------+-----------+------+--------+---------+
|DeptID|EmpID|EmpName|Salary|JoiningDate|Gender|DeptName| Location|
+------+-----+-------+------+-----------+------+--------+---------+
|   101|    1|   Amit| 70000| 2020-01-10|     M|      IT|Bangalore|
|   101|    3|  Rohit| 90000| 2021-06-01|     M|      IT|Bangalore|
|   101|    6|   Riya| 72000| 2023-01-05|     F|      IT|Bangalore|
|   101|    9|  Karan| 95000| 2018-12-11|     M|      IT|Bangalore|
|   102|    2|  Priya| 80000| 2019-03-15|     F|      HR|Hyderabad|
|   102|    5|  Ankit| 75000| 2022-02-10|     M|      HR|Hyderabad|
|   102|    8| Simran| 81000| 2021-05-25|     F|      HR|Hyderabad|
|   103|    4|  Sneha| 60000| 2020-11-20|     F| Finance|     Pune|
|   103|    7|    Dev| 67000| 2020-09-23|     M| Finance|     Pune|
|   103|   10|   Neha| 58000| 2023-04-30|     F| Finance|     Pune|
+------+-----+-------+------+-----------+------+--------+---------+



In [18]:
df_emp.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[EmpID#0,EmpName#1,DeptID#2,Salary#3,JoiningDate#4,Gender#5]




## Intermediate Level (16–35)

### 16️ Get highest-paid employee in each departmen

In [20]:
df_dept.printSchema()

root
 |-- DeptID: integer (nullable = true)
 |-- DeptName: string (nullable = true)
 |-- Location: string (nullable = true)



In [28]:
df_Higest_Pay_Emp = df_emp.groupBy(F.col('DeptID').alias('Dept_Id')).agg(F.max(F.col('Salary')).alias('Max_salary'))
df_Higest_Pay_Emp.show()

#### Join with department table
df_Higest_Pay_Emp_dept = df_Higest_Pay_Emp.join(df_dept, df_Higest_Pay_Emp['Dept_Id'] == df_dept['DeptID'], 'inner')
result = df_Higest_Pay_Emp_dept.select(F.col('DeptID'), F.col('Max_salary'), F.col('DeptName'),F.col('Location'))
result.show()

+-------+----------+
|Dept_Id|Max_salary|
+-------+----------+
|    101|     95000|
|    103|     67000|
|    102|     81000|
+-------+----------+

+------+----------+--------+---------+
|DeptID|Max_salary|DeptName| Location|
+------+----------+--------+---------+
|   101|     95000|      IT|Bangalore|
|   103|     67000| Finance|     Pune|
|   102|     81000|      HR|Hyderabad|
+------+----------+--------+---------+



#### 17️ Find employees with name starting with “S”

In [30]:
df_S_emp = df_emp.filter(F.col('EmpName').startswith("S"))

df_S_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    8| Simran|   102| 81000| 2021-05-25|     F|
+-----+-------+------+------+-----------+------+



In [31]:
df_sales.printSchema()

root
 |-- SaleID: integer (nullable = true)
 |-- EmpID: integer (nullable = true)
 |-- SaleAmount: integer (nullable = true)
 |-- SaleDate: string (nullable = true)



#### 18️ Find total sales amount per employee

In [32]:
df_total_sales_per_emp = df_sales.groupBy(F.col('EmpID')).agg(F.sum(F.col('SaleAmount')).alias('Total_Sales_Amount'))
df_total_sales_per_emp.show()

+-----+------------------+
|EmpID|Total_Sales_Amount|
+-----+------------------+
|    1|              3500|
|    3|              1500|
|    2|              3800|
|    6|              2200|
|    9|              4000|
|    4|              3000|
+-----+------------------+



#### 19 Join employee and sales data

In [35]:
df_combine_emp_sales = df_emp.join(df_sales, df_emp['EmpID'] == df_sales['EmpID'], 'inner').drop(df_sales['EmpID'])
df_combine_emp_sales.show()

+-----+-------+------+------+-----------+------+------+----------+----------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|SaleID|SaleAmount|  SaleDate|
+-----+-------+------+------+-----------+------+------+----------+----------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|     1|      1000|2024-01-10|
|    1|   Amit|   101| 70000| 2020-01-10|     M|     4|      2500|2024-02-15|
|    2|  Priya|   102| 80000| 2019-03-15|     F|     2|      2000|2024-01-12|
|    2|  Priya|   102| 80000| 2019-03-15|     F|     6|      1800|2024-03-22|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|     3|      1500|2024-02-01|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|     5|      3000|2024-03-20|
|    6|   Riya|   101| 72000| 2023-01-05|     F|     7|      2200|2024-04-10|
|    9|  Karan|   101| 95000| 2018-12-11|     M|     8|      4000|2024-04-15|
+-----+-------+------+------+-----------+------+------+----------+----------+



#### 20️ Find employees who have not made any sales

In [36]:
df_not_sale_emp = df_emp.join(df_sales, df_emp['EmpID'] == df_sales['EmpID'], 'left_anti')
df_not_sale_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



#### 21️ Add “PerformanceCategory” column

In [37]:
new_df_emp = df_emp.withColumn('“PerformanceCategory”', 
                               F.when(F.col('Salary')> 90000, 'High')\
                               .when(F.col('Salary')>70000, 'Medium')\
                               .otherwise('Low')                               
                               )
new_df_emp.show()

+-----+-------+------+------+-----------+------+---------------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|“PerformanceCategory”|
+-----+-------+------+------+-----------+------+---------------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|                  Low|
|    2|  Priya|   102| 80000| 2019-03-15|     F|               Medium|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|               Medium|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|                  Low|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|               Medium|
|    6|   Riya|   101| 72000| 2023-01-05|     F|               Medium|
|    7|    Dev|   103| 67000| 2020-09-23|     M|                  Low|
|    8| Simran|   102| 81000| 2021-05-25|     F|               Medium|
|    9|  Karan|   101| 95000| 2018-12-11|     M|                 High|
|   10|   Neha|   103| 58000| 2023-04-30|     F|                  Low|
+-----+-------+------+------+-----------+------+---------------------+

