## Create Spark session and import requrired libraries


In [2]:
!pip install delta-spark

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable
spark = SparkSession.builder.appName("PySparkPractice").getOrCreate()




## Create Schema and data

In [3]:
# Employee Schema
emp_schema = StructType([
    StructField("EmpID", IntegerType(), True),
    StructField("EmpName", StringType(), True),
    StructField("DeptID", IntegerType(), True),
    StructField("Salary", IntegerType(), True),
    StructField("JoiningDate", StringType(), True),
    StructField("Gender", StringType(), True)
])

emp_data = [
    (1, "Amit", 101, 70000, "2020-01-10", "M"),
    (2, "Priya", 102, 80000, "2019-03-15", "F"),
    (3, "Rohit", 101, 90000, "2021-06-01", "M"),
    (4, "Sneha", 103, 60000, "2020-11-20", "F"),
    (5, "Ankit", 102, 75000, "2022-02-10", "M"),
    (6, "Riya", 101, 72000, "2023-01-05", "F"),
    (7, "Dev", 103, 67000, "2020-09-23", "M"),
    (8, "Simran", 102, 81000, "2021-05-25", "F"),
    (9, "Karan", 101, 95000, "2018-12-11", "M"),
    (10, "Neha", 103, 58000, "2023-04-30", "F")
]

df_emp = spark.createDataFrame(emp_data, schema=emp_schema)

# Department Schema
dept_schema = StructType([
    StructField("DeptID", IntegerType(), True),
    StructField("DeptName", StringType(), True),
    StructField("Location", StringType(), True)
])

dept_data = [
    (101, "IT", "Bangalore"),
    (102, "HR", "Hyderabad"),
    (103, "Finance", "Pune")
]

df_dept = spark.createDataFrame(dept_data, schema=dept_schema)

# Sales Data
sales_schema = StructType([
    StructField("SaleID", IntegerType(), True),
    StructField("EmpID", IntegerType(), True),
    StructField("SaleAmount", IntegerType(), True),
    StructField("SaleDate", StringType(), True)
])

sales_data = [
    (1, 1, 1000, "2024-01-10"),
    (2, 2, 2000, "2024-01-12"),
    (3, 3, 1500, "2024-02-01"),
    (4, 1, 2500, "2024-02-15"),
    (5, 4, 3000, "2024-03-20"),
    (6, 2, 1800, "2024-03-22"),
    (7, 6, 2200, "2024-04-10"),
    (8, 9, 4000, "2024-04-15")
]

df_sales = spark.createDataFrame(sales_data, schema=sales_schema)

## Beginner Level (1‚Äì15)

### 1Ô∏è Show first 5 records of employee dataset

In [4]:
df_emp.show(5)

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
+-----+-------+------+------+-----------+------+
only showing top 5 rows


### 2Ô∏è Display schema of employee DataFrame

In [5]:
df_emp.printSchema()

root
 |-- EmpID: integer (nullable = true)
 |-- EmpName: string (nullable = true)
 |-- DeptID: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- JoiningDate: string (nullable = true)
 |-- Gender: string (nullable = true)



### 3Ô∏è Select only EmpName and Salary columns

In [6]:
df_emp.select('EmpName','Salary').show()

+-------+------+
|EmpName|Salary|
+-------+------+
|   Amit| 70000|
|  Priya| 80000|
|  Rohit| 90000|
|  Sneha| 60000|
|  Ankit| 75000|
|   Riya| 72000|
|    Dev| 67000|
| Simran| 81000|
|  Karan| 95000|
|   Neha| 58000|
+-------+------+



### 4Ô∏è Filter employees with salary > 75000

In [7]:
df_emp.filter(df_emp.Salary >75000).show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
+-----+-------+------+------+-----------+------+



### 5Ô∏è Count total number of employees

In [8]:
df_emp.count()

10

### 6Ô∏è Find distinct department IDs

In [9]:
df_emp.select("DeptID").distinct().show()

+------+
|DeptID|
+------+
|   101|
|   103|
|   102|
+------+



### 7Ô∏è Sort employees by salary descending

In [10]:
df_emp.orderBy(col("Salary").desc()).show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 8Ô∏è Add a new column Bonus = 10% of Salary

In [11]:
df_emp.withColumn("Bonus", col("Salary")*0.1).show()

+-----+-------+------+------+-----------+------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender| Bonus|
+-----+-------+------+------+-----------+------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|7000.0|
|    2|  Priya|   102| 80000| 2019-03-15|     F|8000.0|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|9000.0|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|6000.0|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|7500.0|
|    6|   Riya|   101| 72000| 2023-01-05|     F|7200.0|
|    7|    Dev|   103| 67000| 2020-09-23|     M|6700.0|
|    8| Simran|   102| 81000| 2021-05-25|     F|8100.0|
|    9|  Karan|   101| 95000| 2018-12-11|     M|9500.0|
|   10|   Neha|   103| 58000| 2023-04-30|     F|5800.0|
+-----+-------+------+------+-----------+------+------+



### 9Ô∏è Rename column ‚ÄúEmpName‚Äù to ‚ÄúEmployeeName‚Äù

In [12]:
df_emp.withColumnRenamed("EmpName", "EmployeeName").show()

+-----+------------+------+------+-----------+------+
|EmpID|EmployeeName|DeptID|Salary|JoiningDate|Gender|
+-----+------------+------+------+-----------+------+
|    1|        Amit|   101| 70000| 2020-01-10|     M|
|    2|       Priya|   102| 80000| 2019-03-15|     F|
|    3|       Rohit|   101| 90000| 2021-06-01|     M|
|    4|       Sneha|   103| 60000| 2020-11-20|     F|
|    5|       Ankit|   102| 75000| 2022-02-10|     M|
|    6|        Riya|   101| 72000| 2023-01-05|     F|
|    7|         Dev|   103| 67000| 2020-09-23|     M|
|    8|      Simran|   102| 81000| 2021-05-25|     F|
|    9|       Karan|   101| 95000| 2018-12-11|     M|
|   10|        Neha|   103| 58000| 2023-04-30|     F|
+-----+------------+------+------+-----------+------+



### Find maximum & minimum salary

In [13]:
df_emp.agg(max("Salary").alias("Max_Salary")).show()

df_emp.agg(min("Salary").alias("Min_Salary")).show()

+----------+
|Max_Salary|
+----------+
|     95000|
+----------+

+----------+
|Min_Salary|
+----------+
|     58000|
+----------+



### 11Ô∏è Group by DeptID and find average salary

In [14]:
df_emp.groupBy('DeptID').agg(avg('Salary').alias('Avg_Salary')).show()

+------+------------------+
|DeptID|        Avg_Salary|
+------+------------------+
|   101|           81750.0|
|   103|61666.666666666664|
|   102| 78666.66666666667|
+------+------------------+



### 12Ô∏è Filter female employees

In [15]:
df_emp.filter(col('Gender') == 'F').show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 13Ô∏è Show number of employees by gender

In [16]:
df_emp.groupBy(col('Gender')).agg(count(col("EmpID")).alias("Num_Employees")).show()

+------+-------------+
|Gender|Num_Employees|
+------+-------------+
|     F|            5|
|     M|            5|
+------+-------------+



### 14Ô∏è Add column ‚ÄúYearsOfExperience‚Äù based on JoiningDate

In [17]:
df_emp.withColumn("YearsOfExperience", round(months_between(current_date(), to_date(col("JoiningDate")))/12, 2)).show()

+-----+-------+------+------+-----------+------+-----------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|YearsOfExperience|
+-----+-------+------+------+-----------+------+-----------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|             5.91|
|    2|  Priya|   102| 80000| 2019-03-15|     F|             6.73|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|             4.52|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|             5.05|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|             3.83|
|    6|   Riya|   101| 72000| 2023-01-05|     F|             2.92|
|    7|    Dev|   103| 67000| 2020-09-23|     M|             5.21|
|    8| Simran|   102| 81000| 2021-05-25|     F|             4.53|
|    9|  Karan|   101| 95000| 2018-12-11|     M|             6.99|
|   10|   Neha|   103| 58000| 2023-04-30|     F|              2.6|
+-----+-------+------+------+-----------+------+-----------------+



### 15Ô∏è Combine department name with employee data

In [18]:
df_emp.join(df_dept, on='DeptID', how='inner').show()

+------+-----+-------+------+-----------+------+--------+---------+
|DeptID|EmpID|EmpName|Salary|JoiningDate|Gender|DeptName| Location|
+------+-----+-------+------+-----------+------+--------+---------+
|   101|    1|   Amit| 70000| 2020-01-10|     M|      IT|Bangalore|
|   101|    3|  Rohit| 90000| 2021-06-01|     M|      IT|Bangalore|
|   101|    6|   Riya| 72000| 2023-01-05|     F|      IT|Bangalore|
|   101|    9|  Karan| 95000| 2018-12-11|     M|      IT|Bangalore|
|   102|    2|  Priya| 80000| 2019-03-15|     F|      HR|Hyderabad|
|   102|    5|  Ankit| 75000| 2022-02-10|     M|      HR|Hyderabad|
|   102|    8| Simran| 81000| 2021-05-25|     F|      HR|Hyderabad|
|   103|    4|  Sneha| 60000| 2020-11-20|     F| Finance|     Pune|
|   103|    7|    Dev| 67000| 2020-09-23|     M| Finance|     Pune|
|   103|   10|   Neha| 58000| 2023-04-30|     F| Finance|     Pune|
+------+-----+-------+------+-----------+------+--------+---------+



In [19]:
df_emp.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[EmpID#0,EmpName#1,DeptID#2,Salary#3,JoiningDate#4,Gender#5]




## Intermediate Level (16‚Äì35)

### 16Ô∏è Get highest-paid employee in each departmen

In [20]:
df_dept.printSchema()

root
 |-- DeptID: integer (nullable = true)
 |-- DeptName: string (nullable = true)
 |-- Location: string (nullable = true)



In [21]:
df_Higest_Pay_Emp = df_emp.groupBy(F.col('DeptID').alias('Dept_Id')).agg(F.max(F.col('Salary')).alias('Max_salary'))
df_Higest_Pay_Emp.show()

#### Join with department table
df_Higest_Pay_Emp_dept = df_Higest_Pay_Emp.join(df_dept, df_Higest_Pay_Emp['Dept_Id'] == df_dept['DeptID'], 'inner')
result = df_Higest_Pay_Emp_dept.select(F.col('DeptID'), F.col('Max_salary'), F.col('DeptName'),F.col('Location'))
result.show()

+-------+----------+
|Dept_Id|Max_salary|
+-------+----------+
|    101|     95000|
|    103|     67000|
|    102|     81000|
+-------+----------+

+------+----------+--------+---------+
|DeptID|Max_salary|DeptName| Location|
+------+----------+--------+---------+
|   101|     95000|      IT|Bangalore|
|   103|     67000| Finance|     Pune|
|   102|     81000|      HR|Hyderabad|
+------+----------+--------+---------+



#### 17Ô∏è Find employees with name starting with ‚ÄúS‚Äù

In [22]:
df_S_emp = df_emp.filter(F.col('EmpName').startswith("S"))

df_S_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    8| Simran|   102| 81000| 2021-05-25|     F|
+-----+-------+------+------+-----------+------+



In [23]:
df_sales.printSchema()

root
 |-- SaleID: integer (nullable = true)
 |-- EmpID: integer (nullable = true)
 |-- SaleAmount: integer (nullable = true)
 |-- SaleDate: string (nullable = true)



#### 18Ô∏è Find total sales amount per employee

In [24]:
df_total_sales_per_emp = df_sales.groupBy(F.col('EmpID')).agg(F.sum(F.col('SaleAmount')).alias('Total_Sales_Amount'))
df_total_sales_per_emp.show()

+-----+------------------+
|EmpID|Total_Sales_Amount|
+-----+------------------+
|    1|              3500|
|    3|              1500|
|    2|              3800|
|    6|              2200|
|    9|              4000|
|    4|              3000|
+-----+------------------+



#### 19 Join employee and sales data

In [25]:
df_combine_emp_sales = df_emp.join(df_sales, df_emp['EmpID'] == df_sales['EmpID'], 'inner').drop(df_sales['EmpID'])
df_combine_emp_sales.show()

+-----+-------+------+------+-----------+------+------+----------+----------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|SaleID|SaleAmount|  SaleDate|
+-----+-------+------+------+-----------+------+------+----------+----------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|     1|      1000|2024-01-10|
|    1|   Amit|   101| 70000| 2020-01-10|     M|     4|      2500|2024-02-15|
|    2|  Priya|   102| 80000| 2019-03-15|     F|     2|      2000|2024-01-12|
|    2|  Priya|   102| 80000| 2019-03-15|     F|     6|      1800|2024-03-22|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|     3|      1500|2024-02-01|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|     5|      3000|2024-03-20|
|    6|   Riya|   101| 72000| 2023-01-05|     F|     7|      2200|2024-04-10|
|    9|  Karan|   101| 95000| 2018-12-11|     M|     8|      4000|2024-04-15|
+-----+-------+------+------+-----------+------+------+----------+----------+



#### 20Ô∏è Find employees who have not made any sales

In [26]:
df_not_sale_emp = df_emp.join(df_sales, df_emp['EmpID'] == df_sales['EmpID'], 'left_anti')
df_not_sale_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



#### 21Ô∏è Add ‚ÄúPerformanceCategory‚Äù column

In [27]:
new_df_emp = df_emp.withColumn('‚ÄúPerformanceCategory‚Äù', 
                               F.when(F.col('Salary')> 90000, 'High')\
                               .when(F.col('Salary')>70000, 'Medium')\
                               .otherwise('Low')                               
                               )
new_df_emp.show()

+-----+-------+------+------+-----------+------+---------------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|‚ÄúPerformanceCategory‚Äù|
+-----+-------+------+------+-----------+------+---------------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|                  Low|
|    2|  Priya|   102| 80000| 2019-03-15|     F|               Medium|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|               Medium|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|                  Low|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|               Medium|
|    6|   Riya|   101| 72000| 2023-01-05|     F|               Medium|
|    7|    Dev|   103| 67000| 2020-09-23|     M|                  Low|
|    8| Simran|   102| 81000| 2021-05-25|     F|               Medium|
|    9|  Karan|   101| 95000| 2018-12-11|     M|                 High|
|   10|   Neha|   103| 58000| 2023-04-30|     F|                  Low|
+-----+-------+------+------+-----------+------+---------------------+



#### 22Ô∏è Convert all employee names to uppercase

In [28]:
df_emp_upper = df_emp.withColumn('EmpName_upper', F.upper(F.col('EmpName')))
df_emp_upper.show()


+-----+-------+------+------+-----------+------+-------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|EmpName_upper|
+-----+-------+------+------+-----------+------+-------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|         AMIT|
|    2|  Priya|   102| 80000| 2019-03-15|     F|        PRIYA|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|        ROHIT|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|        SNEHA|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|        ANKIT|
|    6|   Riya|   101| 72000| 2023-01-05|     F|         RIYA|
|    7|    Dev|   103| 67000| 2020-09-23|     M|          DEV|
|    8| Simran|   102| 81000| 2021-05-25|     F|       SIMRAN|
|    9|  Karan|   101| 95000| 2018-12-11|     M|        KARAN|
|   10|   Neha|   103| 58000| 2023-04-30|     F|         NEHA|
+-----+-------+------+------+-----------+------+-------------+



#### 23Ô∏è Find total sales per month

In [29]:
df_sales.show()

+------+-----+----------+----------+
|SaleID|EmpID|SaleAmount|  SaleDate|
+------+-----+----------+----------+
|     1|    1|      1000|2024-01-10|
|     2|    2|      2000|2024-01-12|
|     3|    3|      1500|2024-02-01|
|     4|    1|      2500|2024-02-15|
|     5|    4|      3000|2024-03-20|
|     6|    2|      1800|2024-03-22|
|     7|    6|      2200|2024-04-10|
|     8|    9|      4000|2024-04-15|
+------+-----+----------+----------+



In [30]:
df_monthly_sales = df_sales.groupBy(F.month(F.col('SaleDate')).alias('Sales_by_Month')).agg(F.sum(F.col('SaleAmount')).alias('Total_sales'))
df_monthly_sales.show()

+--------------+-----------+
|Sales_by_Month|Total_sales|
+--------------+-----------+
|             1|       3000|
|             2|       4000|
|             3|       4800|
|             4|       6200|
+--------------+-----------+



#### 24Ô∏è Find top 3 highest earners

In [31]:
topEarners =  df_emp.orderBy(F.col('salary').desc()).limit(3)
topEarners.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
+-----+-------+------+------+-----------+------+



### 25Ô∏è Create a window to rank employees by salary

In [32]:

w = Window.orderBy(F.col('Salary').desc())
df_emp_rnked = df_emp.withColumn('SalaryRanked', F.dense_rank().over(w))
df_emp_rnked.show()

+-----+-------+------+------+-----------+------+------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|SalaryRanked|
+-----+-------+------+------+-----------+------+------------+
|    9|  Karan|   101| 95000| 2018-12-11|     M|           1|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|           2|
|    8| Simran|   102| 81000| 2021-05-25|     F|           3|
|    2|  Priya|   102| 80000| 2019-03-15|     F|           4|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|           5|
|    6|   Riya|   101| 72000| 2023-01-05|     F|           6|
|    1|   Amit|   101| 70000| 2020-01-10|     M|           7|
|    7|    Dev|   103| 67000| 2020-09-23|     M|           8|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|           9|
|   10|   Neha|   103| 58000| 2023-04-30|     F|          10|
+-----+-------+------+------+-----------+------+------------+



#### 26Ô∏è Rank employees within each department

In [33]:
w = Window.partitionBy(F.col('DeptID')).orderBy(F.desc(F.col('Salary')))

df_rnk_salary_dept = df_emp.withColumn('Dept_WiseSalary_Rank', F.dense_rank().over(w)).orderBy(F.col('DeptID'))
df_rnk_salary_dept.show()



+-----+-------+------+------+-----------+------+--------------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|Dept_WiseSalary_Rank|
+-----+-------+------+------+-----------+------+--------------------+
|    9|  Karan|   101| 95000| 2018-12-11|     M|                   1|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|                   2|
|    6|   Riya|   101| 72000| 2023-01-05|     F|                   3|
|    1|   Amit|   101| 70000| 2020-01-10|     M|                   4|
|    8| Simran|   102| 81000| 2021-05-25|     F|                   1|
|    2|  Priya|   102| 80000| 2019-03-15|     F|                   2|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|                   3|
|    7|    Dev|   103| 67000| 2020-09-23|     M|                   1|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|                   2|
|   10|   Neha|   103| 58000| 2023-04-30|     F|                   3|
+-----+-------+------+------+-----------+------+--------------------+



#### 28Ô∏è Find department with highest total salary

In [34]:
df_dept_high = df_emp.groupBy(F.col('DeptID')).agg(F.sum(F.col('Salary')).alias('Total_Amount')).orderBy(F.col('Total_Amount'))
df_dept_high.show()

+------+------------+
|DeptID|Total_Amount|
+------+------------+
|   103|      185000|
|   102|      236000|
|   101|      327000|
+------+------------+



#### 29 Calculate average sales per employee

In [35]:
df_sales.show()
avg_sales_emp = df_sales.groupBy('EmpID').agg(F.avg('SaleAmount').alias('AvarageSales_emp'))
avg_sales_emp.show()

+------+-----+----------+----------+
|SaleID|EmpID|SaleAmount|  SaleDate|
+------+-----+----------+----------+
|     1|    1|      1000|2024-01-10|
|     2|    2|      2000|2024-01-12|
|     3|    3|      1500|2024-02-01|
|     4|    1|      2500|2024-02-15|
|     5|    4|      3000|2024-03-20|
|     6|    2|      1800|2024-03-22|
|     7|    6|      2200|2024-04-10|
|     8|    9|      4000|2024-04-15|
+------+-----+----------+----------+

+-----+----------------+
|EmpID|AvarageSales_emp|
+-----+----------------+
|    1|          1750.0|
|    3|          1500.0|
|    2|          1900.0|
|    6|          2200.0|
|    9|          4000.0|
|    4|          3000.0|
+-----+----------------+



#### 30Ô∏è Find duplicate department IDs

In [36]:
getDuplicate = df_emp.groupBy(F.col('EmpID')).count().filter('count > 1')
getDuplicate.show()

+-----+-----+
|EmpID|count|
+-----+-----+
+-----+-----+



### 31Ô∏è Calculate salary percentile by department

In [37]:
df_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



In [38]:
w = Window.partitionBy(F.col('DeptID')).orderBy(F.col('Salary'))
df_percentile = df_emp.withColumn('PercentRank', F.percent_rank().over(w))
df_percentile.show()

+-----+-------+------+------+-----------+------+------------------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|       PercentRank|
+-----+-------+------+------+-----------+------+------------------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|               0.0|
|    6|   Riya|   101| 72000| 2023-01-05|     F|0.3333333333333333|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|0.6666666666666666|
|    9|  Karan|   101| 95000| 2018-12-11|     M|               1.0|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|               0.0|
|    2|  Priya|   102| 80000| 2019-03-15|     F|               0.5|
|    8| Simran|   102| 81000| 2021-05-25|     F|               1.0|
|   10|   Neha|   103| 58000| 2023-04-30|     F|               0.0|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|               0.5|
|    7|    Dev|   103| 67000| 2020-09-23|     M|               1.0|
+-----+-------+------+------+-----------+------+------------------+



### 32 Add column indicating whether the employee has any sales

In [39]:
df_sales.printSchema()

root
 |-- SaleID: integer (nullable = true)
 |-- EmpID: integer (nullable = true)
 |-- SaleAmount: integer (nullable = true)
 |-- SaleDate: string (nullable = true)



In [40]:
df_em_sales = df_emp.join(df_sales.select("EmpID").distinct(), "EmpId", 'left')
df_em_sales.show()

cal_df = df_em_sales.withColumn("HasSales", F.when(F.col('EmpID').isNull(), True).otherwise(False))
cal_df.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+

+-----+-------+------+------+-----------+------+--------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|HasSales|
+-----+-------+------+------+-----------+------+--------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|   false|
|    2|  Priya|   102| 80000| 2019-03-15|     F|   false|
|    3|  Rohit|   101| 

### 33Ô∏è Replace null values with defaults

In [41]:
null_df = df_emp.fillna({"Salary": 50000})
null_df.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 34Ô∏è Calculate cumulative sales per employee

In [42]:
df_sales.printSchema()

root
 |-- SaleID: integer (nullable = true)
 |-- EmpID: integer (nullable = true)
 |-- SaleAmount: integer (nullable = true)
 |-- SaleDate: string (nullable = true)



In [43]:
win = Window.partitionBy('EmpID').orderBy('SaleDate')

cum_df = df_sales.withColumn('cumulativeSales', F.sum('SaleAmount').over(win))

cum_df.show()




+------+-----+----------+----------+---------------+
|SaleID|EmpID|SaleAmount|  SaleDate|cumulativeSales|
+------+-----+----------+----------+---------------+
|     1|    1|      1000|2024-01-10|           1000|
|     4|    1|      2500|2024-02-15|           3500|
|     2|    2|      2000|2024-01-12|           2000|
|     6|    2|      1800|2024-03-22|           3800|
|     3|    3|      1500|2024-02-01|           1500|
|     5|    4|      3000|2024-03-20|           3000|
|     7|    6|      2200|2024-04-10|           2200|
|     8|    9|      4000|2024-04-15|           4000|
+------+-----+----------+----------+---------------+



### 35Ô∏è Pivot sales data by month 

In [44]:
df_sales_pi = df_sales.withColumn("Month", F.month('SaleDate')).groupBy('EmpID').pivot('Month').agg(F.sum('SaleAmount'))
df_sales_pi.show()

+-----+----+----+----+----+
|EmpID|   1|   2|   3|   4|
+-----+----+----+----+----+
|    1|1000|2500|NULL|NULL|
|    6|NULL|NULL|NULL|2200|
|    3|NULL|1500|NULL|NULL|
|    9|NULL|NULL|NULL|4000|
|    4|NULL|NULL|3000|NULL|
|    2|2000|NULL|1800|NULL|
+-----+----+----+----+----+



# üî¥ Advanced Level (36‚Äì50)

#### 36Ô∏è Optimize joins using broadcast

In [45]:
df_join = df_emp.join(F.broadcast(df_dept), 'DeptID')
df_join.show()

+------+-----+-------+------+-----------+------+--------+---------+
|DeptID|EmpID|EmpName|Salary|JoiningDate|Gender|DeptName| Location|
+------+-----+-------+------+-----------+------+--------+---------+
|   101|    1|   Amit| 70000| 2020-01-10|     M|      IT|Bangalore|
|   102|    2|  Priya| 80000| 2019-03-15|     F|      HR|Hyderabad|
|   101|    3|  Rohit| 90000| 2021-06-01|     M|      IT|Bangalore|
|   103|    4|  Sneha| 60000| 2020-11-20|     F| Finance|     Pune|
|   102|    5|  Ankit| 75000| 2022-02-10|     M|      HR|Hyderabad|
|   101|    6|   Riya| 72000| 2023-01-05|     F|      IT|Bangalore|
|   103|    7|    Dev| 67000| 2020-09-23|     M| Finance|     Pune|
|   102|    8| Simran| 81000| 2021-05-25|     F|      HR|Hyderabad|
|   101|    9|  Karan| 95000| 2018-12-11|     M|      IT|Bangalore|
|   103|   10|   Neha| 58000| 2023-04-30|     F| Finance|     Pune|
+------+-----+-------+------+-----------+------+--------+---------+



In [46]:
df_join.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [DeptID#2, EmpID#0, EmpName#1, Salary#3, JoiningDate#4, Gender#5, DeptName#7, Location#8]
   +- BroadcastHashJoin [DeptID#2], [DeptID#6], Inner, BuildRight, false
      :- Filter isnotnull(DeptID#2)
      :  +- Scan ExistingRDD[EmpID#0,EmpName#1,DeptID#2,Salary#3,JoiningDate#4,Gender#5]
      +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [plan_id=1748]
         +- Filter isnotnull(DeptID#6)
            +- Scan ExistingRDD[DeptID#6,DeptName#7,Location#8]




### 37Ô∏è Repartition data by DeptID

In [47]:
repart = df_emp.repartition("DeptID")
repart.explain()

repart.groupBy('DeptID').count().show()



== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange hashpartitioning(DeptID#2, 200), REPARTITION_BY_COL, [plan_id=1757]
   +- Scan ExistingRDD[EmpID#0,EmpName#1,DeptID#2,Salary#3,JoiningDate#4,Gender#5]


+------+-----+
|DeptID|count|
+------+-----+
|   101|    4|
|   103|    3|
|   102|    3|
+------+-----+



### 38Ô∏è Cache DataFrame for reuse

In [48]:
df_emp.cache()

df_emp.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
+-----+-------+------+------+-----------+------+



### 39Ô∏è Write employee data as Parquet

In [49]:
df_emp.write.format('parquet').mode('overwrite').save('./temp/employee_parquet')

### 40Ô∏è Read data from Parquet

In [50]:
df_parquet = spark.read.parquet('./temp/employee_parquet')
df_parquet.show()

+-----+-------+------+------+-----------+------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|
+-----+-------+------+------+-----------+------+
|    6|   Riya|   101| 72000| 2023-01-05|     F|
|    7|    Dev|   103| 67000| 2020-09-23|     M|
|    8| Simran|   102| 81000| 2021-05-25|     F|
|    9|  Karan|   101| 95000| 2018-12-11|     M|
|   10|   Neha|   103| 58000| 2023-04-30|     F|
|    1|   Amit|   101| 70000| 2020-01-10|     M|
|    2|  Priya|   102| 80000| 2019-03-15|     F|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|
+-----+-------+------+------+-----------+------+



### 41Ô∏è Handle schema evolution

In [51]:
df_new = df_emp.withColumn('Country', F.lit('India'))
df_new.write.format('parquet').mode('overwrite').option('mergeSchema', True).save('./temp/employee_parquet')

In [52]:
df_new.show()

+-----+-------+------+------+-----------+------+-------+
|EmpID|EmpName|DeptID|Salary|JoiningDate|Gender|Country|
+-----+-------+------+------+-----------+------+-------+
|    1|   Amit|   101| 70000| 2020-01-10|     M|  India|
|    2|  Priya|   102| 80000| 2019-03-15|     F|  India|
|    3|  Rohit|   101| 90000| 2021-06-01|     M|  India|
|    4|  Sneha|   103| 60000| 2020-11-20|     F|  India|
|    5|  Ankit|   102| 75000| 2022-02-10|     M|  India|
|    6|   Riya|   101| 72000| 2023-01-05|     F|  India|
|    7|    Dev|   103| 67000| 2020-09-23|     M|  India|
|    8| Simran|   102| 81000| 2021-05-25|     F|  India|
|    9|  Karan|   101| 95000| 2018-12-11|     M|  India|
|   10|   Neha|   103| 58000| 2023-04-30|     F|  India|
+-----+-------+------+------+-----------+------+-------+



### 42Ô∏è Create Delta table from employee data

In [56]:
df_emp.write.format("delta").mode("overwrite").save("./output/delta/employee_delta")

Py4JJavaError: An error occurred while calling o475.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.classic.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:626)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:135)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:118)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:665)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:665)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:665)
	... 16 more


### 43Ô∏è Update Delta table (SCD Type 1)

In [54]:
from delta.tables import DeltaTable

In [None]:
deltaTable = DeltaTable.