<a href="https://colab.research.google.com/github/simantinip04/Data-Engineering/blob/main/3June25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("EmployeeDataset").getOrCreate()

# Load CSV
file_path = "/content/drive/MyDrive/large_employee_dataset.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# View schema
df.printSchema()

# Show sample data
df.show(5)


root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- City: string (nullable = true)

+----------+-------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+-------------------+---+----------+------+-----------+--------+------------+
|      4128|    Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|      Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883|Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|         Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|      Angela Hooper| 26|   Finance|108773| 2019-

In [7]:
#1.Top 10 rows
df.show(10)

+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [10]:
#2. Count the total number of employees.
total_employees = df.select("EmployeeID").distinct().count()
print(f"Total number of employees: {total_employees}")

Total number of employees: 500


In [11]:
#3. Display unique departments.
unique_departments = df.select("Department").distinct()
unique_departments.show()

+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



In [14]:
#4. Filter all employees in the "IT" department.
it_employees = df.filter(df["Department"] == "IT")
it_employees.show()

+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [15]:
#5. Show employees aged between 30 and 40.
employees = df.filter((df.Age >= 30) & (df.Age <= 40))
employees.show()

+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [16]:
#6. Sort employees by Salary in descending order.
sorted_employees = df.orderBy(df.Salary.desc())
sorted_employees.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

In [17]:
#7. Get the average salary by department.
average_salary_by_department = df.groupBy("Department").avg("Salary")
average_salary_by_department.show()

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [18]:
#8. Count of employees by Status .
employee_count_by_status = df.groupBy("Status").count()
employee_count_by_status.show()

+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+



In [19]:
#9. Highest salary in each city.
from pyspark.sql.functions import max

highest_salary_by_city = df.groupBy("City").agg(max("Salary").alias("HighestSalary"))
highest_salary_by_city.show()

+----------------+-------------+
|            City|HighestSalary|
+----------------+-------------+
|   Wilsonchester|        67025|
|     Bradshawton|       111116|
|       Steventon|        32009|
|     Lake Alyssa|        84903|
|      North Lisa|        57898|
|    North Marvin|        66252|
|     Jenniferton|        39907|
|     Buckleyview|        50109|
|     Burtonville|        98492|
|    Johnsonmouth|        48799|
|    South Joseph|        52456|
|  Lindseychester|        90340|
|   North Stephen|        91947|
|Port Nicoleshire|        57537|
|    Jerrychester|        53374|
|  North Jennifer|        82486|
|      Laurenstad|        44608|
|West Brendanbury|        90698|
|       Juliaberg|        50170|
|       New James|        54378|
+----------------+-------------+
only showing top 20 rows



In [20]:
#10. Total number of employees who joined each year.
from pyspark.sql.functions import year, to_date

df = df.withColumn("JoinYear", year(to_date("JoiningDate", "yyyy-MM-dd")))
df.groupBy("JoinYear").count().show()

+--------+-----+
|JoinYear|count|
+--------+-----+
|    2025|   27|
|    2018|   52|
|    2015|   37|
|    2023|   47|
|    2022|   49|
|    2019|   52|
|    2020|   56|
|    2016|   49|
|    2024|   38|
|    2017|   44|
|    2021|   49|
+--------+-----+



In [21]:
#11. Department-wise count of employees who are currently "Active".
active_employees_by_department = df.filter(df["Status"] == "Active").groupBy("Department").count()
active_employees_by_department.show()

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



In [23]:
#12. Average age of employees per department.
from pyspark.sql.functions import avg

average_age_by_department = df.groupBy("Department").agg(avg("Age").alias("AverageAge"))
average_age_by_department.show()

+----------+------------------+
|Department|        AverageAge|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



In [25]:
#13. Create another dataset with City and Region , and join it.
from pyspark.sql import Row

# Create region DataFrame
region_data = [
    Row(City="New York", Region="East"),
    Row(City="San Francisco", Region="West"),
    Row(City="Chicago", Region="Midwest"),
    Row(City="Austin", Region="South")
]

region_df = spark.createDataFrame(region_data)

# Join on City
joined_df = df.join(region_df, on="City", how="left")
joined_df.show(5)


+------------+----------+-------------------+---+----------+------+-----------+--------+--------+------+
|        City|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|JoinYear|Region|
+------------+----------+-------------------+---+----------+------+-----------+--------+--------+------+
|   Allentown|      4128|    Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|    2018|  NULL|
|Jenniferfurt|      9146|         Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|    2015|  NULL|
|   Gilesstad|      5883|Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|    2025|  NULL|
|Lake Amystad|      1918|      Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|    2019|  NULL|
| Anthonyfort|      6094|      Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|    2015|  NULL|
+------------+----------+-------------------+---+----------+------+-----------+--------+--------+------+
only showing top 5 rows



In [26]:
#14. Group salaries by Region after the join.
from pyspark.sql.functions import sum

# Group salaries by Region
grouped_salaries = joined_df.groupBy("Region").agg(sum("Salary").alias("TotalSalary"))
grouped_salaries.show()

+------+-----------+
|Region|TotalSalary|
+------+-----------+
|  NULL|   37156712|
+------+-----------+



In [27]:
#15. Calculate years of experience for each employee (current date - JoiningDate).
from pyspark.sql.functions import datediff, current_date, floor

df = df.withColumn("JoiningDate", to_date("JoiningDate", "yyyy-MM-dd"))
df = df.withColumn("YearsExperience", floor(datediff(current_date(), "JoiningDate") / 365))
df.select("Name", "YearsExperience").show()

+--------------------+---------------+
|                Name|YearsExperience|
+--------------------+---------------+
|     Charles Johnson|              6|
|       Dylan Camacho|              9|
| Mr. Ryan Bowman Jr.|              0|
|          Brian Ball|              9|
|       Angela Hooper|              5|
|Alexander Johnson...|              9|
|         Steven Lane|              3|
|       Riley Johnson|              9|
|    Emily Washington|              3|
|     Valerie Fleming|              5|
|     Tracy Hughes MD|              5|
|    Johnathan Harmon|              4|
|       Michael Brown|              1|
|       Scott Burnett|              9|
|  Christopher Fuller|              4|
|         Mary Henson|              3|
|       Jerome Torres|              0|
|     Isaiah Martinez|              3|
|       Patrick Chung|              0|
|        Micheal Wade|              1|
+--------------------+---------------+
only showing top 20 rows



In [28]:
#16. List all employees with more than 5 years of experience.
filtered_df = df.filter(df.YearsExperience > 5).select("Name", "YearsExperience")
filtered_df.show()

+--------------------+---------------+
|                Name|YearsExperience|
+--------------------+---------------+
|     Charles Johnson|              6|
|       Dylan Camacho|              9|
|          Brian Ball|              9|
|Alexander Johnson...|              9|
|       Riley Johnson|              9|
|       Scott Burnett|              9|
|       Brittany Kerr|              6|
|         Edwin Burns|              9|
|       Mary Reynolds|              6|
|           Erin Berg|              7|
|         Jason Hines|              9|
|Christopher Mcdaniel|             10|
|      Victoria Kelly|              7|
|      Heather Nelson|             10|
|         Paul Porter|              7|
|         Brian Huynh|              8|
|          James West|              8|
|     Cameron Shelton|              9|
|      Nicole Gilmore|              7|
|          David Wang|              9|
+--------------------+---------------+
only showing top 20 rows

