# Task 1: Data Loading 

In [1]:
import pandas as pd

# Load the employees.csv file into a Pandas DataFrame. 
df = pd.read_csv('employees.csv')

In [2]:
# Print the first 10 rows of the dataset
df.head(10)

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Joining_Date,Resigned
0,1001,Steven Dunn,30.0,IT,46846.0,09/06/2024,False
1,1002,Jennifer Romero,54.0,IT,43585.0,07/02/2018,False
2,1003,Dana Haney,59.0,Finance,,11/01/2019,False
3,1004,Louis Powers,60.0,Finance,58094.0,26/09/2019,False
4,1005,Jennifer Gilmore,54.0,Marketing,80848.0,09/02/2020,False
5,1006,Joshua Shelton,31.0,HR,69143.0,24/08/2021,True
6,1007,Thomas Lopez,39.0,HR,66678.0,24/05/2015,False
7,1008,Derrick Simmons,39.0,Finance,67837.0,23/06/2023,True
8,1009,Lori Short,56.0,IT,58575.0,28/02/2015,True
9,1010,Michael Foley,29.0,IT,54935.0,03/06/2020,True


# Task 2: Data Cleaning 

In [3]:
# 1 Handle Missing Data
# Identify columns with missing values. 
df

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Joining_Date,Resigned
0,1001,Steven Dunn,30.0,IT,46846.0,09/06/2024,False
1,1002,Jennifer Romero,54.0,IT,43585.0,07/02/2018,False
2,1003,Dana Haney,59.0,Finance,,11/01/2019,False
3,1004,Louis Powers,60.0,Finance,58094.0,26/09/2019,False
4,1005,Jennifer Gilmore,54.0,Marketing,80848.0,09/02/2020,False
...,...,...,...,...,...,...,...
95,1096,John Walton,28.0,IT,52401.0,13/08/2017,True
96,1097,Matthew Alexander,25.0,IT,61221.0,18/11/2018,False
97,1098,Kelly Cole,25.0,HR,69045.0,14/07/2018,False
98,1099,Melissa Harper,52.0,IT,61606.0,20/10/2022,True


In [4]:
# Fill missing values in the Age column with the median age. 
df['Age'] = df['Age'].fillna(df['Age'].median())

print(df['Age'])

0     30.0
1     54.0
2     59.0
3     60.0
4     54.0
      ... 
95    28.0
96    25.0
97    25.0
98    52.0
99    45.5
Name: Age, Length: 100, dtype: float64


In [5]:
# Fill missing values in the Salary column with the mean salary. 
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

df

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Joining_Date,Resigned
0,1001,Steven Dunn,30.0,IT,46846.000000,09/06/2024,False
1,1002,Jennifer Romero,54.0,IT,43585.000000,07/02/2018,False
2,1003,Dana Haney,59.0,Finance,59643.923913,11/01/2019,False
3,1004,Louis Powers,60.0,Finance,58094.000000,26/09/2019,False
4,1005,Jennifer Gilmore,54.0,Marketing,80848.000000,09/02/2020,False
...,...,...,...,...,...,...,...
95,1096,John Walton,28.0,IT,52401.000000,13/08/2017,True
96,1097,Matthew Alexander,25.0,IT,61221.000000,18/11/2018,False
97,1098,Kelly Cole,25.0,HR,69045.000000,14/07/2018,False
98,1099,Melissa Harper,52.0,IT,61606.000000,20/10/2022,True


In [6]:
# 2 Remove Duplicates
# Check for duplicate rows and remove them if any exist. 
df.drop_duplicates()

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Joining_Date,Resigned
0,1001,Steven Dunn,30.0,IT,46846.000000,09/06/2024,False
1,1002,Jennifer Romero,54.0,IT,43585.000000,07/02/2018,False
2,1003,Dana Haney,59.0,Finance,59643.923913,11/01/2019,False
3,1004,Louis Powers,60.0,Finance,58094.000000,26/09/2019,False
4,1005,Jennifer Gilmore,54.0,Marketing,80848.000000,09/02/2020,False
...,...,...,...,...,...,...,...
95,1096,John Walton,28.0,IT,52401.000000,13/08/2017,True
96,1097,Matthew Alexander,25.0,IT,61221.000000,18/11/2018,False
97,1098,Kelly Cole,25.0,HR,69045.000000,14/07/2018,False
98,1099,Melissa Harper,52.0,IT,61606.000000,20/10/2022,True


In [7]:
# 3 Standardize Columns
# Convert all column names to lowercase for consistency. 
df.columns = df.columns.str.lower()
df

Unnamed: 0,employee_id,name,age,department,salary,joining_date,resigned
0,1001,Steven Dunn,30.0,IT,46846.000000,09/06/2024,False
1,1002,Jennifer Romero,54.0,IT,43585.000000,07/02/2018,False
2,1003,Dana Haney,59.0,Finance,59643.923913,11/01/2019,False
3,1004,Louis Powers,60.0,Finance,58094.000000,26/09/2019,False
4,1005,Jennifer Gilmore,54.0,Marketing,80848.000000,09/02/2020,False
...,...,...,...,...,...,...,...
95,1096,John Walton,28.0,IT,52401.000000,13/08/2017,True
96,1097,Matthew Alexander,25.0,IT,61221.000000,18/11/2018,False
97,1098,Kelly Cole,25.0,HR,69045.000000,14/07/2018,False
98,1099,Melissa Harper,52.0,IT,61606.000000,20/10/2022,True


In [8]:
# 4 Convert Data Types
# Convert all the column’s data type to the most relevant data type. 
df['name'] = df['name'].astype(str)
df['department'] = df['department'].astype(str)
df['salary'] = df['salary'].astype(int)
df['age'] = df['age'].astype(int)
df['joining_date'] = df['joining_date'].astype('datetime64[ns]')
print(df.dtypes)

employee_id              int64
name                    object
age                      int64
department              object
salary                   int64
joining_date    datetime64[ns]
resigned                  bool
dtype: object


# Task 3: Data Manipulation

In [9]:
# Filter out employees who are above the age of 25 or have a salary above 40,000, and are not resigned. 
filter = df[((df['age'] > 25) | (df['salary'] > 40000)) & (df['resigned'] == False)]
filter

Unnamed: 0,employee_id,name,age,department,salary,joining_date,resigned
0,1001,Steven Dunn,30,IT,46846,2024-09-06,False
1,1002,Jennifer Romero,54,IT,43585,2018-07-02,False
2,1003,Dana Haney,59,Finance,59643,2019-11-01,False
3,1004,Louis Powers,60,Finance,58094,2019-09-26,False
4,1005,Jennifer Gilmore,54,Marketing,80848,2020-09-02,False
6,1007,Thomas Lopez,39,HR,66678,2015-05-24,False
10,1011,Steven Macias,59,HR,46928,2018-02-04,False
11,1012,Joseph Erickson,45,HR,65527,2024-03-03,False
15,1016,Karina Allen,46,IT,66746,2019-04-02,False
17,1018,Bryan Golden,30,Finance,78126,2015-07-05,False


In [10]:
#Create a new column YearsInCompany by calculating the number of years each employee has worked in the company (from JoiningDate to today). 

from datetime import datetime

# Convert the 'joining_date' column to datetime format with the correct date format
df['joining_date'] = pd.to_datetime(df['joining_date'], format='%d/%m/%Y', errors='coerce')

# Calculate the number of years each employee has worked in the company
df['years_in_company'] = (datetime.now() - df['joining_date']).dt.days // 365

print(df['years_in_company'])

0     0
1     6
2     5
3     5
4     4
     ..
95    7
96    6
97    6
98    2
99    7
Name: years_in_company, Length: 100, dtype: int64


In [11]:
#Group employees by Department and calculate the average Salary and median Age for each department. 
grouping= df.groupby('department').agg({'salary': 'mean', 'age': 'median'}).reset_index()
print(grouping)

  department        salary   age
0    Finance  58612.473684  45.0
1         HR  60466.379310  45.0
2         IT  57740.750000  46.5
3  Marketing  62475.950000  45.0


In [12]:
#Increase the Salary of all employees in the IT department by 10%. 
df.loc[df['department'] == 'IT', 'salary'] *= 1.10
print(df['salary'])

0     51530.6
1     47943.5
2     59643.0
3     58094.0
4     80848.0
       ...   
95    57641.1
96    67343.1
97    69045.0
98    67766.6
99    42862.0
Name: salary, Length: 100, dtype: float64


 65607.3 88990.  44589.6 71630.9 54129.9 89212.2 85521.7 45851.3 47385.8
 75313.7 49066.6 52306.1 94227.1 45295.8 60960.9 72630.8 61947.6 58601.4
 44221.1 45807.3 57641.1 67343.1 67766.6]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[df['department'] == 'IT', 'salary'] *= 1.10


# Task 4: Data Transformation

In [13]:
# Rename the resigned column to is_resigned.
df.rename(columns = {'resigned' : 'is_resigned'}, inplace = True)
df

Unnamed: 0,employee_id,name,age,department,salary,joining_date,is_resigned,years_in_company
0,1001,Steven Dunn,30,IT,51530.6,2024-09-06,False,0
1,1002,Jennifer Romero,54,IT,47943.5,2018-07-02,False,6
2,1003,Dana Haney,59,Finance,59643.0,2019-11-01,False,5
3,1004,Louis Powers,60,Finance,58094.0,2019-09-26,False,5
4,1005,Jennifer Gilmore,54,Marketing,80848.0,2020-09-02,False,4
...,...,...,...,...,...,...,...,...
95,1096,John Walton,28,IT,57641.1,2017-08-13,True,7
96,1097,Matthew Alexander,25,IT,67343.1,2018-11-18,False,6
97,1098,Kelly Cole,25,HR,69045.0,2018-07-14,False,6
98,1099,Melissa Harper,52,IT,67766.6,2022-10-20,True,2


In [14]:
# Save only the following columns to a new DataFrame: Employee_ID, Name, Age, Department, Salary, and YearsInCompany.  
new_df = df[['employee_id', 'name', 'age', 'department', 'salary', 'years_in_company']]
new_df

Unnamed: 0,employee_id,name,age,department,salary,years_in_company
0,1001,Steven Dunn,30,IT,51530.6,0
1,1002,Jennifer Romero,54,IT,47943.5,6
2,1003,Dana Haney,59,Finance,59643.0,5
3,1004,Louis Powers,60,Finance,58094.0,5
4,1005,Jennifer Gilmore,54,Marketing,80848.0,4
...,...,...,...,...,...,...
95,1096,John Walton,28,IT,57641.1,7
96,1097,Matthew Alexander,25,IT,67343.1,6
97,1098,Kelly Cole,25,HR,69045.0,6
98,1099,Melissa Harper,52,IT,67766.6,2


In [15]:
# Sort the data by Joining Date in descending order. 
sort_desc = df.sort_values(by = 'joining_date', ascending = False)
sort_desc

Unnamed: 0,employee_id,name,age,department,salary,joining_date,is_resigned,years_in_company
39,1040,Benjamin Lucas,39,HR,76407.0,2024-12-01,False,0
72,1073,Stephanie Hanson,45,IT,94227.1,2024-11-25,True,0
59,1060,Holly Rodriguez,41,HR,85452.0,2024-09-10,False,0
0,1001,Steven Dunn,30,IT,51530.6,2024-09-06,False,0
33,1034,April Yoder,55,Finance,53113.0,2024-07-20,False,0
...,...,...,...,...,...,...,...,...
41,1042,Phillip Ford,58,Marketing,57133.0,2015-06-18,False,9
6,1007,Thomas Lopez,39,HR,66678.0,2015-05-24,False,9
19,1020,Robert Jackson,30,Marketing,48234.0,2015-03-14,True,9
8,1009,Lori Short,56,IT,64432.5,2015-02-28,True,9


# Task 5: Save the Transformed Data 

In [16]:
# Save the final cleaned and transformed dataset to a new file named cleaned_employees.json in JSON format.
df.to_json('cleaned_employees.json', orient='records', lines=True)