In [1]:
# import libraries
import pandas as pd
import numpy as np

In [76]:
# Read the raw HR export from disk and preview its first five rows
df = pd.read_csv('hr_dataset.csv')
df.head(5)

Unnamed: 0,Employee_ID,Age,Department,Satisfaction_Level,Last_Evaluation,Projects,Average_Monthly_Hours,Years_at_Company,Left
0,896999,,Finance,0.42,0.08,,999,,1.0
1,331148,,Hr,0.91,0.73,,180,7.0,1.0
2,559437,36.0,Operations,0.93,0.82,7.0,999,4.0,1.0
3,883201,41.0,Finance,0.03,0.53,7.0,297,,1.0
4,562242,,Finance,0.66,0.72,3.0,186,5.0,


In [77]:
# Plain-text dump of the first 40 rows for a wider look at the raw values
print(df.iloc[:40])

    Employee_ID   Age  Department  Satisfaction_Level  Last_Evaluation  \
0        896999   NaN     Finance                0.42             0.08   
1        331148   NaN          Hr                0.91             0.73   
2        559437  36.0  Operations                0.93             0.82   
3        883201  41.0     Finance                0.03             0.53   
4        562242   NaN     Finance                0.66             0.72   
5        538510   NaN       Sales                0.85             0.76   
6        585585  36.0         NaN                0.65             0.39   
7        689574   NaN          Hr                0.89             0.80   
8        394433  58.0          Hr                0.33             0.92   
9        314638   NaN          IT                0.57             0.42   
10         9767   NaN          Hr                0.62             0.26   
11       747950   NaN     Finance                0.71             0.09   
12       102408   NaN     finanCe     

###  DATA CLEANING

In [78]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(30100, 9)

In [79]:
# Count the missing entries in every column
df.isna().sum()

Employee_ID                  0
Age                      15075
Department                3912
Satisfaction_Level           0
Last_Evaluation              0
Projects                 15052
Average_Monthly_Hours        0
Years_at_Company         15056
Left                     10005
dtype: int64

In [81]:
# Share of missing values per column, as a percentage rounded to one decimal place
missing_counts = df.isna().sum()
missing_percent = (missing_counts / len(df) * 100).round(1)
missing_percent

Employee_ID               0.0
Age                      50.1
Department               13.0
Satisfaction_Level        0.0
Last_Evaluation           0.0
Projects                 50.0
Average_Monthly_Hours     0.0
Years_at_Company         50.0
Left                     33.2
dtype: float64

In [82]:
# Standardise Department labels: lower-case them, trim leading/trailing whitespace,
# then label any still-missing department as 'unknown'
df['Department'] = (
    df['Department']
    .str.lower()
    .str.strip()
    .fillna('unknown')
)

In [83]:
# List the column labels currently in the frame
df.columns

Index(['Employee_ID', 'Age', 'Department', 'Satisfaction_Level',
       'Last_Evaluation', 'Projects', 'Average_Monthly_Hours',
       'Years_at_Company', 'Left'],
      dtype='object')

In [84]:
# Replace missing values in the listed numeric columns with each column's own median.
# NOTE(review): 'Left' is included here; it is later renamed to 'Attrition' and
# described as the binary target, so confirm that imputing labels is intended.
numerical_cols = ['Age', 'Projects', 'Years_at_Company', 'Left']
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)


In [85]:
# Re-count missing entries per column after the fills above
df.isna().sum()

Employee_ID              0
Age                      0
Department               0
Satisfaction_Level       0
Last_Evaluation          0
Projects                 0
Average_Monthly_Hours    0
Years_at_Company         0
Left                     0
dtype: int64

1. For Department, I chose to fill missing values with "unknown" rather than dropping rows. 

   This preserves all employees in the dataset (~13% had missing departments) and allows analysis of how employees with unknown departments compare to others. 
   
   Dropping rows would have removed valuable data and reduced the dataset unnecessarily.

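2. For numerical columns (Age, Projects, Years_at_Company, Left), I fill missing values with the median.

   This is because the median is not affected by outliers, so it preserves the central tendency of the data without introducing bias from extreme values.

   Caveats: about half of Age, Projects and Years_at_Company are missing, so median filling concentrates these columns at a single value and shrinks their spread. Left is the attrition target (≈33% missing); filling it with the median (0 here) assigns a "stayed" label to every employee whose outcome is unknown, so those rows should be treated with caution (or excluded) when modelling attrition.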

In [86]:
# Summary statistics for the numeric columns, used to spot implausible values
df.describe()

Unnamed: 0,Employee_ID,Age,Satisfaction_Level,Last_Evaluation,Projects,Average_Monthly_Hours,Years_at_Company,Left
count,30100.0,30100.0,30100.0,30100.0,30100.0,30100.0,30100.0,30100.0
mean,501459.486844,41.021528,0.500229,0.500954,4.751595,617.731595,5.767542,0.329502
std,289871.790946,7.92598,0.287933,0.290158,1.232629,386.017581,2.043909,0.47004
min,1005.0,22.0,0.0,0.0,2.0,150.0,1.0,0.0
25%,250675.75,41.0,0.25,0.25,5.0,230.0,6.0,0.0
50%,500246.5,41.0,0.5,0.5,5.0,999.0,6.0,0.0
75%,754074.0,41.0,0.75,0.76,5.0,999.0,6.0,1.0
max,999999.0,60.0,1.0,1.0,7.0,999.0,10.0,1.0


- Average_Monthly_Hours has unrealistic values (e.g., 999 hours), which exceed the maximum possible monthly hours (~730). 
    These could be placeholders for missing or invalid data and would distort analysis and model performance.
        
    Fix:  Replace values above 300 hours with 200 hours, a realistic monthly workload baseline (≈45 hours/week × 4.3 weeks/month). 
          This keeps the values within humanly possible limits without introducing artificial averages, preserving data integrity.

- Age (22–60 years): No outliers detected; all values fall within a realistic working-age range.
    

- Satisfaction_Level & Last_Evaluation (0.0–1.0): Values fall within expected normalized limits.
    

- Projects (2–7 projects): Range reflects a reasonable employee workload.

- Years_at_Company (1–10 years): Values fall within a typical tenure range.
    

In [87]:
# Overwrite implausible monthly hours (anything above 300) with the 200-hour baseline.
# This is a replacement rather than a clip: affected rows end up at 200, not at 300.
implausible_hours = df['Average_Monthly_Hours'] > 300
df.loc[implausible_hours, 'Average_Monthly_Hours'] = 200

In [88]:
# Status message confirming the monthly-hours replacement step has run
print("\nOutliers capped using human workload baseline. ")


Outliers capped using human workload baseline. 


In [89]:
# Tally rows that exactly repeat an earlier row (True) against all other rows (False)
df.duplicated(keep='first').value_counts()

False    30000
True       100
Name: count, dtype: int64

In [90]:
# Drop rows that are exact copies of an earlier row, keeping the first occurrence
df = df.drop_duplicates(keep="first")
print("\nDuplicate rows removed .")


Duplicate rows removed .


In [91]:
# Enforce one row per Employee_ID: when an ID repeats, only its first row is kept.
# No validity check is applied -- "first" simply means first in the current row order.
df = df.drop_duplicates(subset="Employee_ID")

In [92]:
# Re-tally fully duplicated rows after both de-duplication steps
df.duplicated(keep='first').value_counts()

False    29524
Name: count, dtype: int64

In [93]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

np.int64(0)

In [94]:
# Number of rows whose Employee_ID already appeared in an earlier row
df['Employee_ID'].duplicated().sum()

np.int64(0)

In [95]:
# Column labels as they stand before renaming
df.columns

Index(['Employee_ID', 'Age', 'Department', 'Satisfaction_Level',
       'Last_Evaluation', 'Projects', 'Average_Monthly_Hours',
       'Years_at_Company', 'Left'],
      dtype='object')

In [96]:
# Switch to CamelCase column names. 'Age' and 'Department' already fit the
# convention, so they need no entry in the mapping.
column_renames = {
    'Employee_ID': 'EmployeeID',
    'Satisfaction_Level': 'SatisfactionScore',
    'Last_Evaluation': 'LastEvaluationScore',
    'Projects': 'NumProjects',
    'Average_Monthly_Hours': 'AvgMonthlyHours',
    'Years_at_Company': 'YearsAtCompany',
    'Left': 'Attrition',
}
df = df.rename(columns=column_renames)

In [97]:
# Inspect the dtype of each column after renaming
df.dtypes

EmployeeID               int64
Age                    float64
Department              object
SatisfactionScore      float64
LastEvaluationScore    float64
NumProjects            float64
AvgMonthlyHours          int64
YearsAtCompany         float64
Attrition              float64
dtype: object

After checking the data types:

- Convert 'Age', 'NumProjects', and 'YearsAtCompany' from float to integer because these values are whole numbers.

- Convert 'Attrition' to integer to clearly represent the binary target.

- Convert 'Department' to categorical to reflect discrete groups and improve memory efficiency.



In [98]:
# Store the whole-number columns with pandas' nullable integer dtype
for whole_number_col in ("Age", "NumProjects", "YearsAtCompany", "Attrition"):
    df[whole_number_col] = df[whole_number_col].astype("Int64")

In [99]:
# Store Department as a categorical column
df = df.astype({"Department": "category"})

In [101]:
# Final sanity report on the cleaned frame.
print("\n---DATA QUALITY REPORT  ---")
# Row/column count after de-duplication
print("Dataset shape:", df.shape)
# True when no EmployeeID value occurs more than once
print("Employee_ID unique?", df["EmployeeID"].is_unique)
# Per-column count of values that are still missing
print("\nRemaining missing values:\n", df.isnull().sum())
# Row count per department label (includes the 'unknown' placeholder)
print("\nDepartment distribution:\n", df["Department"].value_counts())


---DATA QUALITY REPORT  ---
Dataset shape: (29524, 9)
Employee_ID unique? True

Remaining missing values:
 EmployeeID             0
Age                    0
Department             0
SatisfactionScore      0
LastEvaluationScore    0
NumProjects            0
AvgMonthlyHours        0
YearsAtCompany         0
Attrition              0
dtype: int64

Department distribution:
 Department
finance       7314
hr            7269
unknown       3828
sales         3790
operations    3720
it            3603
Name: count, dtype: int64


In [103]:
# Save the cleaned data as CSV, without writing the row index as a column.
# NOTE(review): CSV carries no dtype metadata, so the Int64/category casts made
# earlier will presumably need re-applying when this file is loaded -- confirm.
df.to_csv("hr_cleaned_dataset.csv", index=False)
print("\nCleaned HR dataset saved.")


Cleaned HR dataset saved.
