# =========================
# Notebook 1: Data Preprocessing
# =========================

In [2]:
# Step 1: Import libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Step 2: Load the full 10,000-row employee attrition dataset.
# The data directory defaults to the original local folder, but it can be
# redirected by setting the ATTRITION_DATA_DIR environment variable so the
# notebook is not tied to one machine's home directory.
DATA_DIR = Path(
    os.environ.get(
        "ATTRITION_DATA_DIR",
        "/Users/sangsthitapanda/Desktop/L&T PROJECT/archive",
    )
)
RAW_CSV_PATH = DATA_DIR / "employee_attrition_dataset_10000.csv"

df = pd.read_csv(RAW_CSV_PATH)
print("Shape:", df.shape)
df.head()  # last expression: rendered as the cell's output

Shape: (10000, 26)


Unnamed: 0,Employee_ID,Age,Gender,Marital_Status,Department,Job_Role,Job_Level,Monthly_Income,Hourly_Rate,Years_at_Company,...,Overtime,Project_Count,Average_Hours_Worked_Per_Week,Absenteeism,Work_Environment_Satisfaction,Relationship_with_Manager,Job_Involvement,Distance_From_Home,Number_of_Companies_Worked,Attrition
0,1,58,Male,Single,Finance,Manager,5,7332,81,24,...,No,9,48,16,4,1,1,49,3,No
1,2,48,Female,Divorced,HR,Assistant,4,6069,55,18,...,Yes,9,57,10,4,1,1,25,1,No
2,3,34,Female,Married,Marketing,Manager,4,11485,65,6,...,Yes,3,55,1,1,4,3,21,1,Yes
3,4,27,Female,Divorced,HR,Manager,4,18707,28,12,...,No,9,53,2,3,4,1,46,2,No
4,5,40,Male,Married,HR,Analyst,1,16398,92,3,...,No,1,54,11,1,1,1,43,4,No


In [4]:
# Step 3: Inspect the schema, then count missing values per column
df.info()  # prints column dtypes and non-null counts
df.isna().sum()  # last expression: per-column missing-value totals

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Employee_ID                    10000 non-null  int64 
 1   Age                            10000 non-null  int64 
 2   Gender                         10000 non-null  object
 3   Marital_Status                 10000 non-null  object
 4   Department                     10000 non-null  object
 5   Job_Role                       10000 non-null  object
 6   Job_Level                      10000 non-null  int64 
 7   Monthly_Income                 10000 non-null  int64 
 8   Hourly_Rate                    10000 non-null  int64 
 9   Years_at_Company               10000 non-null  int64 
 10  Years_in_Current_Role          10000 non-null  int64 
 11  Years_Since_Last_Promotion     10000 non-null  int64 
 12  Work_Life_Balance              10000 non-null  int64 
 13  Jo

Employee_ID                      0
Age                              0
Gender                           0
Marital_Status                   0
Department                       0
Job_Role                         0
Job_Level                        0
Monthly_Income                   0
Hourly_Rate                      0
Years_at_Company                 0
Years_in_Current_Role            0
Years_Since_Last_Promotion       0
Work_Life_Balance                0
Job_Satisfaction                 0
Performance_Rating               0
Training_Hours_Last_Year         0
Overtime                         0
Project_Count                    0
Average_Hours_Worked_Per_Week    0
Absenteeism                      0
Work_Environment_Satisfaction    0
Relationship_with_Manager        0
Job_Involvement                  0
Distance_From_Home               0
Number_of_Companies_Worked       0
Attrition                        0
dtype: int64

In [5]:

# Step 5: Standardize column names in three passes:
# trim surrounding whitespace, swap spaces for underscores, then lowercase.
trimmed_names = df.columns.str.strip()
underscored_names = trimmed_names.str.replace(" ", "_")
df.columns = underscored_names.str.lower()

In [6]:
# Step 6: Encode target variable Attrition (Yes=1, No=0)
ATTRITION_CODES = {"Yes": 1, "No": 0}

# Series.map turns every value that is missing from the dict into NaN, so
# re-running a bare `.map` on already-encoded 0/1 data would silently wipe the
# target. Only map when the column is not already fully encoded, which makes
# this cell safe to re-run.
if not df["attrition"].isin(list(ATTRITION_CODES.values())).all():
    attrition_encoded = df["attrition"].map(ATTRITION_CODES)
    unmapped_values = df.loc[attrition_encoded.isna(), "attrition"].unique()
    if len(unmapped_values) > 0:
        # Fail loudly instead of passing NaN labels on to later steps.
        raise ValueError(
            f"Unexpected attrition values (expected 'Yes'/'No'): {unmapped_values.tolist()}"
        )
    df["attrition"] = attrition_encoded.astype("int64")

In [8]:
# Step 7: Encode categorical variables
# Every object-dtype column is replaced in place by integer codes from its own
# LabelEncoder; the fitted encoders are kept in `encoders` (column name ->
# encoder) so the codes can be mapped back to labels later.
# NOTE(review): integer codes imply an ordering for nominal features such as
# department or job_role -- confirm this suits the downstream model.
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include=["object"]).columns

# On a re-run every column is already numeric, so `cat_cols` is empty. Starting
# from a fresh dict each time would then throw away the encoders fitted by the
# first run, so reuse the existing dict when there is one.
encoders = globals().get("encoders", {})

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

print("Categorical columns encoded:", list(cat_cols))

Categorical columns encoded: []


In [9]:
# Step 8: Save cleaned dataset
# The output directory defaults to the original local folder, but it can be
# redirected by setting the ATTRITION_DATA_DIR environment variable so the
# notebook is not tied to one machine's home directory.
DATA_DIR = Path(
    os.environ.get(
        "ATTRITION_DATA_DIR",
        "/Users/sangsthitapanda/Desktop/L&T PROJECT/archive",
    )
)
CLEANED_CSV_PATH = DATA_DIR / "employee_attrition_dataset_cleaned.csv"

# Refuse to write a file whose target column lost values during encoding.
assert df["attrition"].notna().all(), "attrition contains missing values"

df.to_csv(CLEANED_CSV_PATH, index=False)  # index=False: do not write the row index
print("✅ Preprocessing done. Cleaned file saved.")
df.head()  # last expression: rendered as the cell's output

✅ Preprocessing done. Cleaned file saved.


Unnamed: 0,employee_id,age,gender,marital_status,department,job_role,job_level,monthly_income,hourly_rate,years_at_company,...,overtime,project_count,average_hours_worked_per_week,absenteeism,work_environment_satisfaction,relationship_with_manager,job_involvement,distance_from_home,number_of_companies_worked,attrition
0,1,58,1,2,0,3,5,7332,81,24,...,0,9,48,16,4,1,1,49,3,0
1,2,48,0,0,1,1,4,6069,55,18,...,1,9,57,10,4,1,1,25,1,0
2,3,34,0,1,3,3,4,11485,65,6,...,1,3,55,1,1,4,3,21,1,1
3,4,27,0,0,1,3,4,18707,28,12,...,0,9,53,2,3,4,1,46,2,0
4,5,40,1,1,1,0,1,16398,92,3,...,0,1,54,11,1,1,1,43,4,0
