# =========================
# Notebook 1: Data Preprocessing
# =========================

In [1]:
# Step 1: Import libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Step 2: Load dataset (use 1000 rows version first for testing)
# The data folder defaults to the original local location, but it can be
# redirected by setting the ATTRITION_DATA_DIR environment variable so the
# notebook runs on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "ATTRITION_DATA_DIR",
        "/Users/sangsthitapanda/Desktop/L&T PROJECT/archive",
    )
)
RAW_CSV = DATA_DIR / "employee_attrition_dataset.csv"

df = pd.read_csv(RAW_CSV)
print("Shape:", df.shape)
df.head()

Shape: (1000, 26)


Unnamed: 0,Employee_ID,Age,Gender,Marital_Status,Department,Job_Role,Job_Level,Monthly_Income,Hourly_Rate,Years_at_Company,...,Overtime,Project_Count,Average_Hours_Worked_Per_Week,Absenteeism,Work_Environment_Satisfaction,Relationship_with_Manager,Job_Involvement,Distance_From_Home,Number_of_Companies_Worked,Attrition
0,1,58,Female,Married,IT,Manager,1,15488,28,15,...,No,6,54,17,4,4,4,20,3,No
1,2,48,Female,Married,Sales,Assistant,5,13079,28,6,...,Yes,2,45,1,4,1,2,25,2,No
2,3,34,Male,Married,Marketing,Assistant,1,13744,24,24,...,Yes,6,34,2,3,4,4,45,3,No
3,4,27,Female,Divorced,Marketing,Manager,1,6809,26,10,...,No,9,48,18,2,3,1,35,3,No
4,5,40,Male,Divorced,Marketing,Executive,1,10206,52,29,...,No,3,33,0,4,1,3,44,3,No


In [3]:
# Step 3: Check info and missing values
# Schema overview: info() prints each column's dtype and non-null count.
df.info()

# Per-column count of missing entries; the bare last expression makes it the
# cell's displayed output.
missing_per_column = df.isna().sum()
missing_per_column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Employee_ID                    1000 non-null   int64 
 1   Age                            1000 non-null   int64 
 2   Gender                         1000 non-null   object
 3   Marital_Status                 1000 non-null   object
 4   Department                     1000 non-null   object
 5   Job_Role                       1000 non-null   object
 6   Job_Level                      1000 non-null   int64 
 7   Monthly_Income                 1000 non-null   int64 
 8   Hourly_Rate                    1000 non-null   int64 
 9   Years_at_Company               1000 non-null   int64 
 10  Years_in_Current_Role          1000 non-null   int64 
 11  Years_Since_Last_Promotion     1000 non-null   int64 
 12  Work_Life_Balance              1000 non-null   int64 
 13  Job_

Employee_ID                      0
Age                              0
Gender                           0
Marital_Status                   0
Department                       0
Job_Role                         0
Job_Level                        0
Monthly_Income                   0
Hourly_Rate                      0
Years_at_Company                 0
Years_in_Current_Role            0
Years_Since_Last_Promotion       0
Work_Life_Balance                0
Job_Satisfaction                 0
Performance_Rating               0
Training_Hours_Last_Year         0
Overtime                         0
Project_Count                    0
Average_Hours_Worked_Per_Week    0
Absenteeism                      0
Work_Environment_Satisfaction    0
Relationship_with_Manager        0
Job_Involvement                  0
Distance_From_Home               0
Number_of_Companies_Worked       0
Attrition                        0
dtype: int64

In [4]:

# Step 5: Standardize column names (remove spaces, lowercase)
# Applied in three explicit passes so each transformation is easy to inspect:
#   1. trim leading/trailing whitespace
#   2. turn inner spaces into underscores
#   3. lowercase everything
trimmed_names = df.columns.str.strip()
underscored_names = trimmed_names.str.replace(" ", "_")
df.columns = underscored_names.str.lower()

In [6]:
# Step 6: Encode target variable Attrition (Yes=1, No=0)
attrition_codes = {"Yes": 1, "No": 0}
attrition_raw = df["attrition"]

# Re-running this cell once the column already holds 0/1 must be a no-op:
# Series.map would find no matching keys and turn every value into NaN.
if not attrition_raw.isin(list(attrition_codes.values())).all():
    # Any label other than Yes/No would silently become NaN in map();
    # fail loudly instead so a corrupted target never reaches the model.
    unexpected = set(attrition_raw.dropna().unique()) - set(attrition_codes)
    if unexpected:
        raise ValueError(
            f"Unexpected attrition labels: {sorted(unexpected, key=str)}"
        )
    df["attrition"] = attrition_raw.map(attrition_codes)

In [9]:
# Step 7: Encode categorical variables
from sklearn.preprocessing import LabelEncoder

# Every column still stored with object dtype is treated as categorical.
cat_cols = df.select_dtypes(include=["object"]).columns

# One dedicated encoder per column, kept in `encoders` so the fitted
# label <-> code mapping stays available after the column is overwritten.
encoders = {col: LabelEncoder() for col in cat_cols}

# NOTE(review): the integer codes give nominal columns (e.g. department,
# job_role) an arbitrary numeric order - confirm the downstream model is not
# order-sensitive, or consider one-hot encoding instead.
for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

print("Categorical columns encoded:", list(cat_cols))

Categorical columns encoded: ['gender', 'marital_status', 'department', 'job_role', 'overtime']


In [None]:
# Step 8: Save cleaned dataset
# The output folder defaults to the original local location and can be
# overridden with the ATTRITION_DATA_DIR environment variable, so the notebook
# does not depend on one user's home directory.
CLEANED_CSV = Path(
    os.environ.get(
        "ATTRITION_DATA_DIR",
        "/Users/sangsthitapanda/Desktop/L&T PROJECT/archive",
    )
) / "employee_attrition_dataset_cleaned.csv"

# index=False: do not write the DataFrame's row index as a column.
df.to_csv(CLEANED_CSV, index=False)
print("✅ Preprocessing done. Cleaned file saved.")
df.head()

✅ Preprocessing done. Cleaned file saved.


Unnamed: 0,employee_id,age,gender,marital_status,department,job_role,job_level,monthly_income,hourly_rate,years_at_company,...,overtime,project_count,average_hours_worked_per_week,absenteeism,work_environment_satisfaction,relationship_with_manager,job_involvement,distance_from_home,number_of_companies_worked,attrition
0,1,58,0,1,2,3,1,15488,28,15,...,0,6,54,17,4,4,4,20,3,0
1,2,48,0,1,4,1,5,13079,28,6,...,1,2,45,1,4,1,2,25,2,0
2,3,34,1,1,3,1,1,13744,24,24,...,1,6,34,2,3,4,4,45,3,0
3,4,27,0,0,3,3,1,6809,26,10,...,0,9,48,18,2,3,1,35,3,0
4,5,40,1,0,3,2,1,10206,52,29,...,0,3,33,0,4,1,3,44,3,0
