## 4. Preprocessing

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
#df = pd.read_csv("/Users/ritahorta/Downloads/HR_Attrition_Dataset.csv")
df = pd.read_csv("C:/Users/P058886/Downloads/HR_Attrition_Dataset.csv")

In [6]:
print(df.isnull().sum())

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [7]:
print(df['Attrition'].unique())

['Yes' 'No']


In [8]:
#Check for duplicates
df = df.drop_duplicates()

In [None]:
num_cols = df.select_dtypes(include='number').columns

# Save initial size before filtering
initial_shape = df.shape

# Compute IQR, Q1, Q3 for numeric columns
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1

# Filter out outliers
df_filtered = df[~((df[num_cols] < (Q1 - 3 * IQR)) | (df[num_cols] > (Q3 + 3 * IQR))).any(axis=1)]

# Compare shapes
print("Before filtering:", initial_shape)
print("After filtering: ", df_filtered.shape)
print("Rows removed:", initial_shape[0] - df_filtered.shape[0])

Before filtering: (1470, 35)
After filtering:  (1206, 35)
Rows removed: 264


In [15]:
df = df_filtered.copy()

In [16]:
# Converting Attrition into numeric form for modeling

df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
print(df['Attrition'].unique())

[1 0]


In [17]:
# Seperate features types

num_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('Attrition')
cat_cols = df.select_dtypes(include=['object', 'category']).columns

In [18]:
# Define features and target
X = df.drop('Attrition', axis=1)   # all columns except target
y = df['Attrition']                # target column

In [19]:
# Check numeric columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Check for NaN
print("NaNs per numeric column:\n", X[num_cols].isnull().sum())

# Check for infinite values
print("Infs per numeric column:\n", np.isinf(X[num_cols]).sum())


NaNs per numeric column:
 Age                         0
DailyRate                   0
DistanceFromHome            0
Education                   0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobSatisfaction             0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
Infs per numeric column:
 Age                         0
DailyRate                   0
DistanceFromHome            0
Education                   0
EmployeeCount               0
Emplo

In [20]:
#Check Nans for categorical columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns
print("NaNs per categorical column:\n", X[cat_cols].isnull().sum())

NaNs per categorical column:
 BusinessTravel    0
Department        0
EducationField    0
Gender            0
JobRole           0
MaritalStatus     0
Over18            0
OverTime          0
dtype: int64


In [21]:
# Define the preprocessing steps for numerical and categorical features
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_cols),
        ('cat', encoder, cat_cols)
    ]
)


In [22]:
# Drop irrelevant columns
X = X.drop(columns=['EmployeeNumber', 'StandardHours', 'EmployeeCount', 'Over18'])
