In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

# Load the HR employee-attrition CSV straight from the project's GitHub repository.
# The URL deliberately carries no `?token=...` query parameter: an access token must
# never be committed in a notebook (it is a credential), and such tokens appear to be
# short-lived, so a hardcoded one would break Restart & Run All anyway.
# If the repository is private, supply credentials from the environment rather than
# pasting them into the URL.
url = "https://raw.githubusercontent.com/sairaawahid/Employee-Attrition-Predictor-for-HR-Analytics/refs/heads/main/data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(url)

# Quick sanity check on the size of what was loaded.
print(f"✅ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")


✅ Data loaded: 1470 rows, 35 columns


In [3]:
# Tally the null entries in every column and print the tallies under a heading.
missing_per_column = df.isnull().sum()
print("\n🧼 Missing Values:\n", missing_per_column)


🧼 Missing Values:
 Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole  

In [4]:
# Columns removed before modelling.
# NOTE(review): presumably an identifier plus constant-valued columns that carry no
# signal — confirm against the raw data.
columns_to_drop = ['EmployeeNumber', 'Over18', 'EmployeeCount', 'StandardHours']

# Binary encoding for the target label.
attrition_codes = {'Yes': 1, 'No': 0}

# Build the working frame without touching `df`: `drop` returns a new DataFrame,
# so the raw data stays available for re-running this cell.
df_processed = df.drop(columns=columns_to_drop)
df_processed = df_processed.assign(
    Attrition=df_processed['Attrition'].map(attrition_codes)
)

# Whatever is still object-typed at this point is a categorical feature.
categorical_cols = df_processed.select_dtypes(include='object').columns
print("🔤 Categorical columns:\n", categorical_cols)

# One-hot encode the categoricals; drop_first removes one level per feature to
# avoid perfectly collinear dummy columns.
df_processed = pd.get_dummies(df_processed, drop_first=True)

print("✅ Data after encoding:", df_processed.shape)


🔤 Categorical columns:
 Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime'],
      dtype='object')
✅ Data after encoding: (1470, 45)


In [5]:
# Identify numeric features (int64/float64 columns), leaving out the target.
numeric_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
numeric_cols = [col for col in numeric_cols if col != 'Attrition']  # exclude target

# Fit the scaler on the TRAINING rows only. Fitting on the full frame would let the
# test rows influence the mean/std used to scale the training features (data leakage)
# and make the held-out evaluation optimistic.
#
# The training rows are found by running the same stratified 80/20 split
# (test_size=0.2, random_state=42, stratified on 'Attrition') that the notebook
# performs on X/y afterwards. The partition depends only on the number of rows, the
# stratification labels and the seed, so both calls select the same rows.
# Keep these parameters in sync with the later train_test_split call.
row_positions = np.arange(len(df_processed))
train_positions, _ = train_test_split(
    row_positions,
    test_size=0.2,
    stratify=df_processed['Attrition'],
    random_state=42,
)

scaler = StandardScaler()
scaler.fit(df_processed.iloc[train_positions][numeric_cols])

# Apply the train-derived statistics to every row (train and test alike).
df_processed[numeric_cols] = scaler.transform(df_processed[numeric_cols])

print("✅ Numerical features scaled.")


✅ Numerical features scaled.


In [6]:
# Separate the 'Attrition' label from the predictor columns.
target_name = 'Attrition'
y = df_processed[target_name]
X = df_processed.drop(columns=[target_name])

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio the
# same in both parts, and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"✅ Train shape: {X_train.shape}, Test shape: {X_test.shape}")


✅ Train shape: (1176, 44), Test shape: (294, 44)


In [7]:
def _print_class_balance(heading, labels):
    """Print `heading`, then the relative frequency of each class in `labels`."""
    print(heading)
    print(pd.Series(labels).value_counts(normalize=True))


# Class proportions in the training labels before any resampling.
_print_class_balance("Before SMOTE:", y_train)

# Oversample with SMOTE on the training split only; the test split is not touched.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class proportions after resampling.
_print_class_balance("After SMOTE:", y_train_res)


Before SMOTE:
Attrition
0    0.838435
1    0.161565
Name: proportion, dtype: float64
After SMOTE:
Attrition
0    0.5
1    0.5
Name: proportion, dtype: float64


In [8]:
# Persist the model-ready splits as CSV files in the working directory.
# The training files hold the SMOTE-resampled data; the test files hold the
# untouched hold-out split. Row indices are not written.
outputs = {
    'X_train.csv': X_train_res,
    'X_test.csv': X_test,
    'y_train.csv': y_train_res,
    'y_test.csv': y_test,
}
for filename, data in outputs.items():
    data.to_csv(filename, index=False)

print("✅ Preprocessed datasets saved.")


✅ Preprocessed datasets saved.
