In [None]:
pip install pandas numpy seaborn scikit-learn imbalanced-learn matplotlib


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# Read the employee attrition CSV (path is relative to the working directory)
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer="employee_attrition.csv")

# Preview the top five rows as plain text.
print(df.head(n=5))


In [None]:
# Turn the 'Attrition' target into integer codes. LabelEncoder numbers the
# classes in sorted order, so with "No"/"Yes" values: No → 0, Yes → 1.
label_encoder = LabelEncoder()
attrition_codes = label_encoder.fit_transform(df["Attrition"])
df["Attrition"] = attrition_codes

# Expand the nominal columns into indicator columns; drop_first=True omits the
# first level of each one. The source columns are replaced by the indicators,
# so re-running this cell without reloading df raises a KeyError.
one_hot_columns = ["Job Role", "Department"]
df = pd.get_dummies(data=df, columns=one_hot_columns, drop_first=True)


In [None]:
# Separate the target vector from the feature matrix; the employee identifier
# is excluded from the features together with the target itself.
y = df["Attrition"]
X = df.drop(columns=["Attrition", "Employee ID"])

# Score every feature against the target with the ANOVA F-test and keep the
# 5 highest-scoring columns.
selector = SelectKBest(score_func=f_classif, k=5)
selector.fit(X, y)
X_new = selector.transform(X)

# Report the dimensions of the reduced feature matrix.
print("Shape after SelectKBest:", X_new.shape)


In [None]:
# Standardize the selected features, then reduce them to 2 dimensions using PCA.
# PCA centers its input but does not rescale it, so without standardization the
# components would be driven by whichever feature has the largest raw variance
# rather than by the shared structure of the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("Shape after PCA:", X_pca.shape)


In [None]:
# Report how many rows each class has before any resampling.
counts_before = y.value_counts()
print("Before SMOTE:\n", counts_before)

# Oversample on the PCA-projected features with SMOTE; random_state fixes the
# seed so the synthetic samples are reproducible.
# NOTE(review): this resamples the whole dataset — if a train/test split
# follows, confirm that SMOTE is applied to the training portion only.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pca, y)

# Report the per-class row counts after resampling.
counts_after = y_resampled.value_counts()
print("After SMOTE:\n", counts_after)
