In [None]:
import pandas as pd
import numpy as np

In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Location of the raw HR attrition export (Colab's /content scratch directory).
DATA_PATH = Path("/content") / "HR_Employee_Attrition_dataset.csv"
# Read the whole CSV into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Quick sanity check of the freshly loaded frame: dimensions first,
# then the first two rows as plain text.
print("Shape:", (len(df.index), len(df.columns)))
print(df.iloc[:2])

Shape: (1470, 35)
   Age Attrition     BusinessTravel  DailyRate Department  DistanceFromHome  \
0   19       Yes  Travel_Frequently        602      Sales                 1   
1   50       Yes  Travel_Frequently        959      Sales                 1   

   Education    EducationField  EmployeeCount  EmployeeNumber  ...  \
0          1  Technical Degree              1             235  ...   
1          4             Other              1            1113  ...   

   RelationshipSatisfaction StandardHours  StockOptionLevel  \
0                         1            80                 0   
1                         4            80                 0   

   TotalWorkingYears  TrainingTimesLastYear WorkLifeBalance  YearsAtCompany  \
0                  1                      5               4               0   
1                  5                      4               3               0   

  YearsInCurrentRole  YearsSinceLastPromotion  YearsWithCurrManager  
0                  0               

In [None]:
# Trim leading/trailing whitespace in every object-typed (text) column.
#
# Missing cells are deliberately left untouched: a blanket `astype(str)` would
# turn NaN/None into the literal strings "nan"/"None", which would make the
# `df.isna()` report further down claim there are no gaps in text columns.
# `where(cond, other)` keeps the original value where `cond` is True (the cell
# is missing) and takes the stripped string everywhere else.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())

In [None]:
# Remove fully duplicated rows (first occurrence is kept) and renumber the
# remaining rows 0..n-1 in the same call.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Data-quality overview: count of null cells for each column ...
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)

# ... and the number of distinct values for the first five columns.
distinct_counts_preview = df.nunique().head()
print("\nUnique values (first few cols):\n", distinct_counts_preview)


Missing values per column:
 Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurr

In [None]:
# Columns to remove before export. Only names that are actually present in the
# frame are dropped, so re-running this cell after the drop is a harmless no-op.
drop_cols = [
    name
    for name in ("EmployeeCount", "Over18", "StandardHours", "EmployeeNumber")
    if name in df.columns
]
df = df.drop(columns=drop_cols)

In [None]:
# The target column is mandatory: fail fast when it is absent.
if "Attrition" not in df.columns:
    raise ValueError("Column 'Attrition' not found in the dataset.")

# Binary target: 1 when the label is "yes" in any letter case, 0 for every
# other value (including anything that is not a recognised label).
df["Attrition_Flag"] = df["Attrition"].str.upper().eq("YES").astype(int)

In [None]:
# Derive a categorical tenure band from YearsAtCompany (skipped when the
# column is absent).
if "YearsAtCompany" in df.columns:
    # pd.cut builds right-closed intervals from consecutive edges:
    #   (-1, 1] -> "<=1y", (1, 3] -> "1-3y", (3, 5] -> "3-5y",
    #   (5, 10] -> "5-10y", (10, inf] -> "10y+"
    # The -1 lower edge puts a tenure of 0 years into the first band.
    # The top edge is infinite rather than a fixed 40: pd.cut yields NaN for
    # values outside the outermost edges, so a capped last bin would silently
    # leave employees with more than 40 years unbucketed.
    df["Tenure_Bucket"] = pd.cut(
        df["YearsAtCompany"],
        bins=[-1, 1, 3, 5, 10, float("inf")],
        labels=["<=1y", "1-3y", "3-5y", "5-10y", "10y+"]
    )

In [None]:
# Export a BI-friendly CSV with the target flag as the right-most column.
bi_out = Path("/content/hr_clean_for_bi.csv")

# Every column in its current order, except the flag, which is appended last.
bi_cols_order = [
    *(col for col in df.columns if col != "Attrition_Flag"),
    "Attrition_Flag",
]

# Work on an independent copy so the export frame is decoupled from `df`.
df_bi = df.loc[:, bi_cols_order].copy()
df_bi.to_csv(bi_out, index=False)  # index is omitted from the file
print(f"\n✅ Saved BI-ready file -> {bi_out}")


✅ Saved BI-ready file -> /content/hr_clean_for_bi.csv
