In [14]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
import xgboost as xgb

# Attrition training pipeline: load the HR CSV, encode it, split it, scale the
# features, rebalance the training fold with SMOTE, fit an XGBoost classifier
# and persist the model, the scaler and the feature-name list under models/.

# Load data
# The location defaults to the original path but can be overridden with the
# HR_ATTRITION_CSV environment variable so the cell also runs on other machines.
data_path = os.environ.get(
    "HR_ATTRITION_CSV",
    "/Users/shauryadityasingh/Downloads/HR Analytics/WA_Fn-UseC_-HR-Employee-Attrition.csv",
)
df = pd.read_csv(data_path)

# Preprocess
# Encode the target as 1 (Yes) / 0 (No).
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})
# Series.map turns every value missing from the mapping into NaN; fail here
# with a clear message instead of deep inside the split / resampling steps.
if df['Attrition'].isna().any():
    raise ValueError(
        "Column 'Attrition' contains values other than 'Yes'/'No' "
        f"(or is missing) in {data_path!r}"
    )

# Drop these four columns before encoding
# (presumably constant or identifier fields -- confirm against the dataset).
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'])
# One-hot encode the remaining categorical columns, dropping the first level
# of each one.
df = pd.get_dummies(df, drop_first=True)

X = df.drop(columns='Attrition')
y = df['Attrition']

# Stratified 80/20 split so both folds keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The scaler is fitted on the training fold only and then applied to the test
# fold, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample only the training fold; the test fold is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Train model
# random_state is pinned like the split and SMOTE steps so that retraining
# stays reproducible even if sampling hyper-parameters are enabled later.
model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train_res, y_train_res)

# Create models/ folder
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Save model, scaler, and feature names
# The feature-name list records the column order of X, which the model was
# trained on; it is stored alongside the model and the scaler.
artifacts = {
    "xgb_attrition_model.pkl": model,
    "scaler.pkl": scaler,
    "feature_names.pkl": X.columns.tolist(),
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, artifact_name))

print("✅ Model, scaler, and feature names saved inside models/")


✅ Model, scaler, and feature names saved inside models/
