In [10]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import numpy as np

# Read the dataset
data = pd.read_csv('/content/IBM-HR-Employee-Attrition.csv')

# Map the 'Attrition' column to numeric labels
data['Attrition'] = data['Attrition'].map({'Yes': 1, 'No': 0})

# Selecting features for attrition prediction
X = data[['Age', 'DistanceFromHome', 'Education',
          'JobSatisfaction', 'MonthlyIncome', 'PercentSalaryHike',
          'TotalWorkingYears', 'YearsAtCompany', 'YearsSinceLastPromotion',
          'PerformanceRating']]

# Target variable for attrition prediction
y = data['Attrition']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balancing the class distribution using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)

# Data scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_over)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for randomized search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Define the random forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform randomized search for hyperparameter optimization
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=50, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_scaled, y_train_over)

# Get the best model from the randomized search
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f'Best parameters: {random_search.best_params_}')

              precision    recall  f1-score   support

           0       0.88      0.97      0.93       255
           1       0.46      0.15      0.23        39

    accuracy                           0.86       294
   macro avg       0.67      0.56      0.58       294
weighted avg       0.83      0.86      0.83       294

Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}


In [9]:
import joblib

# Save the best model
joblib.dump(best_model, 'best_random_forest_model_ATTRITION.pkl')

['best_random_forest_model_ATTRITION.pkl']