In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
df.drop(columns=['Over18','EmployeeCount','StandardHours','EmployeeNumber','MonthlyIncome','YearsInCurrentRole','YearsWithCurrManager'],axis =1)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,Yes,11,3,1,0,8,0,1,6,0
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,No,23,4,4,1,10,3,3,10,1
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,Yes,15,3,2,0,7,3,3,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,Yes,11,3,3,0,8,3,3,8,3
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,No,12,3,4,1,6,3,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,No,17,3,3,1,17,3,3,5,0
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,No,15,3,1,1,9,5,3,7,1
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,Yes,20,4,2,1,6,0,3,6,0
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,No,14,3,4,0,17,3,2,9,0


In [None]:
travel_map={
    'Non-Travel':0,
    'Travel_Rarely':1,
    'Travel_Frequently':2
}
df['BusinessTravel']=df['BusinessTravel'].map(travel_map)
df['Attrition'] = df['Attrition'].map({'No':0,'Yes':1})
df['OverTime'] = df['OverTime'].map({'No':0,'Yes':1})
df['Gender'] = df['Gender'].map({'Female':0,'Male':1})


In [None]:
le=LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])


In [None]:
X = df.drop(columns = 'Attrition',axis = 1)
y = df['Attrition']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state = 42)


In [None]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Fit the Random Forest model
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the model
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))


Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       255
           1       0.60      0.08      0.14        39

    accuracy                           0.87       294
   macro avg       0.74      0.53      0.53       294
weighted avg       0.84      0.87      0.82       294

Confusion Matrix (Random Forest):
[[253   2]
 [ 36   3]]


In [22]:
parameters = [ {'n_estimators':[10, 50, 100, 200], 'max_depth':[None, 10, 20, 30], 'min_samples_split':[2, 5, 10]
              ,'criterion':['gini','entropy'],'max_features':['sqrt','log2']}]

grid_search = GridSearchCV(estimator = rf_classifier,
                           param_grid = parameters,
                           cv = 5,
                           scoring = 'recall',
                           n_jobs = -1,
                           verbose=2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [23]:
print('GridSearch CV best score : {:.4f}\n\n'.format(grid_search.best_score_))
# print parameters that give the best results
print('Parameters that give the best results :','\n\n', (grid_search.best_params_))
# print estimator that was chosen by the GridSearch
print('\n\nEstimator that was chosen by the search :','\n\n', (grid_search.best_estimator_))


GridSearch CV best score : 0.2624


Parameters that give the best results : 

 {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 10}


Estimator that was chosen by the search : 

 RandomForestClassifier(min_samples_split=5, n_estimators=10, random_state=42)


In [24]:
y_pred_rf = grid_search.predict(X_test)

# Evaluate the model
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       255
           1       0.43      0.15      0.23        39

    accuracy                           0.86       294
   macro avg       0.66      0.56      0.57       294
weighted avg       0.82      0.86      0.83       294

Confusion Matrix (Random Forest):
[[247   8]
 [ 33   6]]
