In [22]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from patsy import dmatrices
from sklearn.metrics import classification_report

# Label encodings for the two binary text columns.
# Note: Series.map turns any label that is not a key here into NaN.
attrition_mapping = {'Yes': 1, 'No': 0}
gender_mapping = {'Male': 1, 'Female': 0}

# Load the HR dataset and clean it in a single chain so that no intermediate
# frame is mutated in place:
#   1. read the CSV,
#   2. drop every row that contains a missing value,
#   3. encode 'Attrition' and 'Gender' numerically,
#   4. discard columns that are not variables of interest.
HR_df = (
    pd.read_csv(r'/Users/sanjjj/Desktop/Course/Data Science Programming/Project/data/HRXYZ data.csv')
    .dropna()
    .assign(
        Attrition=lambda frame: frame['Attrition'].map(attrition_mapping),
        Gender=lambda frame: frame['Gender'].map(gender_mapping),
    )
    .drop(columns=['Over18', 'EmployeeCount', 'StandardHours', 'EmployeeID'])
)

# Right-hand-side terms of the model formula, one entry per predictor column.
predictor_terms = [
    'Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
    'Gender', 'JobLevel', 'JobRole', 'MaritalStatus',
    'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'JobSatisfaction', 'WorkLifeBalance',
]

# patsy formula: Attrition is the response; the leading "0 +" asks patsy to
# omit the intercept column from the design matrix.
formula = 'Attrition ~ 0 + ' + ' + '.join(predictor_terms)

# Build the response (Y) and feature (X) design matrices as DataFrames,
# then pull the target out as a plain 1-D array for scikit-learn.
Y, X = dmatrices(formula, HR_df, return_type='dataframe')
y = Y['Attrition'].to_numpy()

# Hold out 40% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Hyper-parameter grid to search: every combination of forest size and
# maximum tree depth below is evaluated (5 x 5 = 25 candidates).
param_grid = {
    'n_estimators': [10, 20, 50, 75, 100],
    'max_depth': [5, 10, 15, 20, 25],
}

# Base Random Forest classifier; the fixed seed makes every fit reproducible.
rf_classifier = RandomForestClassifier(random_state=66)

# Exhaustive grid search scored by accuracy under 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Best hyper-parameters and the corresponding model.
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the whole of (X_train, y_train) with the best parameters.
# Fitting it again here would only repeat that work and, with the fixed
# random_state, rebuild an identical forest -- so no extra .fit() call is made.
best_params = grid_search.best_params_
best_rf_classifier = grid_search.best_estimator_

# Predict with the tuned model on both splits: the training rows it was
# fitted on, and the held-out test rows.
y_pred_train = best_rf_classifier.predict(X_train)
y_pred_test = best_rf_classifier.predict(X_test)

# Accuracy on each split; comparing the two shows how much the model
# overfits the training data.
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

# Report the results.
print(f"Best model accuracy on training set: {accuracy_train}")
print(f"Best model accuracy on test set: {accuracy_test}")
print("Best parameters:", best_params)

# Per-class precision, recall and F1 on the test set.
print(classification_report(y_test, y_pred_test))


Best model accuracy on training set: 1.0
Best model accuracy on test set: 0.9790697674418605
Best parameters: {'max_depth': 20, 'n_estimators': 100}
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      1449
         1.0       0.99      0.87      0.93       271

    accuracy                           0.98      1720
   macro avg       0.98      0.94      0.96      1720
weighted avg       0.98      0.98      0.98      1720

