In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

I'm going to build a binary classifier to predict if employees will leave a company. The data set I am using was provided in a course I took at Southern New Hampshire University. 

In [2]:
data = pd.read_csv("employee_attrition.csv")
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
data['Attrition'].value_counts()

No     1066
Yes     204
Name: Attrition, dtype: int64

As you can see, we have a mix of numeric and categorical variables. 

The variable I am attempting to predict is "Attrition". I'll convert the "Yes" and "No" values to 1's and 0's, then separate this column from the dataframe. 

In [4]:
target = data['Attrition'].map(dict(Yes=1, No=0))
#I'm also dropping EmployeeCount here, since every value is 1 there's no use in keeping it for modeling
data.drop(columns=['Attrition', 'EmployeeCount'], inplace=True)

Next, I'll select all of the columns with categorical data. These need to be encoded prior to building a model. I'm using sklearn's OneHotEncoder for this. OneHotEncoder will add a new column for each unique value in a categorical column, and every row in the dataframe will be assigned a 0 or 1 value for these new columns to indicate if the categorical label is present. 

In [5]:
categorical_cols = data.select_dtypes('object').columns.tolist()

categorical_cols

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']

In [6]:
for col in categorical_cols:
    enc = OneHotEncoder(handle_unknown='ignore')
    encoded = enc.fit_transform(data[[col]]).toarray()
    feature_names = enc.get_feature_names_out()
    enc_df = pd.DataFrame(encoded,columns=feature_names)
    data = data.join(enc_df)
    
data.drop(columns=categorical_cols, inplace=True)

Next, lets check for null values in the data set

In [7]:
is_nan = data.isnull()
row_has_nan = is_nan.any(axis=1)
rows_with_nan = data[row_has_nan]
rows_with_nan

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes


The data set has no null values. We can move on to splitting the data into train/test sets. 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=2, stratify=target)

I've decided to use a random forest for the binary classification task. First, I'll run sklearn's GridSearchCV to find the best parameters. 

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

parameters = {'criterion': ['gini', 'entropy'],
              'max_features': ['auto', 'sqrt', 'log2'],
              'n_estimators': [100, 150,200]}

grid_search = GridSearchCV(rf, parameters)

grid_search.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 150, 200]})

In [11]:
grid_search.best_params_

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}

Now, I can use those parameters to build the final model. 

In [12]:
clf = RandomForestClassifier(criterion='gini', max_features='auto', n_estimators=150, class_weight='balanced')
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.8713910761154856

The classifier scored 0.87 on the test data. 