In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the manager survey CSV into a DataFrame and display it.
manager_survey = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hr-analytics-case-study/manager_survey_data.csv'
)
manager_survey

In [None]:
# Read the employee survey CSV into a DataFrame and display it.
employee_survey = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hr-analytics-case-study/employee_survey_data.csv'
)
employee_survey

In [None]:
# Read the general employee data CSV into a DataFrame and display it.
general_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hr-analytics-case-study/general_data.csv'
)
general_data

In [None]:
# Combine the three tables on their shared EmployeeID key instead of relying on row order:
# an index-based join silently pairs the wrong rows if any file is sorted differently or
# lacks an employee. how='left' keeps every row of general_data, and
# validate='one_to_one' raises if an EmployeeID is duplicated in either table.
general_data = (
    general_data
    .merge(manager_survey, on='EmployeeID', how='left', validate='one_to_one')
    .merge(employee_survey, on='EmployeeID', how='left', validate='one_to_one')
    .drop(columns='EmployeeID')  # the key is only needed for the merge itself
)
general_data

In [None]:
# Print the column dtypes and non-null counts of the combined frame to spot missing values.
general_data.info()

A few columns have missing values. The number of missing values in those columns is small, but because the dataset itself has few observations, the rows containing them will not be removed. Instead, I will fill the missing values with the mean of the column they are missing from.

In [None]:
# Fill the gaps in the columns listed below with each column's own mean.
# Calling fillna(..., inplace=True) on a column selection is chained assignment: under
# pandas copy-on-write it acts on a temporary object and leaves the frame untouched
# (and it emits a warning on the releases leading up to that). A single DataFrame-level
# fillna with a Series of per-column means fills only the named columns and is
# reassigned instead.
cols_with_gaps = ['NumCompaniesWorked', 'TotalWorkingYears', 'EnvironmentSatisfaction',
                  'JobSatisfaction', 'WorkLifeBalance']
general_data = general_data.fillna(general_data[cols_with_gaps].mean())
# Confirm that no missing values remain.
general_data.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts after the missing values were filled.
general_data.info()

In [None]:
# checking unique values in categorical columns
# (value_counts lists each distinct value of the column with its frequency)
general_data['Attrition'].value_counts()

In [None]:
# Distinct values of BusinessTravel and how often each occurs.
general_data['BusinessTravel'].value_counts()

In [None]:
# Distinct values of Department and how often each occurs.
general_data['Department'].value_counts()

In [None]:
# Distinct values of EducationField and how often each occurs.
general_data['EducationField'].value_counts()

In [None]:
# Distinct values of Gender and how often each occurs.
general_data['Gender'].value_counts()

In [None]:
# Distinct values of JobRole and how often each occurs.
general_data['JobRole'].value_counts()

In [None]:
# Distinct values of MaritalStatus and how often each occurs.
general_data['MaritalStatus'].value_counts()

In [None]:
# Distinct values of Over18 and how often each occurs.
general_data['Over18'].value_counts()

In [None]:
# Integer-encode the text columns that the one-hot step does not handle
# (Attrition, Gender and Over18) with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for encoded_col in ('Attrition', 'Gender', 'Over18'):
    # the single encoder instance is re-fitted on each column in turn
    general_data[encoded_col] = le.fit_transform(general_data[encoded_col])
general_data.info()

In [None]:
# One-hot encode the categorical columns that have more than two categories;
# get_dummies replaces each listed column with one indicator column per category.
general_data = pd.get_dummies(
    data=general_data,
    columns=['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus'],
)
general_data.info()

In [None]:
import seaborn as sns
# Bar chart of how many rows fall into each Attrition class.
sns.countplot(data=general_data, x='Attrition')

In [None]:
# Share of rows whose encoded Attrition label equals 1.
len(general_data.query('Attrition == 1')) / len(general_data)

Only 16% of the employees in this dataset left the company, so there is a large class imbalance.

In [None]:
from imblearn.over_sampling import SMOTE

X = general_data.drop('Attrition', axis=1)
y = general_data['Attrition']

# Resample data
X, y = SMOTE(sampling_strategy=0.5, random_state=0).fit_resample(X, y)
sns.countplot(x=y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=20, random_state=0)
X_train

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with default settings, fitted on the training
# split and used to predict the held-out rows.
clf = LogisticRegression().fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric, so the
# value is unchanged, but passing the arguments in the documented order keeps the call
# correct if it is ever swapped for an asymmetric metric such as precision or recall.
acc = accuracy_score(y_test, pred)
acc

# Optimization: Feature Engineering and Preprocessing

## Checking for Correlation between Features and Attrition 

In [None]:
import matplotlib.pyplot as plt
# Bar chart of every column's correlation with Attrition, ordered from highest to lowest.
fig, ax = plt.subplots(figsize=(20, 8))
attrition_corr = general_data.corr()['Attrition'].sort_values(ascending=False)
attrition_corr.plot.bar(ax=ax);


Features like EmployeeCount, Over18 and StandardHours have the same value for every observation, so they cannot help explain whether an employee will leave or not. Hence they must be dropped.

In [None]:
# dropping features with single observation types
cols = ['EmployeeCount', 'Over18', 'StandardHours']
# Rebind both splits to frames without those columns.
X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

X_train.info()

In [None]:
# Refit the default logistic regression on the reduced feature set and score it on the
# held-out rows.
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
pred1 = clf1.predict(X_test)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call safe to reuse with other metrics.
acc1 = accuracy_score(y_test, pred1)
acc1

## Checking for Correlation between features

In [None]:
import matplotlib.pyplot as plt
# Pairwise correlation between the training features, drawn as a heatmap.
plt.figure(figsize=(8, 6), dpi=80)
features_corr = X_train.corr()
sns.heatmap(data=features_corr)

In [None]:
# dropping highly correlated variables
# Keep only the strict upper triangle of the correlation matrix so that each feature pair
# appears once and the diagonal is excluded. The mask is built with the builtin `bool`
# dtype: the `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
upper_triangle = np.triu(np.ones(features_corr.shape, dtype=bool), k=1)
corrdf = features_corr.where(upper_triangle).unstack().reset_index()
corrdf.columns = ['feature1', 'feature2', 'Correlation']
# Discard the masked (NaN) entries, then rank the pairs by rounded absolute correlation.
corrdf = (
    corrdf
    .dropna(subset=['Correlation'])
    .assign(Correlation=lambda pairs: pairs['Correlation'].round(2).abs())
)
matrix = corrdf.sort_values(by='Correlation', ascending=False)
# Pairs whose absolute correlation exceeds 0.6 are treated as highly correlated.
high_corr_mat = matrix[matrix['Correlation'] > 0.6]
high_corr_mat

In [None]:
# Features removed from both splits.
# NOTE(review): presumably one member of each highly correlated pair in high_corr_mat — confirm.
features_index = [
    'Department_Sales', 'TotalWorkingYears', 'YearsWithCurrManager',
    'PercentSalaryHike', 'BusinessTravel_Travel_Frequently',
    'EducationField_Human Resources', 'YearsSinceLastPromotion',
]
X_train, X_test = (split.drop(columns=features_index) for split in (X_train, X_test))

In [None]:
# Display the training features that remain after dropping the columns in features_index.
X_train

In [None]:
# Refit the default logistic regression after removing the columns in features_index and
# score it on the held-out rows.
clf2 = LogisticRegression()
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call safe to reuse with other metrics.
acc2 = accuracy_score(y_test, pred2)
acc2

## Feature Scaling

In [None]:
# Show every column when a frame is displayed. The fully qualified option name is needed:
# on newer pandas the short pattern 'max_columns' also matches 'styler.render.max_columns',
# and set_option raises OptionError when a pattern matches more than one key.
pd.set_option('display.max_columns', None)
X_train

In [None]:
# `normalize` stays imported in case other cells rely on it; scaling now uses MaxAbsScaler.
from sklearn.preprocessing import MaxAbsScaler, normalize

scaler_cols = [ 'Age', 'DistanceFromHome', 'Education','JobLevel', 'MonthlyIncome', 'NumCompaniesWorked', 'YearsAtCompany', 
               'StockOptionLevel', 'TrainingTimesLastYear','PerformanceRating', 'JobInvolvement', 
              'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance']

Scaled_train = X_train.copy()
Scaled_test = X_test.copy()

# Scale each FEATURE by its maximum absolute value. normalize(..., norm='max') works on
# axis=1 by default, i.e. it divides every row by that row's own largest entry, which
# is not feature scaling: whichever column holds the row maximum swamps the others.
# The scaler is fitted on the training split only and then applied unchanged to the
# test split, so no test-set statistics leak into preprocessing.
scaler = MaxAbsScaler().fit(X_train[scaler_cols])
Scaled_train[scaler_cols] = scaler.transform(X_train[scaler_cols])
Scaled_test[scaler_cols] = scaler.transform(X_test[scaler_cols])


Scaled_train

In [None]:
# Refit the default logistic regression on the scaled features and score it on the
# scaled held-out rows.
clf3 = LogisticRegression()
clf3.fit(Scaled_train, y_train)
pred3 = clf3.predict(Scaled_test)
# accuracy_score is declared as (y_true, y_pred); accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call safe to reuse with other metrics.
acc3 = accuracy_score(y_test, pred3)
acc3

# Optimization: Hyperparameter Tuning

In [None]:
# defining model parameters
# Base estimator plus the candidate values that the grid search combines.
model = LogisticRegression(random_state=42)
# NOTE(review): not every solver listed here is expected to accept every penalty below — confirm
# that the failing combinations are meant to be scored via error_score in the search.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1', 'l2']
# Candidate values for LogisticRegression's C parameter, from 1000 down to 0.01.
c_values = [1000, 100, 10, 1.0, 0.1, 0.01]


In [None]:
# defining grid search
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Every combination of solver, penalty and C is evaluated with 10-fold stratified
# cross-validation repeated 3 times; a combination that fails to fit is scored 0
# (error_score=0) instead of aborting the search.
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# DataFrame.append and Series.append were removed in pandas 2.0; pd.concat stacks the
# train and test rows in the same order and keeps their index labels, as append did.
grid_result = grid_search.fit(
    pd.concat([Scaled_train, Scaled_test]),
    pd.concat([y_train, y_test]),
)
grid_result


In [None]:
# summarizing results
# First the best mean cross-validated score and its parameters, then one line per
# evaluated parameter combination: mean score, standard deviation, parameters.
print("Best: {:f} using {}".format(grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
    print("{:f} ({:f}) with: {!r}".format(*row))