In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Hello! This is a test dataset from the wonderful people at IBM, and it is the first solo project I have completed here on Kaggle. The goal is to explore what drives attrition among employees, i.e. what could cause them to leave. Let's dive in.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Bringing in all of our necessary packages!

In [None]:
# Read the IBM HR attrition CSV from the Kaggle input directory, then preview the first rows.
csv_path = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(csv_path)

data.head()

Reading in our data and taking our first look at it.

In [None]:
# Print a concise summary of the frame: column names, non-null counts, and dtypes.
data.info()

Looking at the types for each column.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Quick stats for the columns where it applies.

In [None]:
# Count the missing (NaN / None) entries in each column.
missing_data = data.isna().sum()
missing_data

Making sure there are no null or NA values.

Beginning the EDA. I want to see if there are any obvious patterns that could tell me if an employee would leave or not.

In [None]:
# Attrition counts split by gender. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.countplot(x='Gender', hue='Attrition', data=data)

In [None]:
# Attrition counts split by marital status. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.countplot(x='MaritalStatus', hue='Attrition', data=data)

In [None]:
# Attrition counts by number of companies previously worked at. The column is passed
# as `x=` because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='NumCompaniesWorked', hue='Attrition', data=data)

In [None]:
# Attrition counts by stock option level. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.countplot(x='StockOptionLevel', hue='Attrition', data=data)

In [None]:
# Attrition counts by commuting distance, on a large canvas because this
# column has many distinct values. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(12,12))
sns.countplot(x='DistanceFromHome', hue='Attrition', data=data)
plt.xticks(rotation=45)  # tilt tick labels so they do not overlap
plt.show()

In [None]:
# Scatter of years at the company against job satisfaction, coloured by attrition.
sns.relplot(data=data, x="YearsAtCompany", y="JobSatisfaction", hue="Attrition")

In [None]:
# Attrition counts by the number of years since the last promotion.
sns.countplot(data=data, x="YearsSinceLastPromotion", hue="Attrition")

In [None]:
# Attrition counts by field of education. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.countplot(x='EducationField', hue='Attrition', data=data)
plt.xticks(rotation=30)  # tilt the category names so they do not overlap
plt.show()

In [None]:
# Attrition counts by percent salary hike. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.countplot(x='PercentSalaryHike', hue='Attrition', data=data)

In [None]:
# Checking for collinearity by using a correlation matrix.
# The frame still contains text columns at this point, and pandas >= 2.0 raises
# on them instead of silently skipping them, so only numeric columns are correlated.
correlation_heatmap = (
    data.drop(columns=['Attrition'])
    .select_dtypes(include='number')
    .corr()
)
plt.figure(figsize=(12,12))
sns.heatmap(correlation_heatmap,cmap='Blues')
plt.show()

In [None]:
# Class balance of the target: how many rows fall under each Attrition value.
data.Attrition.value_counts()

I want to see how the target, in this case Attrition, is split. It looks like we have a lot more people who are not leaving, which is good for this made-up company, but it means the classes are imbalanced, so the model could end up biased toward predicting that employees stay.

In [None]:
# Integer code for every value of each text-valued column, keyed by column name.
CATEGORY_CODES = {
    'Attrition': {'Yes': 1, 'No': 0},
    'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
    'Department': {'Sales': 0, 'Research & Development': 1, 'Human Resources': 2},
    'Gender': {'Female': 0, 'Male': 1},
    'OverTime': {'No': 0, 'Yes': 1},
    'EducationField': {'Life Sciences': 0, 'Medical': 1, 'Marketing': 2,
                       'Technical Degree': 3, 'Human Resources': 4, 'Other': 5},
    'JobRole': {'Sales Executive': 0, 'Research Scientist': 1, 'Laboratory Technician': 2,
                'Manufacturing Director': 3, 'Healthcare Representative': 4, 'Manager': 5,
                'Sales Representative': 6, 'Research Director': 7, 'Human Resources': 8},
    'MaritalStatus': {'Single': 0, 'Married': 1, 'Divorced': 2},
}

# Each encoded column is assigned back onto the frame explicitly. Calling
# `replace(..., inplace=True)` on `data.<column>` would act on a temporary Series
# under pandas copy-on-write and leave `data` unchanged.
# Values that are already codes pass through untouched, so re-running the cell is
# safe. The integer cast fails loudly if a category is missing from the mapping.
for column, codes in CATEGORY_CODES.items():
    data[column] = (
        data[column]
        .map(lambda value, codes=codes: codes.get(value, value))
        .astype('int64')
    )

In [None]:
# Names of the numeric columns. The spelling must match the DataFrame's column
# names exactly (column lookups are case-sensitive), e.g. 'YearsAtCompany'.
num_cols = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'HourlyRate',
           'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
           'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
           'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'WorkLifeBalance',
           'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
num_cols

In [None]:
# Names of the categorical columns, spelled exactly as in the DataFrame
# ('MaritalStatus', not 'MartialStatus').
cat_cols = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
           'JobRole', 'MaritalStatus', 'OverTime']
cat_cols

I want to organize the columns to see which are categorical and which are numeric. I encoded the text categories as integers to make them easy for our model to use.

In [None]:
# Remove four columns that are not used as features, then one-hot encode
# whatever text or categorical columns remain.
unused_columns = ['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18']
df1 = data.drop(columns=unused_columns)
df2 = pd.get_dummies(df1)
df2.head()

In [None]:
# Label vector is the Attrition column; feature matrix is every other column.
y = df2['Attrition']
X = df2.drop('Attrition', axis=1)

After our data has been changed to my satisfaction, I am splitting the data one last time into what will become our test and train sets. The X is all of the variables that I want to use to see how they affect attrition rate within the company. The y is the dependent variable I want to observe from the model.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=137,
)

Splitting into testing and training. Random state for the sake of reproducibility.

In [None]:
# Random-forest baseline. The forest is seeded so that repeated runs grow the
# same trees and report the same accuracy and feature importances.
rf = RandomForestClassifier(random_state=137)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
# Share of held-out rows whose predicted label matches the true label.
print("Accuracy Score: {}".format(accuracy_score(y_test, rf_predictions)))


The accuracy score! Not bad for the model.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 50 estimators of depth 4, seeded for repeatability.
GBCmodel = GradientBoostingClassifier(random_state=137, max_depth=4, n_estimators=50)
GBCmodel.fit(X_train, y_train)

# Mean accuracy on the training rows and on the held-out rows.
train_score = GBCmodel.score(X_train, y_train)
test_score = GBCmodel.score(X_test, y_test)
print('GBCmodel Training Score is : ', train_score)
print('GBCmodel Test Score is : ', test_score)

# Predicted labels for the held-out rows.
y_pred = GBCmodel.predict(X_test)

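Using gradient boosting, another ensemble of decision trees. Unlike a random forest, which averages many independently grown trees, gradient boosting builds its trees one after another, each one correcting the errors of the trees before it.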

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns are the predicted labels.
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

# `annot`/`fmt` write the integer count into each cell and `cmap` picks the
# palette. `color` is not a heatmap option and `center` expects a number, so
# neither is passed.
sns.heatmap(CM, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

Checking our confusion matrix. As we can see, our model successfully classified people who did not leave the company.

In [None]:
# Importance score of each feature in the fitted random forest, one value per column of X.
rf.feature_importances_

In [None]:
from sklearn.inspection import permutation_importance  # imported here but not called in this cell

# Feature names, in the same order as the importance scores.
col_name = X.columns.tolist()

# One horizontal bar per feature, with length equal to its random-forest importance.
plt.figure(figsize=(12, 12))
plt.barh(col_name, rf.feature_importances_)
plt.show()

Looking at feature importance. This can help identify in the future what might cause a person to leave the company.