In [1]:
#First import necessary modules for the project
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Read the attrition CSV into a DataFrame, then show the column names
# as this cell's output.
data = pd.read_csv(
    "WA_Fn-UseC_-HR-Employee-Attrition.csv",
    sep=",",
)
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [3]:
# Get a feel for the range of monthly income: keep only the attrition flag and
# the income, order the rows by income, and tag every row with the income
# interval it falls into (29 equal-width bins).
attritionAndIncome = (
    data[['Attrition', 'MonthlyIncome']]
    .sort_values(by=['MonthlyIncome'])
    .assign(binning=lambda frame: pd.cut(frame['MonthlyIncome'], bins=29))
)
attritionAndIncome

Unnamed: 0,Attrition,MonthlyIncome,binning
513,Yes,1009,"(990.01, 1663.828]"
727,No,1051,"(990.01, 1663.828]"
764,No,1052,"(990.01, 1663.828]"
1338,Yes,1081,"(990.01, 1663.828]"
1365,Yes,1091,"(990.01, 1663.828]"
...,...,...,...
568,Yes,19859,"(19344.172, 19999.0]"
165,No,19926,"(19344.172, 19999.0]"
851,No,19943,"(19344.172, 19999.0]"
746,No,19973,"(19344.172, 19999.0]"


In [4]:
# Row masks for the two attrition answers. `str.match` is a regex match
# anchored at the start of the string.
isYes = attritionAndIncome["Attrition"].str.match('Yes')
isNo = attritionAndIncome["Attrition"].str.match('No')

OnlyYesIncome = attritionAndIncome[isYes]
OnlyNoIncome = attritionAndIncome[isNo]

# Number of rows per income bin; after reset_index the count column is named 0.
yesIncomeSorted = OnlyYesIncome.groupby(by=['binning']).size().reset_index()
NoIncomeSorted = OnlyNoIncome.groupby(by=['binning']).size().reset_index()


In [5]:
# Put the per-bin 'Yes' and 'No' counts side by side. Both inputs carry a count
# column named 0, which the merge suffixes as "0_x" / "0_y"; rename those to
# readable labels.
countColumnNames = {"0_x": "Yes", "0_y": "No"}
newDataIncome = (
    yesIncomeSorted
    .merge(NoIncomeSorted, on='binning')
    .rename(columns=countColumnNames)
)
newDataIncome

Unnamed: 0,binning,Yes,No
0,"(990.01, 1663.828]",13,12
1,"(1663.828, 2318.655]",33,90
2,"(2318.655, 2973.483]",67,171
3,"(2973.483, 3628.31]",15,88
4,"(3628.31, 4283.138]",15,95
5,"(4283.138, 4937.966]",17,122
6,"(4937.966, 5592.793]",12,113
7,"(5592.793, 6247.621]",11,68
8,"(6247.621, 6902.448]",7,81
9,"(6902.448, 7557.276]",4,29


In [7]:
def getRatioYN(row):
    """Return the 'Yes' count divided by the 'No' count of one row.

    ``row`` is any mapping-like object with 'Yes' and 'No' entries
    (for example a DataFrame row passed in by ``DataFrame.apply``).
    """
    yes_count = row['Yes']
    no_count = row['No']
    return yes_count / no_count
# Evaluate getRatioYN once per row (axis="columns" passes each row to the
# function) and store the result as a new column.
newDataIncome['Yes_No_Ratio'] = newDataIncome.apply(getRatioYN, axis="columns")

In [8]:
def printmd(string):
    """Render ``string`` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
    
# Show the interpretation as Markdown, then list the income bins ordered from
# the lowest to the highest Yes/No ratio.
ratioNote = 'Lower the yes_no_ratio lower the attrition is. So, employee with very lowest salary has the highest attrition'
printmd(ratioNote)
newDataIncome.sort_values(by=['Yes_No_Ratio'])


Lower the yes_no_ratio lower the attrition is. So, employee with very lowest salary has the highest attrition

Unnamed: 0,binning,Yes,No,Yes_No_Ratio
22,"(15415.207, 16070.034]",0,7,0.0
26,"(18034.517, 18689.345]",0,11,0.0
25,"(17379.69, 18034.517]",0,16,0.0
20,"(14105.552, 14760.379]",0,6,0.0
23,"(16070.034, 16724.862]",0,15,0.0
24,"(16724.862, 17379.69]",0,23,0.0
21,"(14760.379, 15415.207]",0,5,0.0
15,"(10831.414, 11486.241]",1,19,0.052632
16,"(11486.241, 12141.069]",1,15,0.066667
27,"(18689.345, 19344.172]",2,27,0.074074


In [None]:
# Line plot of the Yes/No ratio across the income bins (bins are plotted in
# the order they appear in newDataIncome).
fig, frame1 = plt.subplots()
newDataIncome.plot(x="binning", y="Yes_No_Ratio", ax=frame1)

# Hide only the x ticks and their labels (the interval labels). Hiding the
# whole x-axis would also hide the axis label set below, so 'Incomes' would
# never be shown.
frame1.tick_params(axis="x", which="both", bottom=False, labelbottom=False)

frame1.set_xlabel('Incomes')
frame1.set_ylabel('Yes_No_Ratio')
fig.suptitle('Employee attrition vs monthly income', fontsize=14)
plt.show()

In [None]:
# Re-import the CSV, encode Attrition as a number ('Yes' -> 0, 'No' -> 1) and
# split the rows into a training set (80%) and a test set (20%).

data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv", sep = ',')
data['Attrition'] = data['Attrition'].map({'Yes':0 ,'No':1})
# A fixed random_state makes the shuffle, and therefore the split and every
# metric computed from it, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Display the held-out frame returned by train_test_split as this cell's output.
test

In [None]:
# Work on copies so that popping the label column leaves `train` / `test`
# themselves untouched.
train_features, test_features = train.copy(), test.copy()

# Separate the 'Attrition' label from the remaining feature columns.
train_labels = train_features.pop('Attrition')
test_labels = test_features.pop('Attrition')

# Create a symbolic input
# NOTE(review): this rebinds the builtin name `input`, and the tensor does not
# appear to be used by the cells visible here - confirm before removing.
input = tf.keras.Input(shape=(), dtype=tf.float32)


In [None]:
# One symbolic Keras input per feature column: object-typed (string) columns
# become tf.string inputs, every other column is fed as tf.float32.
inputs = {}

for name, column in train_features.items():
    is_text = column.dtype == object
    inputs[name] = tf.keras.Input(
        shape=(1,),
        name=name,
        dtype=tf.string if is_text else tf.float32,
    )

inputs

In [None]:
# Select the float32 inputs, i.e. the numeric feature columns.
numeric_inputs = {
    name: tensor
    for name, tensor in inputs.items()
    if tensor.dtype == tf.float32
}

# Concatenate them into a single tensor and normalize it with a layer adapted
# to the same columns of the training frame.
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
norm.adapt(np.array(train[list(numeric_inputs)]))
all_numeric_inputs = norm(x)

all_numeric_inputs

In [None]:
# Start the list of preprocessed tensors with the normalized numeric block.
preprocessed_inputs = [all_numeric_inputs]

In [None]:
# For every non-float (string) input: map each string to an integer index via
# a vocabulary built from the training column, one-hot encode that index and
# append the result to the list of preprocessed tensors.
for name, feature_input in inputs.items():
    if feature_input.dtype != tf.float32:
        lookup = preprocessing.StringLookup(vocabulary=np.unique(train_features[name]))
        one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
        preprocessed_inputs.append(one_hot(lookup(feature_input)))

In [None]:
# Join all preprocessed tensors into one feature tensor and wrap the whole
# preprocessing graph in a Keras model that maps raw inputs to that tensor.
concat_layer = layers.Concatenate()
preprocessed_inputs_cat = concat_layer(preprocessed_inputs)
train_preprocessing = tf.keras.Model(inputs=inputs, outputs=preprocessed_inputs_cat)

In [None]:
def _as_array_dict(features):
    """Map every column name of ``features`` to a NumPy array of its values."""
    return {name: np.array(value) for name, value in features.items()}


train_features_dict = _as_array_dict(train_features)
test_features_dict = _as_array_dict(test_features)

# Run the preprocessing model on the first training row only, as a quick check.
features_dict = {name: values[:1] for name, values in train_features_dict.items()}
train_preprocessing(features_dict)

In [None]:
def build_data_model(preprocessing_head, inputs):
    """Build and compile the attrition classifier.

    Args:
        preprocessing_head: Keras model that turns the raw inputs into a
            single preprocessed feature tensor.
        inputs: dict of symbolic Keras inputs, keyed by feature name.

    Returns:
        A compiled ``tf.keras.Model`` mapping ``inputs`` to one logit per row.
    """
    # NOTE(review): neither Dense layer specifies an activation, so the body is
    # a purely linear map - confirm whether a non-linearity was intended.
    body = tf.keras.Sequential([
        layers.Dense(64),
        layers.Dense(1)
    ])
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)
    # from_logits=True: the final Dense(1) output is a raw score, not a probability.
    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
    return model

# The builder has its own name so that binding the model to `data_model` does
# not overwrite the function; later cells keep using `data_model` as the model.
data_model = build_data_model(train_preprocessing, inputs)

In [None]:
# Train on the training features/labels for 10 passes over the data.
data_model.fit(train_features_dict, train_labels, epochs=10)

In [None]:
# Score the trained model on the held-out split; evaluate() yields the loss
# followed by the compiled metric (accuracy).
print("Evaluate on test data")
test_metrics = data_model.evaluate(test_features_dict, test_labels)
loss, accuracy = test_metrics
print("test loss :", loss)
print("test accuracy: ", accuracy)

In [None]:
# The model outputs raw logits (it is compiled with from_logits=True), so a
# positive number denotes a prediction of label 1 and a negative number a
# prediction of label 0. With the mapping used when the CSV was loaded
# ('Yes' -> 0, 'No' -> 1), positive means Attrition == 'No'.
print("Predict on test data")
x = data_model.predict(test_features_dict)
# Show predictions 40..69 so they can be compared with the same slice of test_labels.
x[40:70]

In [None]:
# True labels at positions 40..69, matching the slice of predictions shown above.
test_labels.iloc[40:70]