In [1]:
#First import necessary modules for the project
import pandas as pd
import numpy as np

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Load the employee-attrition CSV, encode the 'Attrition' target as integers
# (Yes -> 0, No -> 1, so class 1 means the employee did NOT leave), and hold
# out 20% of the rows as a test set.

data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv", sep = ',')
data['Attrition'] = data['Attrition'].map({'Yes':0 ,'No':1})

# Series.map turns any value that is missing from the dict into NaN; fail early
# instead of silently training on NaN labels.
assert data['Attrition'].notna().all(), "unexpected value in 'Attrition' column"

# A fixed random_state makes the split (and every result derived from it)
# reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [20]:
# Peek at the held-out test split (pandas abbreviates the rows and columns it shows).
test

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1240,39,1,Non-Travel,792,Research & Development,1,3,Life Sciences,1,1737,...,4,80,1,9,2,3,9,8,5,8
1119,38,1,Travel_Rarely,1245,Sales,14,3,Life Sciences,1,1582,...,4,80,1,10,3,3,9,8,7,7
1041,28,1,Travel_Rarely,866,Sales,5,3,Medical,1,1469,...,4,80,0,6,4,3,5,4,1,3
1339,22,0,Travel_Rarely,391,Research & Development,7,1,Life Sciences,1,1878,...,1,80,0,1,2,3,1,0,0,0
284,26,1,Travel_Frequently,496,Research & Development,11,2,Medical,1,390,...,3,80,1,5,3,3,5,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346,45,1,Travel_Rarely,556,Research & Development,25,2,Life Sciences,1,1888,...,4,80,2,10,2,2,9,8,3,8
77,45,1,Travel_Rarely,193,Research & Development,6,4,Other,1,101,...,2,80,0,17,3,4,0,0,0,0
1054,49,1,Travel_Rarely,1490,Research & Development,7,4,Life Sciences,1,1484,...,2,80,2,29,3,3,8,7,0,7
624,53,1,Travel_Rarely,661,Sales,7,2,Marketing,1,862,...,4,80,1,35,3,3,5,2,0,4


In [3]:
# Separate the target column from the predictors for each split. drop() returns
# a new frame, so `train` and `test` themselves are left untouched.
train_features = train.drop(columns='Attrition')
train_labels = train['Attrition'].copy()

test_features = test.drop(columns='Attrition')
test_labels = test['Attrition'].copy()

# Standalone scalar float32 symbolic tensor.
# NOTE(review): none of the cells shown here read this value, and the name
# shadows the built-in input() -- confirm whether it is still needed.
input = tf.keras.Input(shape=(), dtype=tf.float32)


In [4]:
# One symbolic Keras input per feature column, keyed by column name:
# object-dtype columns become string inputs, every other column float32.
inputs = {}

for name, column in train_features.items():
    dtype = tf.string if column.dtype == object else tf.float32
    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

inputs

{'Age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Age')>,
 'BusinessTravel': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'BusinessTravel')>,
 'DailyRate': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'DailyRate')>,
 'Department': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Department')>,
 'DistanceFromHome': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'DistanceFromHome')>,
 'Education': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Education')>,
 'EducationField': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'EducationField')>,
 'EmployeeCount': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'EmployeeCount')>,
 'EmployeeNumber': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'EmployeeNumber')>,
 'EnvironmentSatisfaction': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'EnvironmentSatisfaction')>,
 'Gender': <KerasTensor

In [5]:
# Pick out the float32 inputs, join them into a single tensor, and normalize
# that tensor with statistics adapted on the training rows.
numeric_inputs = {
    feature_name: tensor
    for feature_name, tensor in inputs.items()
    if tensor.dtype == tf.float32
}
numeric_columns = list(numeric_inputs)

x = layers.Concatenate()(list(numeric_inputs.values()))

norm = preprocessing.Normalization()
norm.adapt(np.array(train[numeric_columns]))
all_numeric_inputs = norm(x)

all_numeric_inputs

<KerasTensor: shape=(None, 26) dtype=float32 (created by layer 'normalization')>

In [6]:
# Collect the preprocessed feature tensors, starting with the normalized numeric block.
preprocessed_inputs = [all_numeric_inputs]

In [7]:
# One-hot encode every string-typed input and collect the results after the
# normalized numeric block.
#
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every categorical tensor (which would silently widen
# the concatenated feature vector built from it).
preprocessed_inputs = [all_numeric_inputs]

for name, input in inputs.items():
    if input.dtype == tf.float32:
        # Numeric inputs are already covered by all_numeric_inputs.
        continue
    # Vocabulary = the distinct values of this column in the training split.
    lookup = preprocessing.StringLookup(vocabulary=np.unique(train_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

In [8]:
# Join every preprocessed tensor into one feature tensor and wrap the whole
# preprocessing graph as a model that maps the raw input dict to that tensor.
concat_layer = layers.Concatenate()
preprocessed_inputs_cat = concat_layer(preprocessed_inputs)
train_preprocessing = tf.keras.Model(inputs=inputs, outputs=preprocessed_inputs_cat)

In [15]:
def _columns_as_arrays(frame):
    """Return a {column name: NumPy array of that column's values} dict for `frame`."""
    return {name: np.array(values) for name, values in frame.items()}


# The models in this notebook take their features as a dict keyed by column name.
train_features_dict = _columns_as_arrays(train_features)
test_features_dict = _columns_as_arrays(test_features)

# Smoke-test the preprocessing model on just the first training row.
features_dict = {key: array[:1] for key, array in train_features_dict.items()}
train_preprocessing(features_dict)

<tf.Tensor: shape=(1, 71), dtype=float32, numpy=
array([[-1.106, -1.501, -0.39 ,  0.077,  0.   ,  0.236,  1.161, -0.534,
        -1.04 , -0.975,  0.24 , -0.851, -0.868, -0.684, -0.592, -0.427,
         0.266,  0.   ,  0.231, -0.952, -2.187,  0.335, -0.499, -0.619,
        -0.067, -0.588,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,
         0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ]],
      dtype=float32)>

In [16]:
def build_data_model(preprocessing_head, inputs):
    """Build and compile the attrition classifier.

    Args:
        preprocessing_head: callable (here a Keras model) that maps `inputs`
            to a single preprocessed feature tensor.
        inputs: the symbolic Keras inputs the returned model is built on.

    Returns:
        A compiled tf.keras.Model mapping `inputs` to the output of the final
        Dense(1) layer. That output is a raw logit: no sigmoid is applied and
        the loss is configured with from_logits=True.
    """
    # NOTE(review): neither Dense layer specifies an activation, so the body
    # is a composition of two linear maps -- confirm this is intentional.
    body = tf.keras.Sequential([
        layers.Dense(64),
        layers.Dense(1)
    ])
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)
    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                optimizer=tf.optimizers.Adam())
    return model

# The builder has its own name so that `data_model` only ever refers to the
# model object and this line can be re-run without re-defining the function.
data_model = build_data_model(train_preprocessing, inputs)

In [18]:
# Train for 10 epochs on the dict of training feature arrays and the 0/1 Attrition labels.
data_model.fit(x=train_features_dict, y=train_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x155bd912ec8>

In [38]:
# The model outputs raw logits (its loss is built with from_logits=True), not probabilities.
# A positive value predicts class 1, which in this notebook's encoding is Attrition == 'No';
# a negative value predicts class 0, i.e. Attrition == 'Yes'.
print("Predict on test data")
x = data_model.predict(test_features_dict)
x[40:70]

Predict on test data


array([[ 6.594],
       [ 4.197],
       [ 1.634],
       [ 0.97 ],
       [ 1.745],
       [ 4.13 ],
       [-1.062],
       [ 0.359],
       [ 1.453],
       [ 2.099],
       [ 5.348],
       [ 1.781],
       [ 3.09 ],
       [ 1.783],
       [ 1.422],
       [ 1.45 ],
       [ 1.427],
       [ 6.762],
       [ 1.781],
       [ 1.894],
       [ 0.793],
       [ 4.056],
       [ 2.638],
       [ 3.568],
       [ 2.47 ],
       [-0.574],
       [-0.558],
       [ 3.428],
       [ 7.567],
       [ 5.78 ]], dtype=float32)

In [39]:
# True labels for the same rows (positions 40-69), to compare against the predictions above.
test_labels[40:70]

969     1
261     1
427     1
138     1
901     1
1023    1
683     0
369     1
988     1
1121    1
1210    1
1386    1
247     1
1005    1
46      1
398     1
1138    1
193     1
1409    1
807     1
725     0
1350    1
23      1
1130    1
9       1
480     0
1271    0
1147    1
244     1
98      1
Name: Attrition, dtype: int64