In [None]:
#First import necessary modules for the project
import pandas as pd
import numpy as np
from IPython.display import Markdown, display
import matplotlib.pyplot as plt
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# Load the HR employee-attrition CSV (comma-separated) from the working directory.
data = pd.read_csv(
    "WA_Fn-UseC_-HR-Employee-Attrition.csv",
    sep=",",
)
# Cell output: the column labels of the loaded frame.
data.columns

In [None]:
# Get an idea of the range of monthly income: keep only the attrition flag and
# the income, order the rows by income, and label each row with the income
# interval it falls into. The number of bins is chosen to be 29.
attritionAndIncome = (
    data.loc[:, ['Attrition', 'MonthlyIncome']]
    .sort_values(by='MonthlyIncome')
    .assign(binning=lambda frame: pd.cut(frame['MonthlyIncome'], bins=29))
)
attritionAndIncome

In [None]:
# Boolean masks selecting the rows whose Attrition value matches 'Yes' / 'No'.
income_yes_mask = attritionAndIncome["Attrition"].str.match('Yes')
income_no_mask = attritionAndIncome["Attrition"].str.match('No')

# Per-outcome subsets, then the number of rows in each income bin.
OnlyYesIncome = attritionAndIncome.loc[income_yes_mask]
yesIncomeSorted = OnlyYesIncome.groupby(by=['binning']).size().reset_index()
OnlyNoIncome = attritionAndIncome.loc[income_no_mask]
NoIncomeSorted = OnlyNoIncome.groupby(by=['binning']).size().reset_index()


In [None]:
# Join the per-bin 'Yes' and 'No' counts on the bin, then give the two
# suffixed count columns ("0_x" / "0_y") readable names.
newDataIncome = (
    yesIncomeSorted
    .merge(NoIncomeSorted, on='binning')
    .rename(columns={"0_x": "Yes", "0_y": "No"})
)
newDataIncome

In [None]:
def getRatioYN(row):
    """Return the Yes-to-No ratio for one bin.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'Yes' and 'No' (e.g. a DataFrame row).

    Returns
    -------
    float
        ``row['Yes'] / row['No']``. When 'No' is zero the result is ``inf``
        (``-inf`` for a negative numerator) or ``nan`` for 0/0, rather than a
        ZeroDivisionError on plain Python numbers.
    """
    yes_count, no_count = row['Yes'], row['No']
    if no_count == 0:
        # A bin without any 'No' rows must not abort the whole apply();
        # return the same value NumPy division would produce.
        if yes_count > 0:
            return float('inf')
        if yes_count < 0:
            return float('-inf')
        return float('nan')
    return yes_count / no_count
# Row-wise (axis=1) Yes/No ratio for every income bin, stored as a new column.
newDataIncome['Yes_No_Ratio'] = newDataIncome.apply(func=getRatioYN, axis=1)

In [None]:
def printmd(string):
    """Render ``string`` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
    
# Narrate the finding, then list the bins in ascending order of Yes_No_Ratio.
income_note = 'Lower the yes_no_ratio lower the attrition is. So, employee with very lowest salary has the highest attrition'
printmd(income_note)
newDataIncome.sort_values('Yes_No_Ratio')


In [None]:
# Line plot of the Yes/No ratio across the income bins.
frame1 = newDataIncome.plot(x="binning", y="Yes_No_Ratio")
# Suppress only the x ticks and their interval labels. Hiding the whole x-axis
# would also hide the 'Incomes' axis label set below.
frame1.tick_params(axis='x', which='both', bottom=False, labelbottom=False)

frame1.set_xlabel('Incomes')
frame1.set_ylabel('Yes_No_Ratio')
frame1.figure.suptitle('Employee attrition vs monthly income', fontsize=14)
plt.show()

In [None]:
# Same approach for commute distance: keep the attrition flag and the distance,
# order by distance, and label each row with one of 14 distance intervals.
attritionAndDistance = (
    data.loc[:, ['Attrition', 'DistanceFromHome']]
    .sort_values(by='DistanceFromHome')
    .assign(binning=lambda frame: pd.cut(frame['DistanceFromHome'], bins=14))
)

In [None]:
# Boolean masks selecting the rows whose Attrition value matches 'Yes' / 'No'.
distance_yes_mask = attritionAndDistance["Attrition"].str.match('Yes')
distance_no_mask = attritionAndDistance["Attrition"].str.match('No')

# Per-outcome subsets, then the number of rows in each distance bin.
OnlyYes = attritionAndDistance.loc[distance_yes_mask]
yesSorted = OnlyYes.groupby(by=['binning']).size().reset_index()
OnlyNo = attritionAndDistance.loc[distance_no_mask]
noSorted = OnlyNo.groupby(by=['binning']).size().reset_index()

In [None]:
# Join the per-bin 'Yes' and 'No' counts on the bin, then give the two
# suffixed count columns ("0_x" / "0_y") readable names.
newDataDistance = (
    yesSorted
    .merge(noSorted, on='binning')
    .rename(columns={"0_x": "Yes", "0_y": "No"})
)

In [None]:
def getRatio(row):
    """Return the No-to-Yes ratio for one bin.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'No' and 'Yes' (e.g. a DataFrame row).

    Returns
    -------
    float
        ``row['No'] / row['Yes']``. When 'Yes' is zero the result is ``inf``
        (``-inf`` for a negative numerator) or ``nan`` for 0/0, rather than a
        ZeroDivisionError on plain Python numbers.
    """
    no_count, yes_count = row['No'], row['Yes']
    if yes_count == 0:
        # A bin without any 'Yes' rows must not abort the whole apply();
        # return the same value NumPy division would produce.
        if no_count > 0:
            return float('inf')
        if no_count < 0:
            return float('-inf')
        return float('nan')
    return no_count / yes_count
# Row-wise (axis=1) No/Yes ratio for every distance bin, stored as a new column.
newDataDistance['No_Yes_Ratio'] = newDataDistance.apply(func=getRatio, axis=1)

In [None]:
# Narrate the finding, then list the bins in ascending order of No_Yes_Ratio.
distance_note = 'This shows higher the ratio, lower the attrition is. We can see the close distance once has the lower attrition. Also, the mean distances has higher attrition'
printmd(distance_note)
newDataDistance.sort_values('No_Yes_Ratio')

In [None]:
# Line plot of the No/Yes ratio across the distance bins.
frame1 = newDataDistance.plot(x="binning", y="No_Yes_Ratio")
# Suppress only the x ticks and their interval labels so the axis label below
# stays visible (hiding the whole x-axis would remove it as well).
frame1.tick_params(axis='x', which='both', bottom=False, labelbottom=False)

# Label both axes so the figure can be read on its own.
frame1.set_xlabel('Distance from home')
frame1.set_ylabel('No_Yes_Ratio')
frame1.figure.suptitle('Employee attrition vs distance', fontsize=14)
plt.show()

In [None]:
# Reload the CSV, encode the Attrition label as an integer and split the rows
# into train and test sets.
# NOTE: the encoding is 'Yes' -> 0 and 'No' -> 1, i.e. label 1 means
# Attrition == 'No'.

data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv", sep = ',')
data['Attrition'] = data['Attrition'].map({'Yes':0 ,'No':1})
# map() yields NaN for any value outside the dictionary; fail here rather than
# train on missing labels.
assert data['Attrition'].notna().all(), "unexpected Attrition value (expected 'Yes' or 'No')"
# Fixed seed so the 80/20 split, and every metric computed from it, is the same
# on each Restart & Run All.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Inspect the held-out test split returned by train_test_split.
test

In [None]:
# Separate the label column from the feature columns for both splits.
# copy() leaves `train` / `test` untouched, because pop() removes the column
# from the frame it is called on.
train_features = train.copy()
train_labels = train_features.pop('Attrition')

test_features = test.copy()
test_labels = test_features.pop('Attrition')


In [None]:
# One symbolic Keras input per feature column: object-dtype columns become
# string inputs, every other column becomes a float32 input.
inputs = {}

for name, column in train_features.items():
    input_dtype = tf.string if column.dtype == object else tf.float32
    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=input_dtype)

inputs

In [None]:
# Pick out the float32 (numeric) symbolic inputs; string inputs are skipped here.
numeric_inputs = {}
for feature_name, symbolic_input in inputs.items():
    if symbolic_input.dtype == tf.float32:
        numeric_inputs[feature_name] = symbolic_input

# Join the numeric inputs into one tensor and pass it through a Normalization
# layer adapted on the training values of those same columns.
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
numeric_training_values = np.array(train[numeric_inputs.keys()])
norm.adapt(numeric_training_values)
all_numeric_inputs = norm(x)

all_numeric_inputs

In [None]:
# Start the list of preprocessed feature tensors with the normalised numeric
# block; the string-feature loop in the following cell appends to this list.
preprocessed_inputs = [all_numeric_inputs]

In [None]:
# One-hot encode every non-float32 (string) input and queue the result for
# concatenation with the numeric block.
for name, symbolic_input in inputs.items():
    if symbolic_input.dtype != tf.float32:
        # Vocabulary = the distinct training values of this column.
        vocabulary = np.unique(train_features[name])
        lookup = preprocessing.StringLookup(vocabulary=vocabulary)
        one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
        x = one_hot(lookup(symbolic_input))
        preprocessed_inputs.append(x)

In [None]:
# Concatenate all preprocessed feature tensors and wrap the raw-input ->
# preprocessed-tensor graph in a Keras model of its own.
concatenate_features = layers.Concatenate()
preprocessed_inputs_cat = concatenate_features(preprocessed_inputs)
train_preprocessing = tf.keras.Model(inputs=inputs, outputs=preprocessed_inputs_cat)

In [None]:
# Convert each split into a dict of column name -> NumPy array of its values.
train_features_dict = {}
for name, value in train_features.items():
    train_features_dict[name] = np.array(value)

test_features_dict = {}
for name, value in test_features.items():
    test_features_dict[name] = np.array(value)

# Smoke test: run only the first training row through the preprocessing model.
features_dict = {}
for name, values in train_features_dict.items():
    features_dict[name] = values[:1]
train_preprocessing(features_dict)

In [None]:
def data_model(preprocessing_head, inputs):
    """Build and compile the attrition classifier.

    Parameters
    ----------
    preprocessing_head : callable
        Applied to ``inputs`` to produce the preprocessed feature tensor
        (this notebook passes the ``train_preprocessing`` model).
    inputs : dict
        Symbolic Keras inputs; passed to ``preprocessing_head`` and used as the
        inputs of the returned model.

    Returns
    -------
    tf.keras.Model
        ``preprocessing_head`` followed by Dense(64) and Dense(1), compiled
        with Adam and binary cross-entropy computed from logits.
    """
    body = tf.keras.Sequential([
        layers.Dense(64),
        layers.Dense(1)
    ])
    features = preprocessing_head(inputs)
    result = body(features)
    model = tf.keras.Model(inputs, result)
    model.compile(
        loss=tf.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.optimizers.Adam(),
        # The model outputs raw logits, so the decision boundary is 0, not the
        # 0.5 cut-off that the 'accuracy' shortcut applies to its input.
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
    )
    return model

# NOTE: this rebinds `data_model` from the builder function to the built model;
# the cells below use `data_model` as the model.
data_model = data_model(train_preprocessing, inputs)

In [None]:
# Train for 5 epochs on the dict of training feature arrays and their labels.
data_model.fit(x=train_features_dict, y=train_labels, epochs=5)

In [None]:
# Score the trained model on the held-out split and report the loss together
# with the compiled accuracy metric.
print("Evaluate on test data")
test_scores = data_model.evaluate(test_features_dict, test_labels)
loss, accuracy = test_scores
print("test loss :", loss)
print("test accuracy: ", accuracy)

In [None]:
# The model emits raw logits (its loss is built with from_logits=True). With
# the label encoding 'Yes' -> 0 / 'No' -> 1, a positive number denotes a
# predicted label of 1 (Attrition == 'No'); a negative number denotes a
# predicted label of 0 (Attrition == 'Yes').
print("Predict on test data")
x = data_model.predict(test_features_dict)
x[40:70]

In [None]:
# True labels at the same 30 positions as the predictions sliced with x[40:70].
test_labels.iloc[40:70]