# Absenteeism Prediction

In [None]:
%config IPCompleter.greedy=True

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training file, then show its dimensions and the per-column
# dtype / non-null summary.
rawAbsenteeism = pd.read_csv(filepath_or_buffer="Absenteeism_at_work_train.csv")
n_rows, n_columns = rawAbsenteeism.shape
print((n_rows, n_columns))
rawAbsenteeism.info()

The dataset contains 666 observations of 21 variables. Two variables (Weight and Hit target) have fewer than 666 non-null records, which indicates missing values.

In [None]:
# Resolving conflicts with Age.
# Entries that cannot be parsed as numbers become NaN, the remaining values are
# truncated to whole numbers, and an age of 0 is treated as missing as well.
age_numeric = pd.to_numeric(rawAbsenteeism['Age'], errors='coerce')
age_whole = age_numeric.fillna(0).astype('int64')
# Assign the result back to the frame instead of calling
# replace(..., inplace=True) on the selected column: an in-place edit of that
# selection is not guaranteed to reach the DataFrame (pandas warns about it and
# ignores it when Copy-on-Write is enabled).
rawAbsenteeism['Age'] = age_whole.replace(0, np.nan)

In [None]:
# Remove the commas from the work-load strings (presumably thousands
# separators — confirm against the raw file) and store the column as float.
workload_column = 'Work load Average/day '  # the raw header ends with a space
workload_text = rawAbsenteeism[workload_column].str.replace(',', '')
rawAbsenteeism[workload_column] = workload_text.astype(float)

 ##  Pre-processing 

In [None]:
# Keep only the first occurrence of every fully duplicated row
# (keeping the first occurrence is the default of drop_duplicates).
deduplicated = rawAbsenteeism.drop_duplicates()
rawAbsenteeism = deduplicated
print(rawAbsenteeism.shape)


I removed the 27 duplicates from the dataset.  

The variable "Reason for absence" has a minimum value of 0, although its valid codes start at 1 (Certain infectious and parasitic diseases). The zeros are therefore treated as missing values below.

In [None]:
# A reason code of 0 is treated as a missing value.
# One vectorised mask replaces the row-by-row loop: the loop's chained
# `frame[column].iloc[i] = ...` assignment writes into a temporary selection,
# which triggers SettingWithCopyWarning and is not guaranteed to modify the
# DataFrame itself.
reason_codes = rawAbsenteeism["Reason for absence"]
rawAbsenteeism["Reason for absence"] = reason_codes.mask(reason_codes == 0)

rawAbsenteeism["Reason for absence"].describe()


In [None]:
# Percentage of missing entries in each column
print("Missing values: Before")
missing_counts = rawAbsenteeism.isnull().sum()
missing_counts * 100 / len(rawAbsenteeism)

In [None]:
# "Reason for absence": fill the gaps with the most frequent value
# (mode substitution, chosen on the 5% threshold)
reason_mode = rawAbsenteeism["Reason for absence"].mode().iloc[0]
rawAbsenteeism["Reason for absence"] = rawAbsenteeism["Reason for absence"].fillna(reason_mode)
# Rows that still have a missing value in any other variable are discarded
rawAbsenteeism = rawAbsenteeism.dropna()


In [None]:
# Per-column count of missing values after imputation and row removal
print("Missing values: After")
remaining_missing = rawAbsenteeism.isna().sum()
print(remaining_missing)

print("Shape of dataset:", rawAbsenteeism.shape)

In [None]:
# Cast each coded, discrete feature to the pandas "category" dtype
for categorical_column in (
    "Reason for absence",
    "Month of absence",
    "Day of the week",
    "Seasons",
    "Disciplinary failure",
    "Education",
    "Social drinker",
    "Social smoker",
):
    rawAbsenteeism[categorical_column] = rawAbsenteeism[categorical_column].astype("category")

rawAbsenteeism.info()

In [None]:
# Tidy the headers: spell "/" out as " per " and trim surrounding whitespace
rawAbsenteeism.columns = [header.replace('/', ' per ').strip() for header in rawAbsenteeism.columns]

In [None]:
# Frame without the eight category-typed features. Despite its name,
# non_numrawAbsenteeism holds the columns that are left after removing them.
categorical_features = [
    'Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
    'Disciplinary failure', 'Education', 'Social drinker', 'Social smoker',
]
non_numrawAbsenteeism = rawAbsenteeism.drop(columns=categorical_features)

non_numrawAbsenteeism.info()

In [None]:
# Detecting and removing Outliers
# A row is kept only if, in every column of non_numrawAbsenteeism, its value
# has an absolute z-score below 3.
absolute_zscores = np.abs(stats.zscore(non_numrawAbsenteeism))
within_three_sigma = (absolute_zscores < 3).all(axis=1)
AbsenteeismOutlier = rawAbsenteeism[within_three_sigma]
AbsenteeismOutlier.describe()  # not the cell's last expression, so nothing is displayed
# Number of rows removed by the filter
print(rawAbsenteeism.shape[0]-AbsenteeismOutlier.shape[0])

In [None]:
# saving the cleaned dataset
# NOTE(review): this writes rawAbsenteeism, not the outlier-filtered
# AbsenteeismOutlier frame built in the previous cell — confirm that keeping
# the outlier rows is intended.
# The DataFrame index is written as an extra first column (to_csv default).
rawAbsenteeism.to_csv('CleanedAbsenteeism_DF.csv')

# Independent copy of the frame as it is at this point
Testsample = rawAbsenteeism.copy()

In [None]:
# Column names split into two groups: categorical and continuous
catvariables = [
    'Reason for absence',
    'Month of absence',
    'Day of the week',
    'Seasons',
    'Disciplinary failure',
    'Education',
    'Social drinker',
    'Social smoker',
]

contVariables = [
    'Transportation expense',
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work load Average per day',
    'Hit target',
    'Son',
    'Pet',
    'Weight',
    'Height',
    'Body mass index',
]

In [None]:
# Maximum and mean of the target and of every continuous feature, before scaling
print("Before: \n")
target_hours = rawAbsenteeism['Absenteeism time in hours']
print("Maximum of Absenteeism time in hours is", target_hours.max(), " while the mean value is ",
      target_hours.mean())
for feature in contVariables:
    feature_values = rawAbsenteeism[feature]
    print("Maximum of", feature, "is", feature_values.max(), " while the mean value is ", feature_values.mean())


In [None]:
# standardization: each continuous feature is replaced, in place, by its
# preprocessing.scale() result (column by column)
for feature in contVariables:
    scaled_values = preprocessing.scale(rawAbsenteeism[feature])
    rawAbsenteeism[feature] = scaled_values
    

In [None]:
# Same summary as before scaling; the target column itself was not scaled
print("After scaling: \n")
target_hours = rawAbsenteeism['Absenteeism time in hours']
print("Maximum of Absenteeism time in hours is", target_hours.max(), " while the mean value is ",
      target_hours.mean())
for feature in contVariables:
    feature_values = rawAbsenteeism[feature]
    print("Maximum of ", feature, "is", feature_values.max(), " while the mean value is ", feature_values.mean())

## Exploratory Data Analysis

This dataset only shows the employees' hours of absence for the period.

In [None]:
# Box plots of absence hours per weekday, with one hue per season.
# x / y / hue are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there. The keyword form also
# matches the other catplot calls in this notebook.
with sns.axes_style(style='whitegrid'):
    boxplotTarget = sns.catplot(data=rawAbsenteeism, x="Day of the week", y="Absenteeism time in hours",
                                hue="Seasons", kind="box")
    boxplotTarget.set_axis_labels("Day", " Absenteeism time in hours");

In [None]:
sns.set_style("ticks")
# One count plot per categorical feature; only the column and the aspect ratio vary
count_plot_options = dict(data=rawAbsenteeism, kind='count', height=3)
sns.catplot(x='Reason for absence', aspect=3, **count_plot_options)
sns.catplot(x='Social drinker', aspect=2, **count_plot_options)
sns.catplot(x='Disciplinary failure', aspect=2, **count_plot_options)

In [None]:
# Scatter plot of age against absence hours.
fig, ax = plt.subplots(figsize=(10,8))
ax.scatter(rawAbsenteeism["Age"], rawAbsenteeism['Absenteeism time in hours'])
# "Age" was standardised earlier in the notebook, so the x axis is in z-score
# units — it is neither a proportion nor an age in years. The y axis is the
# unscaled target column.
ax.set_xlabel('Age (standardised)')
ax.set_ylabel('Absenteeism time in hours')
ax.set_title('Absenteeism time in hours by age')
plt.show()

In [None]:
# Histograms of the frame's columns. The figure size is changed through
# rcParams, so it also applies to figures created later without an explicit size.
plt.rcParams.update({"figure.figsize": (20, 20)})
rawAbsenteeism.hist()
plt.show()

In [None]:
from scipy.stats import norm

# Histogram of the target, overlaid with the normal density that has the same
# mean and standard deviation, drawn over mean ± 3 standard deviations.
absent_hours = rawAbsenteeism["Absenteeism time in hours"]
mu, sigma = absent_hours.mean(), absent_hours.std()
x = np.linspace(mu-3*sigma, mu+3*sigma, 100)

fig= plt.figure(figsize=(5,5))
plt.axvline(x=mu, color='r')  # vertical marker at the mean
plt.plot(x, norm.pdf(x, loc=mu, scale=sigma))
plt.hist(absent_hours, color='yellow', density=True, bins=50)
plt.title('Distribution of absenteeism hours')
plt.xlabel('hours')
plt.ylabel('density of absent hours')
plt.show()

In [None]:
# Creating a copy of cleaned dataset for modeling 
# Independent (deep) copy used for modelling, so that recoding the target
# leaves rawAbsenteeism untouched
CatYAbsenteeism_DF = rawAbsenteeism.copy(deep=True)


## Transforming target variable to categorical type to fulfill task 1

In [None]:
#Transforming target variable to categorical to fulfill task 1
# Classes: 0 = no absence, 1 = more than 0 and up to 6 hours, 2 = more than 6 hours.
# Both masks are computed from the original hours before anything is overwritten;
# rows equal to 0 already carry their class label and are left as they are.
original_hours = CatYAbsenteeism_DF['Absenteeism time in hours'].copy()
up_to_six_hours = (original_hours > 0) & (original_hours <= 6)
over_six_hours = original_hours > 6
CatYAbsenteeism_DF.loc[up_to_six_hours, 'Absenteeism time in hours'] = 1
CatYAbsenteeism_DF.loc[over_six_hours, 'Absenteeism time in hours'] = 2

In [None]:
# Store the recoded target as a categorical and list the class sizes, smallest first
target_column = 'Absenteeism time in hours'
CatYAbsenteeism_DF[target_column] = CatYAbsenteeism_DF[target_column].astype('category') # factored
target_AbsentHours = CatYAbsenteeism_DF[target_column]
class_counts = target_AbsentHours.value_counts()
print(class_counts.sort_values(ascending=True))



## Feature Selection

In [None]:
# Preview the continuous features; only the first rows are shown instead of
# dumping the whole frame into the notebook output.
CatYAbsenteeism_DF[contVariables].head()

In [None]:
# Heat map of the pairwise correlations between the continuous features
corr_df = CatYAbsenteeism_DF[contVariables]
corr = corr_df.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# One tick per feature on both axes, labelled with the feature name
feature_labels = corr_df.columns
ticks = np.arange(len(feature_labels))
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(feature_labels)
ax.set_yticklabels(feature_labels)
plt.show()


In [None]:
# Create correlation matrix
X = CatYAbsenteeism_DF.drop(['Absenteeism time in hours'], axis=1)

# Only numeric columns take part in the correlation. Selecting them explicitly
# keeps the category-typed features out regardless of the pandas version,
# instead of relying on corr()'s version-dependent handling of non-numeric columns.
corr_matrix = X.select_dtypes(include='number').corr().abs()

# Select upper triangle of correlation matrix (strictly above the diagonal).
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle_mask)

# Find index of feature columns with correlation greater than 0.60
to_drop = [column for column in upper.columns if (upper[column] > 0.6).any()]

print("These variables didnt qualify:", to_drop)
Xnew = X.drop(to_drop, axis=1)


In [None]:
# Feature matrix for modelling: drop the identifier, the target, and the two
# features excluded from the model (Age and Body mass index).
X = CatYAbsenteeism_DF.drop(['ID','Absenteeism time in hours','Age', 'Body mass index'], axis=1)

# removing age and body mass index from the list of continuous features.
# They are filtered out by name rather than with np.delete on positions 3 and
# 10, so the cell is idempotent: re-running it no longer removes two further,
# unrelated features from the already shortened list.
removedcont = [3,10]  # positions of the two features in the original list (kept for reference)
removed_features = ['Age', 'Body mass index']
contVariables = [name for name in contVariables if name not in removed_features]

In [None]:
# Standardise the remaining continuous features of X with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell, so test-set statistics influence the training features —
# consider fitting on the training part only.
# NOTE(review): these columns were already passed through preprocessing.scale
# earlier in the notebook; confirm that the second pass is intended.
Standardscaler = StandardScaler()
X[contVariables] = Standardscaler.fit_transform(X[contVariables])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, target_AbsentHours, random_state=123, test_size=0.25)

print("training:",len(X_train))
print("testing:",len(X_test))
print()
# Share of each class (in percent) within the two partitions
test_class_share = y_test.value_counts(normalize=True)*100
print("testing Class:\n", test_class_share)
print()
train_class_share = y_train.value_counts(normalize=True)*100
print("training Class:\n", train_class_share)


## Training and Making Predictions

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from IPython.display import SVG 
from graphviz import Source 
from IPython.display import display
from sklearn.metrics import classification_report

In [None]:
# Decision Tree
# Training a model on the data.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the reported scores, are identical on every run.
treemodel = DecisionTreeClassifier(random_state=123)
treemodel = treemodel.fit(X_train, y_train)
print(treemodel)

In [None]:
# visualize model
# Export the fitted tree to Graphviz DOT text, then show it inline as SVG
tree_dot = tree.export_graphviz(
    treemodel,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['0', '1', '2'],
    filled=True,
)
graph = Source(tree_dot)
tree_svg = graph.pipe(format='svg')
display(SVG(tree_svg))

# Render the same graph as PNG under the name 'tree_Absenteeism' and open it in a viewer
graph.format = 'png'
graph.render('tree_Absenteeism',view=True)


In [None]:
# Evaluating Model Performance
tree_predict = treemodel.predict(X_test)
tree_confusion = confusion_matrix(y_test, tree_predict)
treeScore = accuracy_score(y_test, tree_predict)*100  # accuracy in percent
# The report is restricted to the labels that actually occur in the predictions
tree_report = classification_report(y_test, tree_predict, labels=np.unique(tree_predict))

print("Decision Tree: \n")
print("Confusion matrix:\n", tree_confusion)
print("Predictive Accuracy score:", treeScore)

print("Classification Report: \n", tree_report)


In [None]:
# Naive Bayes
# Fit a Gaussian Naive Bayes classifier and score it on the held-out rows
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)

naive_predict = gnb_model.predict(X_test)
naive_confusion = confusion_matrix(y_test, naive_predict)
naivebayesScore = accuracy_score(y_test, naive_predict)*100  # accuracy in percent

print("Naive Bayes: \n")
print("Confusion matrix:\n", naive_confusion)
print("Predictive Accuracy score:", naivebayesScore)

In [None]:
# SVM - Polynomial
# Support vector classifier with a degree-3 polynomial kernel
clf = svm.SVC(degree=3, kernel='poly')
clf.fit(X_train, y_train)

# Test result
poly_predict = clf.predict(X_test)
poly_confusion = confusion_matrix(y_test, poly_predict)
polynomialScore = accuracy_score(y_test, poly_predict)*100  # accuracy in percent

print("SVM - Polynomial: \n")
print("Confusion Matrix:", poly_confusion)
print("Accuracy Score:", polynomialScore)

In [None]:
# SVM - RBF
# Support vector classifier with an RBF kernel (gamma = 0.3); reuses the name clf
clf = svm.SVC(gamma=.3, kernel='rbf')
clf.fit(X_train, y_train)

# Test result
svmrbf_predict = clf.predict(X_test)
rbf_confusion = confusion_matrix(y_test, svmrbf_predict)
rbfScore = accuracy_score(y_test, svmrbf_predict)*100  # accuracy in percent

print("SVM - RBF: \n")
print("Confusion Matrix:", rbf_confusion)
print("Accuracy Score:", rbfScore)

**Visual representation of Accuracy scores among models:**

In [None]:
# One row per model with its accuracy (in percent)
model_names = ['Decision Tree', 'Naive Bayes', 'SVM_Polynomial', 'SVM_RBF']
accuracy_scores = [treeScore, naivebayesScore, polynomialScore, rbfScore]
dataScore = {'Model_name': model_names, 'Accuracy_Score': accuracy_scores}

df_Score = pd.DataFrame(dataScore)

# display dataframe
df_Score.head()

In [None]:
import matplotlib.ticker as mtick

# Bar chart of the accuracy scores, with a percent-formatted y axis
plt.figure(figsize=(10,8))
sns.set_style("whitegrid", {'axes.grid' : False})
accuracyplot = sns.barplot(data=df_Score, x='Model_name', y='Accuracy_Score')
accuracyplot.yaxis.set_major_formatter(mtick.PercentFormatter())
# Write each bar's value just above it, centred on the bar
for bar in accuracyplot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    accuracyplot.annotate('{:.2f}%'.format(bar_height),
                          (bar_center, bar_height),
                          ha='center', va='center',
                          xytext=(0, 10), textcoords='offset points')
accuracyplot.set(xlabel="Model", ylabel="Accuracy Score",title="Bar graph showing the accuracy score for model")