In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
 # Load the preprocessed absenteeism data. The path is relative, so the CSV
 # must be in the notebook's working directory.
 data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
 # Preview the first rows to confirm the columns loaded as expected.
 data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [4]:
# Approach: turn the continuous 'Absenteeism Time in Hours' column into two
# classes (targets): observations that are excessively absent and
# observations that are not.
#
# The cut-off is the median of that column. Splitting at the median implicitly
# balances the data set: roughly half of the targets become 0 and the other
# half become 1.
#
# Why balance matters: with heavily imbalanced classes a model can output one
# class exclusively and still appear to do very well on accuracy.
#
# Values at or below the median are treated as normal       -> 0
# Values strictly above the median are excessively absent   -> 1

data_preprocessed['Absenteeism Time in Hours'].median()


3.0

In [5]:
# Derive the cut-off from the data instead of hard-coding the median value
# observed earlier (3.0), so the targets stay correct if the input file changes.
absence_median = data_preprocessed['Absenteeism Time in Hours'].median()

# 1 = excessively absent (strictly above the median), 0 = otherwise.
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > absence_median, 1, 0)

targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# Attach the binary targets as a new column.
# Note: this mutates data_preprocessed in place.
data_preprocessed['Excessive Absenteeism'] = targets



In [7]:
# Confirm the new 'Excessive Absenteeism' column sits alongside the raw hours.
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [8]:
# Summing the 0/1 targets counts the ones: 319 of the 700 observations
# are labelled as excessively absent.
targets.sum()

319

In [9]:
# Total number of observations (length of the targets array).
targets.shape[0]

700

In [10]:
# Share of ones among the targets (the mean of a 0/1 array is the fraction of ones).
# The result is about 0.46: roughly 46% of the targets are 1 and 54% are 0.
#
# Author's rule of thumb: for logistic regression a 60/40 split usually works
# about as well as a perfectly balanced one, so roughly 45/55 is balanced enough.
targets.mean()

0.45571428571428574

In [11]:
# Remove the raw hours column now that the binary target stands in for it.
# drop() returns a new DataFrame, so data_preprocessed itself is not modified.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [12]:
# Check that 'Absenteeism Time in Hours' is gone and the target column remains.
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3,0


In [13]:
# Identity check: False means data_with_targets is a separate object,
# not just another name for data_preprocessed.
data_with_targets is data_preprocessed

False

In [14]:
# Checkpoint: rebind the name to an explicit copy so that later edits
# operate on this copy.
data_with_targets = data_with_targets.copy()



In [15]:
# Preview the checkpointed frame.
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,7,3,0


In [16]:
# (rows, columns) before the 'Date' column is removed.
data_with_targets.shape

(700, 16)

In [17]:
# Remove the raw 'Date' column; the frame already carries the
# 'Month Value' and 'Day of the Week' columns.
data_with_targets = data_with_targets.drop(columns=['Date'])

In [18]:
# Confirm the 'Date' column is no longer present.
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0


In [19]:
# One column fewer than before the 'Date' drop.
data_with_targets.shape

(700, 15)

In [20]:
# Select the inputs for the regression: every column except the last one,
# which holds the 'Excessive Absenteeism' target.
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [21]:
# Select the model inputs by dropping the target column by name rather than by
# position, so the selection stays correct even if the column order changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

In [22]:
# Create the scaler object; it is fitted to the data in the next step.
absenteeism_scaler = StandardScaler()

In [23]:

# Compute the mean and standard deviation of each feature from the unscaled inputs.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics leak into the scaling -- consider
# fitting on the training rows only.
# NOTE(review): the 0/1-looking columns (e.g. 'Reason 1'..'Reason 4') are
# standardised as well -- confirm this is intended.
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler()

In [24]:
# Transform the unscaled inputs into scaled inputs using the fitted scaler.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [25]:
# Inspect the scaled values (displayed as a NumPy array).
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.00772546],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
        -0.3882935 ,  0.66825259]])

In [26]:
# Scaling must not change the dimensions: same rows, one column per input feature.
scaled_inputs.shape

(700, 14)

In [27]:
# Hold some data back for testing so that overfitting can be detected, i.e. so
# the model can be checked on observations it did not see during training.
# This call is only a demonstration of what train_test_split returns: the
# result is not assigned, and without random_state the shuffle differs per run.
train_test_split(scaled_inputs, targets)

# train_test_split returns, for each array passed in, its train part then its test part:
#array 1 = training inputs  = x_train
#array 2 = test inputs      = x_test
#array 3 = training targets = y_train
#array 4 = test targets     = y_test

[array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          0.7537462 ,  1.34423065],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.95931334, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          0.18272635,  0.66825259],
        ...,
        [ 1.73205081, -0.09298136, -0.31448545, ...,  1.12666297,
          1.03925612, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          1.03925612, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          0.7537462 ,  1.34423065]]),
 array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -1.24482327, -0.68370352],
        [-0.57735027, -0.09298136,  3.17979734, ...,  0.26848661,
          0.46823627, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         -0.95931334, -0.68370352],
        ...,
        [-0.57735027, -0.09298136,  3.17979734, ..., -

In [28]:
# 80/20 train/test split; random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state =20)

In [29]:
# Training split sizes: inputs and targets must have the same number of rows.
print (x_train.shape,y_train.shape)

(560, 14) (560,)


In [30]:
# Test split sizes: 140 of the 700 rows confirms that the 80/20 split worked.
print (x_test.shape, y_test.shape)

(140, 14) (140,)


In [31]:
# MODELLING
# Logistic regression classifier created with the library's default settings.
reg = LogisticRegression()

In [32]:
# Train the model on the training split only.
reg.fit(x_train,y_train)

LogisticRegression()

In [33]:
# Accuracy on the training data. This is not a measure of generalisation:
# the test split is not used here.
reg.score(x_train,y_train)


0.7839285714285714

In [34]:
#### Manually check the accuracy of the model ######

# Accuracy is the share of model outputs that match the targets.
# The logistic regression was trained to produce outputs that are as close to
# the targets as possible, so to compute the accuracy by hand we first obtain
# the predicted classes with predict() and then compare them with the targets.

model_outputs = reg.predict(x_train)

In [35]:
# Predicted classes (0/1) for the training inputs.
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [36]:
# True targets for the same training rows, for comparison with model_outputs.
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [37]:
# Element-wise comparison: True where the prediction matches the target.
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [38]:
# Number of correct predictions: each True counts as 1 when summed.
np.sum(model_outputs == y_train)

439

In [39]:
# Total number of training observations that were predicted.
model_outputs.shape[0]

560

In [None]:

#Accuracy = Correct predictions/Observations
# Computed from the arrays instead of typed-in counts; the previous line
# ('439/560=') was not valid Python and the cell could not run.
# The result should equal reg.score(x_train, y_train).
np.sum(model_outputs == y_train) / model_outputs.shape[0]

