# Machine Learning Model Building to Predict Absenteeism

## Import the Relevant libraries 

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,1,0,0,5,3,235,16,32,237.656,25,1,0,0,2


- > It seems that the "reason for absence" will be the most indicative.

- > ‘Daily workload Avg’  will have something to do with it as well, since the busier a person is, the less he or she will want to skip work.
    
- > Finally, 'children' and 'pets', together with "distance from work", should also have something to do with absenteeism.
If your child or pet is sick at home, you'll have to go home, take them to the doctor and bring them back, which will be much more time-consuming than a simple visit to the doctor.


# Creating the Targets

- We will take the median value of the absenteeism time in our cell.
- 	Everything below the median would be considered normal.
- 	Everything above the median would be excessive.


In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

**Median is 3.0  |  So, we will analyze that Moderately Absent (<= 3 Hours) and "Excessively absent" as ( >= 4 hours)**


- let's create a new variable called targets, which will measure if a person has been absent for more than three hours.


In [5]:
# Flag a row as 1 when more than 3 hours (the median computed above) were missed, otherwise 0.
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)
# **Here, anything more than 3 is 1, and 3 or less is 0**

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

- Better to do that as parameterization! It makes the code easy to understand and follow.
Moreover, this minimizes the chance of making mistakes.

In [7]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                  data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

**Here, anything more than the median (3) is 1, and anything equal to or below it is 0**

In [8]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

let's add the targets to the data frame data_preprocessed, we will do that in a new column called Excessive Absenteeism

In [9]:
data_preprocessed['Excessive Absenteeism'] = targets

In [10]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1,2,0


Using the Median as a Cut-off line is 'numerically stable and rigid'

We have implicitly balanced the dataset!

Roughly half of the targets are zeros, while the other half are ones. As you may remember, this will prevent our model from learning to output only one of the two classes.

In order to prove that, let's divide the number of targets that are ones by the total number of targets.


In [11]:
targets.sum() / targets.shape[0]

0.45571428571428574

The result is around 0.46, 
**so around 46 percent of the targets are 1s, thus around 54 percent of the targets are 0s.**

# -> Creating a checkpoint  'data_with_targets'

In [12]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], 
                                           axis = 1)

In [13]:
# To find out that the two are, in fact, different or Same ?

data_with_targets is data_preprocessed

False

False: the `is` operator compares object identity, so this confirms that `data_with_targets` and `data_preprocessed` are two different objects.

In [14]:
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,1,0,0,5,3,235,16,32,237.656,25,1,0,0,0


**data_with_targets is our CHECK point for further analysis**

# -> Selecting the Inputs (Excluding the target 'Excessive Absenteeism' column)

In [15]:
data_with_targets.head(2)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0


In [16]:
data_with_targets.iloc[ :, 0:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,1,0,0,5,3,235,16,32,237.656,25,1,0,0


- Create a new variable called unscaled_inputs, containing all the data from data_with_targets except the last column.

In [17]:
unscaled_inputs = data_with_targets.iloc[ :, :-1]

In [18]:
unscaled_inputs.head(1)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1


# -> A Bit of Statistical Preprocessing
## -> standardize the 'unscalled_inputs' data

In [19]:
# import the relevant module from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler

In [20]:
absenteeism_scaler = StandardScaler()
# absenteeism_scaler will be used to subtract the mean and divide by the standard deviation variable-wise (feature-wise)

**StandardScaler the unscalled_inputs**

In [21]:
## To fit our input data, we write absenteeism_scaler.fit(unscaled_inputs)
## It will contain information about the mean and standard deviation!
# NOTE(review): fit is called on all rows, before the train/test split further down,
# so the test rows also contribute to the stored mean/std - confirm this is acceptable.

absenteeism_scaler.fit(unscaled_inputs)
# This line will calculate and store the Mean and the Std Dev


**Transform the std_scalled**

This operation transforms the unscaled_inputs using the information contained in absenteeism_scaler. In simple words, we subtract the mean and divide by the standard deviation.

In [22]:
# In order to apply it, we must use another method called transform.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [23]:
scaled_inputs

array([[-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -1.21748491, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -1.21748491, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027,  0.82136542, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [24]:
scaled_inputs.shape

(700, 14)

Whenever you get new data, you will know that the standardization information is contained in `absenteeism_scaler`.
Thus, you'll be able to standardize the new data in the same way.

# -> Split the data into Train-Test-Split & shuffle

## Import the relevant module

In [25]:
from sklearn.model_selection import train_test_split
# Splits arrays or matrices into random train and test subsets

**Experimentation**
It will split the data into 75 - 25 percentages

In [26]:
train_test_split(scaled_inputs, targets)
# The output consists of four arrays, returned in this order:
# array 1: training dataset with inputs
# array 2: test dataset with inputs
# array 3: training dataset with targets
# array 4: test dataset with targets

[array([[ 1.73205081, -1.21748491, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         -0.01928035,  0.26848661],
        ...,
        [-0.57735027,  0.82136542, -0.31448545, ...,  2.23224237,
          0.88046927, -0.58968976],
        [-0.57735027, -1.21748491,  3.17979734, ..., -0.44798003,
         -0.01928035,  2.8430157 ],
        [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976]]),
 array([[-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027, -1.21748491,  3.17979734, ..., -0.44798003,
         -0.01928035,  1.12666297],
        [-0.57735027,  0.82136542, -0.31448545, ..., -0.44798003,
         -0.01928035, -0.58968976],
        ...,
        [ 1.73205081, -1.21748491, -0.31448545, ..., -

In [27]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets)

In [28]:
print(x_train.shape, y_train.shape)

(525, 14) (525,)


In [29]:
print(x_test.shape, y_test.shape)

(175, 14) (175,)


In [None]:
print(525/700*100)
print(175/700*100)

**Implementation**

In [30]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20);

In [31]:
print(x_train.shape, y_train.shape)
# 80%

(560, 14) (560,)


In [32]:
print(x_test.shape, y_test.shape)
# 20%

(140, 14) (140,)


In [33]:
print(560/700*100)

80.0


# -> Training the Model

## Logistic Regression with sklearn: Fit the model and Finding the Accuracy

In [34]:
from sklearn.linear_model import LogisticRegression
# for model evaluations let's import metrics
from sklearn import metrics

In [35]:
# we must declare a new variable, which will be a logistic regression object
reg = LogisticRegression()

**Fit**

In [36]:
# Next, we must fit the regression, we simply write reg.fit and in parentheses X train and Y train.
reg.fit(x_train, y_train)

**Model Accuracy/ Score**

In [37]:
# It is easy to evaluate the model accuracy. All we have to do is type reg.score.
## two required arguments, inputs and targets.
reg.score(x_train, y_train)

0.7696428571428572

**Based on the data we used, our model learned to classify ~77% (0.7696) of the training observations correctly**
Meaning 77% of the model outputs match the targets

## Manually Cross Verifying the Training Accuracy

In [38]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

**Y_Train**

In [39]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

**COMPARE**

we write model outputs == y train, we will get an array which compares the elements of the two variables.


In [40]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

**let's sum this array using np.sum() | The result will be the total number of true entries**

In [41]:
np.sum((model_outputs == y_train))

431

In [42]:
model_outputs.shape[0]

560

In [43]:
# Accuracy = Correct predictions / Number of Observations:

np.sum((model_outputs == y_train)) / model_outputs.shape[0]
## We get the same Accuracy O/P:

0.7696428571428572

We get the exact same result as with the sklearn `score` method; however, this time we have a much better idea of what that result means.

# -> Extracting the Intercept and Coefficients from a Logistic Regression

### Finding the Intercept and Coefficients

**INTERCEPT**

In [44]:
reg.intercept_

array([-0.18530441])

**Coefficients**

In [45]:
reg.coef_

array([[ 1.71171809,  0.91192261,  0.66551397,  0.66551397,  0.01938308,
        -0.09089828,  0.69847107, -0.04167916, -0.17742811, -0.03565277,
         0.29645761, -0.12724118,  0.39539561, -0.31932416]])

**We want to know what variable those coefficients refer to:**

In [46]:
## we have this variable scaled_inputs which contains the inputs but its an ndarray object and not a dataframe
scaled_inputs.columns.values
# since, scaled_inputs is a ndarray so that's why we have errors!

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [47]:
# This is the correct variable to extract the feature names from (as it is a DataFrame)
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [48]:
# Store it in a new variable called ‘feature_name’
feature_name = unscaled_inputs.columns.values

# Creation of Summary Table:

**creating a neat data frame that will contain the intercept, the feature names and the corresponding coefficients we can call this data frame summary table.**

In [49]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_) #This will create a new column called Coefficient inside, we will have the information from the regression coefficients.

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.711718
1,Reason_2,0.911923
2,Reason_3,0.665514
3,Reason_4,0.665514
4,Month_Value,0.019383
...,...,...
9,Daily Work Load Average,-0.035653
10,Body Mass Index,0.296458
11,Education,-0.127241
12,Children,0.395396


**ADDING Intercept:**

In [50]:
summary_table.index = summary_table.index + 1

summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

summary_table = summary_table.sort_index()

summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.185304
1,Reason_1,1.711718
2,Reason_2,0.911923
3,Reason_3,0.665514
4,Reason_4,0.665514
...,...,...
10,Daily Work Load Average,-0.035653
11,Body Mass Index,0.296458
12,Education,-0.127241
13,Children,0.395396


# -> Interpreting the Logistic Regression Coefficients

Intercept is aka Bias

Coefficient is aka Weight

We also need Odds ratio column!

All the coefficients we have are expressed as log-odds, so to make them more interpretable,
let's take the exponential of these coefficients. I'll create a new column in our data frame called Odds_ratio.


In [51]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [52]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.185304,0.830851
1,Reason_1,1.711718,5.538469
2,Reason_2,0.911923,2.489104
3,Reason_3,0.665514,1.945490
4,Reason_4,0.665514,1.945490
...,...,...,...
10,Daily Work Load Average,-0.035653,0.964975
11,Body Mass Index,0.296458,1.345086
12,Education,-0.127241,0.880521
13,Children,0.395396,1.484972


<div class="alert alert-block alert-danger"> Same Below analysis done in the Next  Part-2 Notebook </div>


## -> Coding CustomScaler Class with 3 methods (For Scaling, fit, Transform) on Only required dummy variables 

A big, big problem: when we standardize the inputs, we also standardize the dummies.

This is bad practice because when we standardize, we lose the whole interpretability of a dummy.


In [75]:
# import the libraries needed to create the Custom Scaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The selected columns are scaled with a ``StandardScaler``; every other
    column (e.g. dummy variables) is passed through unchanged, so it keeps
    its original values. The column order and the row index of the input
    are preserved in the output.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame that should be standardized.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler`` when ``fit`` is called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Only store the constructor arguments; the underlying scaler is
        # created in ``fit``.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Learn the scaling statistics of ``self.columns`` from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y : ignored
            Passed through to ``StandardScaler.fit``.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler = StandardScaler(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )
        self.scaler.fit(selected, y)
        # Reduce explicitly along the rows. np.mean(df) / np.var(df) forward
        # axis=None to pandas, which warns on older versions and collapses the
        # whole frame to a single scalar on newer ones.
        self.mean_ = selected.mean(axis=0)
        # ddof=0 keeps the population variance that np.var computed.
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y, copy : ignored
            Accepted for signature compatibility; not used.

        Returns
        -------
        pandas.DataFrame
            Same columns, column order and index as ``X``.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected features. Reusing X's index is essential:
        # pd.concat(axis=1) aligns on index labels, so a fresh RangeIndex
        # would misalign rows (and introduce NaNs) whenever X does not
        # carry the default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling is kept as-is
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [76]:
# check what are all columns that we've got
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [77]:
# choose the columns to scale, we later augment this code and put it in comments
# columns_to_scale = ['Month_Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
    
# select the columns to omit instead!
# (these columns are left unscaled; every other column is scaled)
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [78]:
# create the columns to scale, based on the columns to omit
# using list comprehension to iterate over the list

columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [79]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)


In [80]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(unscaled_inputs)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [81]:
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can just call 'scaler' again and transform it in the same way as now
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [82]:
# scaled_inputs is a pandas DataFrame here (not an ndarray): CustomScaler.transform
# returns a DataFrame with the original column order
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,1,0,0,0.030796,-0.800950,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,1,0,0,0.030796,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,1,0,0,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.232900,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,-0.232900,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,0.335149,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,1,0,0,-0.568019,0.335149,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [83]:
# check the shape of the inputs
scaled_inputs.shape

(700, 14)

In [84]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [85]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [87]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(560, 14) (560,)


In [88]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [89]:
# create a logistic regression object
reg = LogisticRegression()

In [90]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

In [91]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7642857142857142

That's not unusual: we have changed how five input features are treated (they are no longer standardized). We've lost a practically insignificant amount of accuracy, but we've gained interpretability. Now that we are all set, we can discuss the coefficients table once again.