# More Preprocessing

## Import relevant libraries

In [4]:
import pandas as pd
import numpy as np

## Load data

In [6]:
# Read the preprocessed absenteeism data; the relative path is resolved
# against the current working directory
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [7]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,0,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create targets

There will be two types of targets: moderately absent (0) and excessively absent (1).

The median of 'Absenteeism Time in Hours' is used as the cut-off so that the two classes are roughly balanced. This is going to be useful for the model later.

In [9]:
# Take the median value of absenteeism (in hours) as the cut-off between the
# two target classes
median = data_preprocessed['Absenteeism Time in Hours'].median()

In [10]:
# Create targets: 1 where absenteeism is strictly above the median, otherwise 0
above_median = data_preprocessed['Absenteeism Time in Hours'] > median
targets = np.where(above_median, 1, 0)

In [11]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [12]:
# Append the binary targets as a new last column.
# Note: this modifies data_preprocessed in place.
data_preprocessed['Excessive Absenteeism'] = targets

In [13]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,0,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [14]:
# Share of targets equal to 1 (a value near 0.5 means the classes are balanced)
targets.mean()

0.45571428571428574

In [15]:
# Drop the raw output column and the inputs removed by backward elimination
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Daily Work Load Average',
    'Distance to Work',
]
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [16]:
# Confirm that drop() returned a new object instead of handing back
# data_preprocessed itself (expected result: False)
data_with_targets is data_preprocessed

False

## Inputs for Regression

In [18]:
# Inputs are every column except the target. Select by name instead of by
# position so the target cannot slip into the features if the column order
# ever changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,33,30,0,2,1
1,0,0,0,0,7,1,118,50,31,0,1,0
2,0,0,0,1,7,2,179,38,31,0,0,0
3,0,0,0,0,7,3,279,39,24,0,2,0
4,0,0,0,1,7,3,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,5,2,179,40,22,1,2,0
696,0,0,0,0,5,2,225,28,24,0,1,2
697,0,0,0,0,5,3,330,28,25,1,0,0
698,0,0,0,1,5,3,235,32,25,1,0,0


In [19]:
# Create scaler object
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns; pass the rest through.

    Wraps a ``StandardScaler`` that is fitted on ``columns`` only. Every other
    column of the input (for example dummy variables) is returned unchanged.
    The transformed frame keeps the input's column order and row index.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]``.

        ``y`` is accepted and ignored so that the transformer can be called as
        ``fit(X, y)``, which is how ``fit_transform(X, y)`` and pipelines
        invoke it.

        Returns
        -------
        self
        """
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` replaced by scaled values.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in ``self.columns``.

        Returns
        -------
        pandas.DataFrame
            Same columns, column order and index as ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled values. Without it the scaled frame
        # gets a fresh 0..n-1 index, and concat(axis=1) aligns on index labels,
        # so any X with a non-default index would come back with misaligned
        # rows and NaNs.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the original column order after stitching the parts together
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [20]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [21]:
# Keep every input column that is not in the omit list
columns_to_scale = [col for col in unscaled_inputs.columns if col not in columns_to_omit]

In [22]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [23]:
# Store mean and sd from unscaled_inputs
# NOTE(review): this fits on all rows of unscaled_inputs, i.e. before the
# train/test split further down, so rows that later land in the test set also
# contribute to the stored statistics. Consider fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

In [24]:
# Scale the inputs
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [25]:
# Check scaled_inputs
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,0,0,0,0,0.182726,0.668253,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,-0.388293,-0.007725,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,0,0,0,0,-0.388293,-0.007725,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,0,0,0,0,-0.388293,0.668253,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [26]:
# Check shape
scaled_inputs.shape

(700, 12)

# Split the Data

## Import libraries

In [29]:
from sklearn.model_selection import train_test_split

## Split Data

Split the data into a training set and a test set. Common ratios are 90/10 or 80/20; an 80/20 split with shuffling is used here.

In [31]:
# 80/20 train/test split; the fixed random_state makes the shuffle repeatable
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [32]:
print(x_train.shape, y_train.shape)

(560, 12) (560,)


In [33]:
print(x_test.shape, y_test.shape)

(140, 12) (140,)


# Create the Model

## Import modules

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model

In [38]:
# Store object
reg = LogisticRegression()

In [39]:
# Fit regression with x and y training parameters
reg.fit(x_train,y_train)

In [40]:
# Check accuracy on the training data (share of correctly classified rows)
reg.score(x_train,y_train)

0.7178571428571429

## Manual Accuracy Calculations

In [42]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,

In [43]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [44]:
# Check outputs
model_outputs == y_train

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True, False, False, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True, False, False,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [45]:
# Count the predictions that match the true training targets
true_entries = (model_outputs == y_train).sum()

In [46]:
# Manual accuracy: correct predictions divided by the number of predictions
true_entries / len(model_outputs)

0.7178571428571429

## Coefficients and Intercepts

In [48]:
# Find intercept
reg.intercept_

array([0.37717509])

In [49]:
# Find coefficients/weights
reg.coef_

array([[ 1.8535543 , -0.18616809,  1.30453973, -1.12580013,  0.11347887,
        -0.09568826,  0.52560518, -0.19668211,  0.23860728, -0.25555599,
         0.30175637, -0.29223958]])

In [50]:
# Get names 
feature_name = unscaled_inputs.columns.values

In [51]:
# Summary table: one row per input feature
summary_table = pd.DataFrame({'Feature name': feature_name})
# reg.coef_ has shape (1, n_features); flatten it into a single column of coefficients
summary_table['Coefficient'] = reg.coef_.ravel()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.853554
1,Reason_2,-0.186168
2,Reason_3,1.30454
3,Reason_4,-1.1258
4,Month Value,0.113479
5,Day of the Week,-0.095688
6,Transportation Expense,0.525605
7,Age,-0.196682
8,Body Mass Index,0.238607
9,Education,-0.255556


In [52]:
# Add the intercept/bias as row 0, pushing the features down to rows 1..n.
# Guarded so that re-running this cell does not shift the index again or add a
# second 'Intercept' row.
if 'Intercept' not in summary_table['Feature name'].values:
    intercept_row = pd.DataFrame(
        {'Feature name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
    )
    # ignore_index renumbers the rows 0..n with the intercept first
    summary_table = pd.concat([intercept_row, summary_table], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,0.377175
1,Reason_1,1.853554
2,Reason_2,-0.186168
3,Reason_3,1.30454
4,Reason_4,-1.1258
5,Month Value,0.113479
6,Day of the Week,-0.095688
7,Transportation Expense,0.525605
8,Age,-0.196682
9,Body Mass Index,0.238607


## Coefficient Interpretation

In [54]:
# Odds ratio = e raised to the coefficient
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)

In [55]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,0.377175,1.45816
1,Reason_1,1.853554,6.382464
2,Reason_2,-0.186168,0.830134
3,Reason_3,1.30454,3.685992
4,Reason_4,-1.1258,0.324393
5,Month Value,0.113479,1.120168
6,Day of the Week,-0.095688,0.908747
7,Transportation Expense,0.525605,1.691482
8,Age,-0.196682,0.821452
9,Body Mass Index,0.238607,1.26948


In [56]:
# Sort dataframe - if odds ratio is around 1 and coefficient is around 0, less important
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,1.853554,6.382464
3,Reason_3,1.30454,3.685992
7,Transportation Expense,0.525605,1.691482
0,Intercept,0.377175,1.45816
11,Children,0.301756,1.352232
9,Body Mass Index,0.238607,1.26948
5,Month Value,0.113479,1.120168
6,Day of the Week,-0.095688,0.908747
2,Reason_2,-0.186168,0.830134
8,Age,-0.196682,0.821452


# Test the Model

In [102]:
# Accuracy on the held-out test set
reg.score(x_test,y_test)

0.6928571428571428

In [106]:
# Probability of 0 or 1
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.72816549, 0.27183451],
       [0.63169485, 0.36830515],
       [0.44332996, 0.55667004],
       [0.77635148, 0.22364852],
       [0.03034254, 0.96965746],
       [0.46505627, 0.53494373],
       [0.4965402 , 0.5034598 ],
       [0.12287105, 0.87712895],
       [0.76748536, 0.23251464],
       [0.72821122, 0.27178878],
       [0.51502145, 0.48497855],
       [0.22314801, 0.77685199],
       [0.07179438, 0.92820562],
       [0.70221078, 0.29778922],
       [0.28116826, 0.71883174],
       [0.57399425, 0.42600575],
       [0.55529803, 0.44470197],
       [0.59477742, 0.40522258],
       [0.59048754, 0.40951246],
       [0.04747093, 0.95252907],
       [0.69114753, 0.30885247],
       [0.7649202 , 0.2350798 ],
       [0.55531029, 0.44468971],
       [0.5869853 , 0.4130147 ],
       [0.2086022 , 0.7913978 ],
       [0.73733628, 0.26266372],
       [0.53438565, 0.46561435],
       [0.84970713, 0.15029287],
       [0.16259376, 0.83740624],
       [0.75309046, 0.24690954],
       [0.

In [108]:
# Keep only the second column: the predicted probability of class 1
predicted_proba[:,1]

array([0.27183451, 0.36830515, 0.55667004, 0.22364852, 0.96965746,
       0.53494373, 0.5034598 , 0.87712895, 0.23251464, 0.27178878,
       0.48497855, 0.77685199, 0.92820562, 0.29778922, 0.71883174,
       0.42600575, 0.44470197, 0.40522258, 0.40951246, 0.95252907,
       0.30885247, 0.2350798 , 0.44468971, 0.4130147 , 0.7913978 ,
       0.26266372, 0.46561435, 0.15029287, 0.83740624, 0.24690954,
       0.36825139, 0.53140832, 0.53497247, 0.50117369, 0.2350798 ,
       0.49648433, 0.23834638, 0.81067468, 0.41691395, 0.55633608,
       0.26537888, 0.30980794, 0.23836736, 0.80483446, 0.91200949,
       0.68103049, 0.53742384, 0.28482512, 0.22189969, 0.21259926,
       0.42183178, 0.77703637, 0.5188211 , 0.29604885, 0.69952524,
       0.44621804, 0.90382993, 0.29341463, 0.74924888, 0.74313489,
       0.56016388, 0.51072851, 0.32811015, 0.82636075, 0.21858056,
       0.28480159, 0.44296198, 0.26261898, 0.78668805, 0.3255788 ,
       0.23834638, 0.72513581, 0.78937707, 0.39360329, 0.59251

# Export Model

In [None]:
# Use pickling to serialize the fitted model into a byte stream
import pickle

# 'model' is the file name, 'wb' opens it for writing bytes,
# and dump() writes the given object into that file
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [117]:
# Export the fitted absenteeism scaler to its own file, separate from the model.
# Pickle stores the class by reference, so code that loads this file needs a
# CustomScaler definition available.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)