# Relevant libraries

In [1]:
import numpy as np
import pandas as pd

# Load Data

In [2]:
# Read the prepared absenteeism CSV (path is relative to the notebook's working
# directory) and preview its first 10 rows.
df = pd.read_csv("Absenteeism_data_prep.csv")
df.head(10)

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Weekday,Month Value
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,1,7
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,1,7
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,7
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,3,7
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,3,7
5,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,10
6,0,0,0,1,361,52,28,239.554,27,0,1,4,8,4,7
7,0,0,0,1,260,50,36,239.554,23,0,4,0,4,4,7
8,0,0,1,0,155,12,34,239.554,25,0,2,0,40,6,6
9,0,0,0,1,235,11,37,239.554,29,1,1,1,8,0,7


# Create targets
#### Moderately absent: hours <= median (target 0)
#### Excessively absent: hours > median (target 1)

Using the median as the cutoff also keeps the two classes roughly balanced (about 50/50).

In [3]:
# Binary target: 1 when the absence hours are strictly above the median, else 0.
absence_hours = df["Absenteeism Time in Hours"]
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

In [4]:
# Start the feature frame from an independent copy: a plain `df_targets = df`
# only binds a second name to the same object, so any in-place change made
# through one name would silently show up under the other.
df_targets = df.copy()
df_targets

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Weekday,Month Value
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,1,7
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,1,7
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,7
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,3,7
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,8,2,5
696,1,0,0,0,225,26,28,237.656,24,0,1,2,3,2,5
697,1,0,0,0,330,16,28,237.656,25,1,0,0,8,3,5
698,0,0,0,1,235,16,32,237.656,25,1,0,0,2,3,5


In [5]:
# Remove the column that `targets` was derived from so it cannot be used as a feature.
# The result is built from `df` rather than from `df_targets` itself, which keeps
# the cell re-runnable: dropping from its own output raises KeyError the second time.
df_targets = df.drop(columns=["Absenteeism Time in Hours"])

## Backward Elimination

In [7]:
# Backward elimination: besides the raw hours column, leave out the three
# features that were eliminated.
# NOTE(review): the analysis that selected these three columns is not shown in
# this notebook -- confirm the choice is still valid.
# The frame is rebuilt from `df` so that re-running the cell gives the same
# result instead of failing with KeyError on already-removed columns.
df_targets = df.drop(
    columns=[
        "Absenteeism Time in Hours",
        "Weekday",
        "Daily Work Load Average",
        "Distance to Work",
    ]
)

In [8]:
# Identity check: evaluates to False once `df_targets` refers to a different
# object than the loaded frame `df` (the drops above return new frames).
df is df_targets

False

# Select Inputs

In [9]:
# Every column still present in `df_targets` is a model input (the hours column
# used for the target has already been dropped), so take them all instead of
# slicing up to a hard-coded position that is larger than the frame's width.
inputs = df_targets.copy()
inputs

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,5
696,1,0,0,0,225,28,24,0,1,2,5
697,1,0,0,0,330,28,25,1,0,0,5
698,0,0,0,1,235,32,25,1,0,0,5


# Split the data

In [10]:
from sklearn.model_selection import train_test_split

### Train, test, validation

#### Reorder columns to separate the dummies from the columns that go through StandardScaler

In [11]:
# Current column order of the inputs, as a NumPy array of names.
columns_value_names = inputs.columns.to_numpy()
columns_value_names

array(['reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

Dummies (left unscaled): 'reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4', 'Education'

In [12]:
# Dummy (0/1) columns go first, followed by the columns that will be standardized.
dummy_columns = [
    'reasonGroup_1',
    'reasonGroup_2',
    'reasonGroup_3',
    'reasonGroup_4',
    'Education',
]
numeric_columns = [
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Children',
    'Pets',
    'Month Value',
]
new_columns_order = dummy_columns + numeric_columns

In [13]:
# Apply the dummy-first column order; rows are left untouched.
inputs = inputs.loc[:, new_columns_order]
inputs

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Education,Transportation Expense,Age,Body Mass Index,Children,Pets,Month Value
0,0,0,0,1,0,289,33,30,2,1,7
1,0,0,0,0,0,118,50,31,1,0,7
2,0,0,0,1,0,179,38,31,0,0,7
3,1,0,0,0,0,279,39,24,2,0,7
4,0,0,0,1,0,289,33,30,2,1,7
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,1,179,40,22,2,0,5
696,1,0,0,0,0,225,28,24,1,2,5
697,1,0,0,0,1,330,28,25,0,0,5
698,0,0,0,1,1,235,32,25,0,0,5


In [14]:
# Hold out 20% of the rows as the test set; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

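The validation set is 12.5% of the remaining 80%: 0.8 * 0.125 = 0.1, i.e. 10% of the full data set (70% train / 10% validation / 20% test).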

In [15]:
# Carve the validation set out of the training portion (12.5% of it).
# NOTE(review): this cell rebinds X_train / y_train from their own previous
# values, so it must run exactly once after the train/test split; running it
# again shrinks the training set further.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    shuffle=True,
    random_state=42,
)

In [16]:
# Columns that are 0/1 indicators and therefore bypass the scaler.
DUMMY_COLUMNS = [
    'reasonGroup_1',
    'reasonGroup_2',
    'reasonGroup_3',
    'reasonGroup_4',
    'Education',
]

# Select by column name rather than by position, so the split between dummy and
# to-be-scaled columns does not depend on hard-coded offsets staying in sync
# with the column order. Everything that is not a dummy goes to the scaler.
X_train_dummies = X_train[DUMMY_COLUMNS]
X_train_unscaled = X_train.drop(columns=DUMMY_COLUMNS)

X_val_dummies = X_val[DUMMY_COLUMNS]
X_val_unscaled = X_val.drop(columns=DUMMY_COLUMNS)

X_test_dummies = X_test[DUMMY_COLUMNS]
X_test_unscaled = X_test.drop(columns=DUMMY_COLUMNS)

# Standardize the data

Below, the binary (dummy) columns held in `X_*_dummies` are left untouched, and only the continuous columns held in `X_*_unscaled` are standardized. The scaler is fitted on the training split and then applied to the validation and test splits.

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Scaler used for the non-dummy columns only.
scaler = StandardScaler()

In [21]:
# Fit on the training split only; validation and test data are transformed with
# these training statistics further down.
scaler.fit(X_train_unscaled)

StandardScaler()

In [22]:
# Standardize the continuous columns of every split with the already-fitted scaler.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split_frame)
    for split_frame in (X_train_unscaled, X_val_unscaled, X_test_unscaled)
)

In [23]:
# Inspect the raw dummy block that will be placed in front of the scaled columns.
X_train_dummies.values

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]])

In [24]:
# Build each design matrix as [dummy columns | standardized columns].
# The standardized part is recomputed from the unscaled frames instead of being
# read back from X_*_scaled: overwriting a variable with "itself plus the dummy
# columns" would prepend the dummies again every time the cell is re-run.
X_train_scaled = np.concatenate(
    (X_train_dummies.values, scaler.transform(X_train_unscaled)), axis=1
)
X_val_scaled = np.concatenate(
    (X_val_dummies.values, scaler.transform(X_val_unscaled)), axis=1
)
X_test_scaled = np.concatenate(
    (X_test_dummies.values, scaler.transform(X_test_unscaled)), axis=1
)


In [25]:
# Let pandas display up to 1000 rows before truncating a frame.
pd.options.display.max_rows = 1000

In [26]:
# (rows, columns) of the final training matrix.
X_train_scaled.shape

(490, 11)

# Logistic regression with sklearn

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [28]:
# Logistic regression classifier; max_iter=5000 sets the solver's iteration cap.
reg = LogisticRegression(max_iter=5000)

In [29]:
# Shape check of the feature matrix right before fitting.
X_train_scaled.shape

(490, 11)

In [30]:
# The target vector must have one entry per training row.
y_train.shape

(490,)

In [31]:
# Train on the training split only.
reg.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=5000)

In [32]:
# Accuracy on the training data (re-derived by hand in the next section).
reg.score(X_train_scaled,y_train)

0.7673469387755102

### Manually check the accuracy

In [33]:
# Predicted class for every training row.
outputs = reg.predict(X_train_scaled)

In [34]:
# Number of training rows whose prediction matches the target.
(outputs == y_train).sum()

376

In [35]:
# Total number of predictions made.
len(outputs)

490

In [36]:
# Accuracy = correct predictions / total predictions.
# Count first and divide once; dividing every element by n before summing does
# n divisions and accumulates n rounding errors for the same quantity.
np.sum(outputs == y_train) / outputs.shape[0]

0.7673469387755102

### Intercept and coefficients

In [37]:
# Fitted bias term (array with a single value for this binary problem).
reg.intercept_

array([-1.55052938])

In [38]:
# Fitted weights, one per input column, in the column order of the training matrix.
reg.coef_

array([[ 2.65911504,  0.69837216,  2.91271305,  0.78808818, -0.25925512,
         0.58130658, -0.26594237,  0.18511705,  0.42560432, -0.27419886,
        -0.00724409]])

In [39]:
# Feature names in the same order as the columns of the training matrix.
feature_name = inputs.columns.to_numpy()
feature_name

array(['reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4',
       'Education', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [40]:
# Pair every feature name with its fitted coefficient.
summary_table = pd.DataFrame(
    {
        "Feature name": feature_name,
        "Coefficient": reg.coef_[0],
    }
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,reasonGroup_1,2.659115
1,reasonGroup_2,0.698372
2,reasonGroup_3,2.912713
3,reasonGroup_4,0.788088
4,Education,-0.259255
5,Transportation Expense,0.581307
6,Age,-0.265942
7,Body Mass Index,0.185117
8,Children,0.425604
9,Pets,-0.274199


In [41]:
# Summary with the intercept as row 0, followed by one row per feature.
# The table is rebuilt from the fitted model instead of shifting the existing
# index and inserting a row: that in-place approach adds one more "Intercept"
# row (and shifts the index again) every time the cell is re-run.
summary_table = pd.DataFrame(
    {
        "Feature name": ["Intercept", *feature_name],
        "Coefficient": np.concatenate((reg.intercept_, reg.coef_[0])),
    }
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.550529
1,reasonGroup_1,2.659115
2,reasonGroup_2,0.698372
3,reasonGroup_3,2.912713
4,reasonGroup_4,0.788088
5,Education,-0.259255
6,Transportation Expense,0.581307
7,Age,-0.265942
8,Body Mass Index,0.185117
9,Children,0.425604


#### Interpret coefficients and intercept

In [42]:
# exp(coefficient) is the factor by which the odds change per unit of the feature.
# For the "Intercept" row the value is the baseline odds, not an odds ratio.
summary_table["Odds_ratio"] = np.exp(summary_table["Coefficient"])
summary_table.sort_values(by="Odds_ratio", ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,reasonGroup_3,2.912713,18.406669
1,reasonGroup_1,2.659115,14.283643
4,reasonGroup_4,0.788088,2.199188
2,reasonGroup_2,0.698372,2.010477
6,Transportation Expense,0.581307,1.788374
9,Children,0.425604,1.530515
8,Body Mass Index,0.185117,1.203359
11,Month Value,-0.007244,0.992782
5,Education,-0.259255,0.771626
7,Age,-0.265942,0.766483


A feature is not particularly important if its coefficient is close to 0 or, equivalently, if its odds ratio is close to 1.
A weight (coefficient) of 0 means the feature's value is multiplied by 0 in the model, so it has no effect on the prediction.
For a one-unit increase in a feature, the odds are multiplied by its odds ratio (1 = no change); for the standardized features, one unit is one standard deviation.

# Testing the model

In [43]:
# Accuracy on the held-out test split.
# NOTE(review): the validation split (X_val_scaled / y_val) is not scored in any
# visible cell -- confirm whether it was meant to be used before this step.
reg.score(X_test_scaled,y_test)

0.7714285714285715

In [44]:
# Class probabilities for the test rows; one column per class.
predicted_proba = reg.predict_proba(X_test_scaled)

In [45]:
# Second column = probability of class 1 (hours above the median).
predicted_proba[:,1]

array([0.21257822, 0.15490836, 0.22798256, 0.41349688, 0.38645582,
       0.90738934, 0.27687106, 0.60084331, 0.25360656, 0.22150859,
       0.156043  , 0.28550462, 0.74519024, 0.51597585, 0.31098699,
       0.54724314, 0.11521918, 0.71364464, 0.15462576, 0.38850675,
       0.30816625, 0.22188135, 0.3091384 , 0.28815611, 0.12965552,
       0.80357536, 0.38850675, 0.4042592 , 0.22337687, 0.38902011,
       0.13511982, 0.13336231, 0.60291396, 0.51489671, 0.31052427,
       0.65632692, 0.30821638, 0.13336231, 0.82670826, 0.23699698,
       0.54617239, 0.22225456, 0.59980662, 0.15689841, 0.22487956,
       0.74564483, 0.78997838, 0.82854149, 0.30729589, 0.15661285,
       0.22150859, 0.30683622, 0.38645582, 0.92865295, 0.13286372,
       0.22487956, 0.96873531, 0.31052427, 0.85072319, 0.24787586,
       0.57533696, 0.13361219, 0.42872381, 0.59824992, 0.156043  ,
       0.40321906, 0.64986616, 0.06344136, 0.31145009, 0.49445538,
       0.22412732, 0.22337687, 0.67853456, 0.31006194, 0.13014

# Save the model

In [46]:
import pickle

In [47]:
# Persist the fitted classifier. Pickle files should only ever be loaded from a
# trusted source, since unpickling can execute arbitrary code.
with open("absenteeism_model", "wb") as model_file:
    pickle.dump(reg, model_file)

In [48]:
# Persist the fitted scaler next to the model. It was fitted on the non-dummy
# columns only, so whoever loads it must pass it those same columns in the same order.
with open("absenteeism_scaler", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)