# Relevant libraries

In [1]:
import numpy as np
import pandas as pd

# Load Data

In [2]:
# Load the prepared absenteeism dataset from the working directory and preview the first 10 rows.
df = pd.read_csv("Absenteeism_data_prep.csv")
df.head(10)

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Weekday,Month Value
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,1,7
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,1,7
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,7
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,3,7
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,3,7
5,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,10
6,0,0,0,1,361,52,28,239.554,27,0,1,4,8,4,7
7,0,0,0,1,260,50,36,239.554,23,0,4,0,4,4,7
8,0,0,1,0,155,12,34,239.554,25,0,2,0,40,6,6
9,0,0,0,1,235,11,37,239.554,29,1,1,1,8,0,7


# Create targets
#### Moderately absent: hours <= median (target 0)
#### Excessively absent: hours > median (target 1)

Using the median as the cut-off also balances the two target classes (roughly 50/50).

In [3]:
# Binary target: 1 when the absence duration is strictly above the median, 0 otherwise.
absence_hours = df["Absenteeism Time in Hours"]
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)

In [4]:
# Plain assignment: df_targets is a second name for the same DataFrame object as df (no copy is made here).
df_targets = df
df_targets

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Weekday,Month Value
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,1,7
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,1,7
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,2,7
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,3,7
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,8,2,5
696,1,0,0,0,225,26,28,237.656,24,0,1,2,3,2,5
697,1,0,0,0,330,16,28,237.656,25,1,0,0,8,3,5
698,0,0,0,1,235,16,32,237.656,25,1,0,0,2,3,5


In [5]:
# Remove the column the targets were built from, so it cannot be used as an input.
# The frame is derived from `df` rather than from `df_targets` itself, which keeps
# the cell idempotent: re-running it does not fail on an already-removed column.
df_targets = df.drop(columns=["Absenteeism Time in Hours"])

In [6]:
# Identity check: shows whether df_targets still refers to the very same object as df.
df is df_targets

False

# Select Inputs

In [7]:
# Keep the first 14 columns (by position) as the model inputs and display them.
n_input_columns = 14
inputs = df_targets.iloc[:, :n_input_columns]
inputs

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Weekday,Month Value
0,0,0,0,1,289,36,33,239.554,30,0,2,1,1,7
1,0,0,0,0,118,13,50,239.554,31,0,1,0,1,7
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7
3,1,0,0,0,279,5,39,239.554,24,0,2,0,3,7
4,0,0,0,1,289,36,33,239.554,30,0,2,1,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,2,5
696,1,0,0,0,225,26,28,237.656,24,0,1,2,2,5
697,1,0,0,0,330,16,28,237.656,25,1,0,0,3,5
698,0,0,0,1,235,16,32,237.656,25,1,0,0,3,5


# Split the data

In [8]:
from sklearn.model_selection import train_test_split

### Train, test, validation

#### Reorder columns to separate the dummies before applying StandardScaler

In [9]:
# Current column names of the inputs, listed so the new column order can be written out below.
columns_value_names = inputs.columns.values
columns_value_names

array(['reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Weekday', 'Month Value'], dtype=object)

Dummy (binary) columns, moved to the front so they can be kept out of the scaling step: 'reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4', 'Education'.

In [10]:
# Target column order: the five dummy columns first, then the remaining columns.
new_columns_order = [
    # dummy columns
    'reasonGroup_1',
    'reasonGroup_2',
    'reasonGroup_3',
    'reasonGroup_4',
    'Education',
    # remaining columns
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
    'Weekday',
    'Month Value',
]

In [11]:
# Apply the new column order (all rows, columns selected by label) and display the result.
inputs = inputs.loc[:, new_columns_order]
inputs

Unnamed: 0,reasonGroup_1,reasonGroup_2,reasonGroup_3,reasonGroup_4,Education,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Weekday,Month Value
0,0,0,0,1,0,289,36,33,239.554,30,2,1,1,7
1,0,0,0,0,0,118,13,50,239.554,31,1,0,1,7
2,0,0,0,1,0,179,51,38,239.554,31,0,0,2,7
3,1,0,0,0,0,279,5,39,239.554,24,2,0,3,7
4,0,0,0,1,0,289,36,33,239.554,30,2,1,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,1,179,22,40,237.656,22,2,0,2,5
696,1,0,0,0,0,225,26,28,237.656,24,1,2,2,5
697,1,0,0,0,1,330,16,28,237.656,25,0,0,3,5
698,0,0,0,1,1,235,16,32,237.656,25,0,0,3,5


In [12]:
# First split: hold out 20% of the rows as the test set (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

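The validation set is taken as 12.5% of the remaining 80% of the data: 0.8 × 0.125 = 0.1, i.e. 10% of the full dataset is used for validation.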

In [13]:
# Second split: carve the validation set out of the current training split.
# NOTE: X_train / y_train are both input and output here, so running this cell
# again on its own shrinks the training split a second time.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    shuffle=True,
    random_state=42,
)

In [14]:
# Positional column ranges: the first five columns are the dummies, columns 5-13 get scaled.
dummy_cols = slice(0, 5)
scaled_cols = slice(5, 14)

X_train_dummies, X_train_unscaled = X_train.iloc[:, dummy_cols], X_train.iloc[:, scaled_cols]
X_val_dummies, X_val_unscaled = X_val.iloc[:, dummy_cols], X_val.iloc[:, scaled_cols]
X_test_dummies, X_test_unscaled = X_test.iloc[:, dummy_cols], X_test.iloc[:, scaled_cols]

# Standardize the data

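Below, the non-dummy columns (the `*_unscaled` frames, column positions 5–13) are standardized with a scaler fitted on the training split only, while the binary dummy columns (the `*_dummies` frames, column positions 0–4) are left unchanged and concatenated back in front of the scaled columns afterwards.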

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Scaler instance created with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [17]:
# Fit the scaler on the training split only; validation and test data are not used for fitting.
scaler.fit(X_train_unscaled)

StandardScaler()

In [18]:
# Apply the already-fitted scaler to the train, validation and test columns, in that order.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split)
    for split in (X_train_unscaled, X_val_unscaled, X_test_unscaled)
)

In [19]:
# Peek at the dummy columns as a plain NumPy array.
X_train_dummies.to_numpy()

array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]])

In [20]:
# Assemble the final model matrices: unscaled dummy columns first, scaled columns after.
# The scaled part is recomputed from the fitted scaler instead of reusing X_*_scaled,
# so re-running this cell does not prepend the dummy columns a second time.
X_train_scaled = np.concatenate(
    (X_train_dummies.values, scaler.transform(X_train_unscaled)), axis=1
)
X_val_scaled = np.concatenate(
    (X_val_dummies.values, scaler.transform(X_val_unscaled)), axis=1
)
X_test_scaled = np.concatenate(
    (X_test_dummies.values, scaler.transform(X_test_unscaled)), axis=1
)


In [21]:
# Raise pandas' display limit to 1000 rows for the frames shown in the rest of the notebook.
pd.set_option('display.max_rows', 1000)

In [22]:
# Sanity check: (rows, columns) of the assembled training matrix.
X_train_scaled.shape

(490, 14)

# Logistic regression with sklearn

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [24]:
# Logistic regression classifier with the iteration limit set to 5000.
reg = LogisticRegression(max_iter=5000)

In [25]:
# Shape of the training inputs; its row count should match y_train (checked in the next cell).
X_train_scaled.shape

(490, 14)

In [26]:
# Shape of the training targets.
y_train.shape

(490,)

In [27]:
# Train the classifier on the training inputs and targets.
reg.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=5000)

In [28]:
# Score of the fitted model on the training data; the cells below reproduce this number by hand.
reg.score(X_train_scaled,y_train)

0.763265306122449

### Manually check the accuracy

In [29]:
# Predicted class labels for the training inputs.
outputs = reg.predict(X_train_scaled)

In [30]:
# Number of training samples whose prediction matches the target.
(outputs == y_train).sum()

374

In [31]:
# Total number of predictions made.
len(outputs)

490

In [32]:
# Manual accuracy: the share of predictions equal to the target.
# Taking the mean of the boolean match array divides once by the sample count,
# instead of dividing every element first and then summing many inexact fractions.
np.mean(outputs == y_train)

0.763265306122449

### Intercept and coefficients

In [33]:
# Intercept (bias) term of the fitted model.
reg.intercept_

array([-1.54407723])

In [34]:
# Fitted coefficients, one per input column, in the column order of the training matrix.
reg.coef_

array([[ 2.68648803,  0.69202682,  2.92333396,  0.78762855, -0.32703651,
         0.59150126, -0.07896037, -0.2821895 , -0.0557011 ,  0.18698268,
         0.43385821, -0.2703242 , -0.10496917, -0.00927609]])

In [35]:
# Names of the input columns (in their reordered order), to pair with the coefficients below.
feature_name = inputs.columns.values
feature_name

array(['reasonGroup_1', 'reasonGroup_2', 'reasonGroup_3', 'reasonGroup_4',
       'Education', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets',
       'Weekday', 'Month Value'], dtype=object)

In [36]:
# One row per feature: its name next to the coefficient the model learned for it.
summary_table = pd.DataFrame(
    {
        "Feature name": feature_name,
        "Coefficient": reg.coef_[0],
    }
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,reasonGroup_1,2.686488
1,reasonGroup_2,0.692027
2,reasonGroup_3,2.923334
3,reasonGroup_4,0.787629
4,Education,-0.327037
5,Transportation Expense,0.591501
6,Distance to Work,-0.07896
7,Age,-0.282189
8,Daily Work Load Average,-0.055701
9,Body Mass Index,0.186983


In [37]:
# Put the intercept in row 0, followed by the feature rows (index 1..n).
# The table is rebuilt with pd.concat instead of shifting the index in place, and any
# existing "Intercept" row is filtered out first, so re-running this cell neither
# shifts the index again nor adds a duplicate intercept row.
intercept_row = pd.DataFrame(
    {"Feature name": ["Intercept"], "Coefficient": [reg.intercept_[0]]}
)
feature_rows = summary_table[summary_table["Feature name"] != "Intercept"]
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.544077
1,reasonGroup_1,2.686488
2,reasonGroup_2,0.692027
3,reasonGroup_3,2.923334
4,reasonGroup_4,0.787629
5,Education,-0.327037
6,Transportation Expense,0.591501
7,Distance to Work,-0.07896
8,Age,-0.282189
9,Daily Work Load Average,-0.055701


#### Interpret coefficients and intercept

In [38]:
# Odds ratio = exp(coefficient); show the table from the largest odds ratio to the smallest.
summary_table["Odds_ratio"] = np.exp(summary_table["Coefficient"])
summary_table.sort_values(by="Odds_ratio", ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,reasonGroup_3,2.923334,18.603207
1,reasonGroup_1,2.686488,14.68003
4,reasonGroup_4,0.787629,2.198177
2,reasonGroup_2,0.692027,1.997761
6,Transportation Expense,0.591501,1.806699
11,Children,0.433858,1.5432
10,Body Mass Index,0.186983,1.205606
14,Month Value,-0.009276,0.990767
9,Daily Work Load Average,-0.055701,0.945822
7,Distance to Work,-0.07896,0.924077


A feature is not particularly important if its coefficient is around 0 or, equivalently, if its odds ratio is around 1.
A weight (coefficient) of 0 implies that, no matter the feature value, it is multiplied by 0 in the model and so does not affect the prediction.
For a one-unit increase in a feature, the odds are multiplied by that feature's odds ratio (an odds ratio of 1 means no change).