# Load Data

In [5]:
import pandas as pd
import numpy as np

In [6]:
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

In [7]:
data_preprocessed.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8


# Create the targets

The approach here is to convert the target variable into two classes. One who is excessively absent and other moderately absent. Everything below the median of absenteeism in hours is taken as normal and above is considered excessive.

In [9]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [12]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median() , 1, 0)

In [13]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [14]:
data_preprocessed['Excessive Absenteeism'] = targets

In [15]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


# A comment on targets

In [16]:
targets.sum()/targets.shape[0]

0.45571428571428574

Since the ratio of targets being 1 and 0 has 46-54 split. It is recommended to proceed with logistic regression. 

In [18]:
target_data = data_preprocessed['Absenteeism Time in Hours']

In [19]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

In [20]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [21]:
data_with_targets is data_preprocessed

False

# ML 

In [23]:
data_with_targets.shape

(700, 15)

In [27]:
data_with_targets.iloc[:, 0:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [28]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [29]:
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


# Standardise the Data

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
absenteeism_scaler = StandardScaler()

In [33]:
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler()

In [34]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [35]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [36]:
scaled_inputs.shape

(700, 14)

# Split the data into train test and shuffle

In [37]:
from sklearn.model_selection import train_test_split

In [44]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state = 20)

In [45]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [46]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


# Logistic Regression with sklearn

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training the model

In [53]:
reg = LogisticRegression()

In [54]:
reg.fit(x_train, y_train)

LogisticRegression()

In [55]:
reg.score(x_train, y_train)

0.7839285714285714

### Manually Check the Accuracy

In [57]:
model_outputs = reg.predict(x_train)

In [59]:
model_outputs, targets

(array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 

In [60]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [61]:
np.sum(model_outputs==y_train)

439

In [63]:
model_outputs.shape[0]

560

In [64]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.7839285714285714

It is same as the metric obtained using the function. 

### Finding the intercepts and coefficients

In [65]:
reg.coef_

array([[ 2.07601767,  0.33504757,  1.56162303,  1.32927434,  0.18793677,
        -0.07062253,  0.70639316, -0.03986811, -0.20089491, -0.00456366,
         0.31933564, -0.135508  ,  0.38172443, -0.3332426 ]])

In [67]:
reg.intercept_

array([-0.22206736])

In [68]:
feature_name = unscaled_inputs.columns.values

In [70]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

In [72]:
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [73]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.076018
1,Reason_2,0.335048
2,Reason_3,1.561623
3,Reason_4,1.329274
4,Month Value,0.187937
5,Day of the week,-0.070623
6,Transportation Expense,0.706393
7,Distance to Work,-0.039868
8,Age,-0.200895
9,Daily Work Load Average,-0.004564


In [74]:
summary_table.index = summary_table.index + 1

In [75]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.222067
1,Reason_1,2.076018
2,Reason_2,0.335048
3,Reason_3,1.561623
4,Reason_4,1.329274
5,Month Value,0.187937
6,Day of the week,-0.070623
7,Transportation Expense,0.706393
8,Distance to Work,-0.039868
9,Age,-0.200895


# Interpreting the Coefficients

In [76]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [77]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.222067,0.800861
1,Reason_1,2.076018,7.972656
2,Reason_2,0.335048,1.398007
3,Reason_3,1.561623,4.766551
4,Reason_4,1.329274,3.778301
5,Month Value,0.187937,1.206757
6,Day of the week,-0.070623,0.931814
7,Transportation Expense,0.706393,2.026668
8,Distance to Work,-0.039868,0.960916
9,Age,-0.200895,0.817998


In [79]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.076018,7.972656
3,Reason_3,1.561623,4.766551
4,Reason_4,1.329274,3.778301
7,Transportation Expense,0.706393,2.026668
13,Children,0.381724,1.464808
2,Reason_2,0.335048,1.398007
11,Body Mass Index,0.319336,1.376213
5,Month Value,0.187937,1.206757
10,Daily Work Load Average,-0.004564,0.995447
8,Distance to Work,-0.039868,0.960916


A weight coefficient of 0 implies that no matter the feature value it is always 0. So, this type of columns might not be useful in the model. From the table Daily work load average, distance to work, day of the week seem to be not useful.

Standardaisation shouldn't be implemented to dummies. Scaling the data without standardising the dummies.

In [85]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [86]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [87]:
columns_to_scale = ['Month Value', 'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']

In [88]:
absenteeism_scaler = CustomScaler(columns_to_scale)



In [89]:
absenteeism_scaler.fit(unscaled_inputs)

AttributeError: 'CustomScaler' object has no attribute 'copy'

AttributeError: 'CustomScaler' object has no attribute 'copy'

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)