In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Read the preprocessed absenteeism table and preview its first five rows.
df=pd.read_csv(filepath_or_buffer='Absenteeism_preprocessed.csv')
df.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Pets,Children,Absenteeism Time in Hours,Month Value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,1,2,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,0,2,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,1,2,2,7,3


In [3]:
# Median of the absence hours; the next cell uses it as the cut-off for the binary target.
df['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Binary target: 1 when the absence hours are strictly above the column median,
# 0 otherwise (rows equal to the median are labelled 0).
targets=np.where(
    df['Absenteeism Time in Hours'].gt(df['Absenteeism Time in Hours'].median()),
    1,
    0,
)

In [5]:
# Attach the 0/1 target to the frame as a new last column (this mutates df in place).
df['Excessive absence']=targets

In [6]:
# Preview the frame again to confirm the 'Excessive absence' column was added.
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Pets,Children,Absenteeism Time in Hours,Month Value,Day of the week,Excessive absence
0,0,0,0,1,289,36,33,239.554,30,0,1,2,4,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,0,2,4,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,1,2,2,7,3,0


In [7]:
# Share of observations labelled 1; the mean of a 0/1 array is that fraction.
targets.mean()

0.45571428571428574

In [8]:
# Remove the raw hours column: the target was derived from it, so keeping it
# as an input would leak the answer to the model.
df_with_targets=df.drop(columns=['Absenteeism Time in Hours'])

In [9]:
# Model inputs = every column except the target. The target is dropped by name
# rather than by position so the selection does not silently change if the
# column order of df_with_targets ever changes.
df_input=df_with_targets.drop(columns=['Excessive absence'])

In [10]:
# Preview the input features (neither the raw hours nor the target should appear).
df_input.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Pets,Children,Month Value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,1,2,7,1
1,0,0,0,0,118,13,50,239.554,31,0,0,1,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,0,2,7,3
4,0,0,0,1,289,36,33,239.554,30,0,1,2,7,3


In [11]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler object; it is fitted on df_input in the next cell and pickled at the end of the notebook.
absenteeism_scaler=StandardScaler()

In [12]:
# Learn the scaling statistics from the inputs, then apply them.
# NOTE(review): the scaler is fitted on ALL rows of df_input, before the
# train/test split performed in a later cell, so test rows contribute to the
# scaling statistics. It also scales the Reason_* dummy columns along with the
# numeric ones. Consider fitting on the training rows only.
absenteeism_scaler.fit(df_input)
scaled_inputs=absenteeism_scaler.transform(df_input)

In [13]:
# Sanity check: (number of observations, number of input features).
scaled_inputs.shape

(700, 14)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the observations for testing; the fixed random_state makes
# the shuffled split reproducible.
X_train,X_test,y_train,y_test=train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=365,
)

In [16]:
# Sanity check on the size of the held-out set.
X_test.shape

(140, 14)

In [17]:
# Logistic regression with the library's default hyper-parameters (none are overridden here).
reg_log=LogisticRegression()

In [18]:
# Fit the model on the training portion only.
reg_log.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Accuracy on the training data (the manual check below reproduces this number).
reg_log.score(X_train,y_train)

0.775

# Manually check accuracy

In [20]:
# Predict on the training inputs and count how many predictions match the labels.
model_outputs=reg_log.predict(X_train)
np.sum(model_outputs==y_train)

434

In [21]:
# Accuracy = fraction of matching predictions, i.e. the mean of the boolean match array.
accuracy=(model_outputs==y_train).mean()
accuracy

0.775

# Find intercepts and coeffs

-0.120458684994854

In [24]:
# Names of the input features, in the same column order that was passed to the scaler and the model.
df_input.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Pets',
       'Children', 'Month Value', 'Day of the week'], dtype=object)

In [25]:
# Build the summary table from the actual input columns instead of a hand-typed
# copy of their names, so the rows always line up with reg_log.coef_ even if
# the set or order of input features changes.
summary_table=pd.DataFrame(columns=['Feature name'],data=df_input.columns.values)


In [32]:
# coef_ is transposed so that each feature's coefficient lands on that feature's row.
summary_table['Coefficients']=np.transpose(reg_log.coef_)

In [37]:
# Number the rows from 1 so that label 0 stays free for the intercept row.
# An explicit 1..n index is assigned (rather than `index+1`) so that re-running
# this cell cannot shift the labels a second time.
summary_table.index=pd.RangeIndex(start=1,stop=len(summary_table)+1)

In [39]:
# Discard the current labels and go back to a default 0..n-1 index.
summary_table=summary_table.reset_index(drop=True)

In [41]:
# Number the rows from 1 so that label 0 stays free for the intercept row.
# An explicit 1..n index is assigned (rather than `index+1`) so that re-running
# this cell cannot shift the labels a second time.
summary_table.index=pd.RangeIndex(start=1,stop=len(summary_table)+1)
summary_table

Unnamed: 0,Feature name,Coefficients
1,Reason_1,2.009795
2,Reason_2,0.658922
3,Reason_3,1.490804
4,Reason_4,1.344727
5,Transportation Expense,0.61453
6,Distance to Work,-0.080474
7,Age,-0.293614
8,Daily Work Load Average,-0.017833
9,Body Mass Index,0.379719
10,Education,-0.074457


In [49]:
# Add the intercept as a row labelled 0; intercept_ is indexed with [0] to take its first (scalar) entry.
summary_table.loc[0]=['Intercept',reg_log.intercept_[0]]

In [54]:
# The intercept row was appended at the end; sorting by index label moves label 0 to the top.
summary_table=summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-0.120459
1,Reason_1,2.009795
2,Reason_2,0.658922
3,Reason_3,1.490804
4,Reason_4,1.344727
5,Transportation Expense,0.61453
6,Distance to Work,-0.080474
7,Age,-0.293614
8,Daily Work Load Average,-0.017833
9,Body Mass Index,0.379719


In [55]:
# Odds ratio = exp(coefficient): the multiplicative change in the odds for a
# one-unit increase of the (standardized) feature.
summary_table['Odds_ratio']=np.exp(summary_table['Coefficients'])
summary_table

Unnamed: 0,Feature name,Coefficients,Odds_ratio
0,Intercept,-0.120459,0.886514
1,Reason_1,2.009795,7.461786
2,Reason_2,0.658922,1.932707
3,Reason_3,1.490804,4.440664
4,Reason_4,1.344727,3.837137
5,Transportation Expense,0.61453,1.848787
6,Distance to Work,-0.080474,0.922679
7,Age,-0.293614,0.745564
8,Daily Work Load Average,-0.017833,0.982325
9,Body Mass Index,0.379719,1.461874


In [56]:
# Show the table ordered from the largest odds ratio to the smallest
# (the sorted copy is displayed only; summary_table itself is not modified).
summary_table.sort_values('Odds_ratio',ascending=False)

Unnamed: 0,Feature name,Coefficients,Odds_ratio
1,Reason_1,2.009795,7.461786
3,Reason_3,1.490804,4.440664
4,Reason_4,1.344727,3.837137
2,Reason_2,0.658922,1.932707
5,Transportation Expense,0.61453,1.848787
12,Children,0.458224,1.581263
9,Body Mass Index,0.379719,1.461874
13,Month Value,0.178143,1.194996
8,Daily Work Load Average,-0.017833,0.982325
10,Education,-0.074457,0.928247


# Interpretation of the summary table 

From the summary table above, a feature whose coefficient is close to 0 (equivalently, whose odds ratio is close to 1) contributes little to the model's prediction.

From the above, we can say that Daily Work Load Average (odds ratio ≈ 0.98) and Month Value (odds ratio ≈ 1.19) have very little importance as features in the model.


The above odds_ratio can be read as follows:
* For a person who has given Reason_1 for absenteeism, the odds of being excessively absent are 7.46 times higher (per one standardized unit, because the dummy variables were standardized along with the other inputs).
* Transportation Expense is the most important non-dummy feature. It can be interpreted as follows: for an increase of 1 standardized unit (i.e. 1 standard deviation) in transportation expense, the odds of being excessively absent are multiplied by 1.848.

In [58]:
# Accuracy on the held-out test set.
reg_log.score(X_test,y_test)

0.7357142857142858

In [59]:
# Hard 0/1 class predictions for the test observations.
reg_log.predict(X_test)

array([0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0])

In [61]:
# Per-class predicted probabilities for the test observations (one column per class).
predicted_proba=reg_log.predict_proba(X_test)

In [65]:
# Second column only, i.e. the probability assigned to class 1.
predicted_proba[:,1]

array([0.46438053, 0.39511157, 0.56700857, 0.8916548 , 0.80248154,
       0.66469872, 0.81755811, 0.13561088, 0.42628816, 0.68650096,
       0.12984271, 0.10803309, 0.29294443, 0.5857611 , 0.72225903,
       0.07145104, 0.56572731, 0.56328119, 0.34947204, 0.19025323,
       0.01642609, 0.31596998, 0.6672274 , 0.56133753, 0.15198272,
       0.5371046 , 0.14508071, 0.0401874 , 0.69513018, 0.33606806,
       0.76427902, 0.17263536, 0.61960976, 0.7578985 , 0.38910788,
       0.76561072, 0.62014772, 0.57255767, 0.58745325, 0.68458296,
       0.90728715, 0.71786467, 0.23935055, 0.25399814, 0.21734591,
       0.93883463, 0.60673929, 0.39764186, 0.8041353 , 0.28370969,
       0.21573754, 0.41999356, 0.87910094, 0.18481103, 0.48039355,
       0.70450031, 0.47131167, 0.23443696, 0.20180251, 0.41801105,
       0.30881153, 0.40186272, 0.71796018, 0.54502564, 0.40150301,
       0.60251778, 0.79440405, 0.22312289, 0.26979471, 0.28169201,
       0.71606381, 0.16555251, 0.01556658, 0.53307109, 0.32813

The above array gives the probability, calculated by the logistic regression model, that each test observation belongs to class 1 (excessive absence).
* If the probability is below 0.5, it places 0 as output
* If the probability is above 0.5, it places 1 as output

# Save the model

pickle is a Python module used to convert (serialize) a Python object into a byte stream, which can be written to a file and loaded back later.

In [66]:
import pickle

In [69]:
# Serialize the fitted logistic regression to a file named 'model' in the
# working directory ('wb' = binary write, as pickle produces bytes).
with open('model','wb') as file:
    pickle.dump(reg_log,file)

In [70]:
# Serialize the fitted scaler to a file named 'scaler' in the working directory,
# so that new data can later be transformed with the same scaling statistics.
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)

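The absenteeism scaler must be saved too. The model file stores the fitted weights (coefficients) and bias (intercept), while the scaler file stores the per-feature means and standard deviations that were used to standardize the inputs. New data must be standardized with exactly these values before being passed to the model. With both files saved, we don't need the training data again to make predictions.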