In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
pd.pandas.set_option('display.max_columns',None)

In [6]:
df = pd.read_csv('original.csv')

In [7]:
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


# Feature Engineering

In [27]:
data = df.copy()

In [28]:
# Drop Id column
data.drop('ID',axis=1,inplace=True)

In [29]:
# looking at the data , the reasons can be further categorised to the following groups
# group 1: Diseases related (reason 1-14)
# group 2: Pregnancy related (reason 15-17)
# group 3: Poisoning related(reason18-21)
# group 4: Minor medical issues(reason 22-28)

def group(reason):
    
    g_1 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
    g_2 = [15,16,17]
    g_3 =[18,19,20,21]
    g_4 =[22,23,24,25,26,27,28]
    g_5 = [0]
    
    if reason in g_1:
        return 'group 1'
    elif reason in g_2:
        return 'group 2'
    elif reason in g_3:
        return 'group 3'
    elif reason in g_4:
        return 'group 4'
    else:
        return 'group 5'

In [30]:
data['Reason for Absence'] = data['Reason for Absence'].apply(group)

In [31]:
dummies =pd.get_dummies(data['Reason for Absence'])
dummies.drop('group 5',axis=1,inplace=True) # Dropped the column of no reason

In [33]:
# Drop original reason column
data.drop('Reason for Absence',axis=1,inplace=True)

In [35]:
# Convert Date column to get the month and day
data['Date'] =pd.to_datetime(data['Date'],format='%d/%m/%Y')
data['Month'] = data['Date'].apply(lambda x : x.month)
data['Day'] = data['Date'].apply(lambda x: x.weekday())

In [37]:
# Drop orginal date column
data.drop('Date',axis=1,inplace=True)

In [39]:
# Check unique values in education columns 
data['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [40]:
# Since majority of data is in level 1 , group the rest in one group
data['Education'] = data['Education'].map({1:1,2:0,3:0,4:0})

In [42]:
# Define a treshold of number of hours of being absent to be defined as severly absent or not
data['Absenteeism Time in Hours'].median() # use the median absent hours as the treshold

3.0

In [43]:
# Ammend the Absenteeism Time in Hours column accordingly 
data['Absenteeism Time in Hours'] = np.where(data['Absenteeism Time in Hours']<=3,0,1)

In [44]:
data

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day
0,289,36,33,239.554,30,1,2,1,1,7,1
1,118,13,50,239.554,31,1,1,0,0,7,1
2,179,51,38,239.554,31,1,0,0,0,7,2
3,279,5,39,239.554,24,1,2,0,1,7,3
4,289,36,33,239.554,30,1,2,1,0,7,3
...,...,...,...,...,...,...,...,...,...,...,...
695,179,22,40,237.656,22,0,2,0,1,5,2
696,225,26,28,237.656,24,1,1,2,0,5,2
697,330,16,28,237.656,25,0,0,0,1,5,3
698,235,16,32,237.656,25,0,0,0,0,5,3


In [45]:
# Combine the dummies together
data =pd.concat([data,dummies],axis=1)

In [46]:
data

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day,group 1,group 2,group 3,group 4
0,289,36,33,239.554,30,1,2,1,1,7,1,0,0,0,1
1,118,13,50,239.554,31,1,1,0,0,7,1,0,0,0,0
2,179,51,38,239.554,31,1,0,0,0,7,2,0,0,0,1
3,279,5,39,239.554,24,1,2,0,1,7,3,1,0,0,0
4,289,36,33,239.554,30,1,2,1,0,7,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,179,22,40,237.656,22,0,2,0,1,5,2,1,0,0,0
696,225,26,28,237.656,24,1,1,2,0,5,2,1,0,0,0
697,330,16,28,237.656,25,0,0,0,1,5,3,1,0,0,0
698,235,16,32,237.656,25,0,0,0,0,5,3,0,0,0,1


In [118]:
# Create a copy file 
data_final = data.copy()

# Split and scale the data

In [133]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [134]:
X = data_final.drop('Absenteeism Time in Hours',axis=1)
y=data['Absenteeism Time in Hours']

In [135]:
X_train

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month,Day,group 1,group 2,group 3,group 4,group 1.1,group 2.1,group 3.1,group 4.1
0,-0.663158,1.395943,0.219843,0.890554,0.998129,0.440488,-0.905873,-0.567181,1.357572,0.649803,1,0,0,0,1,0,0,0
1,0.029553,-0.271423,-1.334604,-0.758513,-0.645157,0.440488,-0.026690,1.100123,0.784022,-1.356943,0,0,0,0,0,0,0,0
2,0.345791,-0.338118,0.686177,2.700691,-0.879913,0.440488,-0.905873,-0.567181,-0.076303,-1.356943,0,0,1,0,0,0,1,0
3,-0.663158,-0.271423,-1.023714,-0.636185,-1.818933,-2.270208,-0.905873,-0.567181,-0.649852,1.318719,0,0,0,1,0,0,0,1
4,-1.581753,-1.338538,0.064399,-0.071040,0.293863,0.440488,-0.905873,-0.567181,1.357572,-1.356943,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,1.023443,0.062050,0.530733,-0.453735,-0.410402,0.440488,-0.026690,0.266471,1.070797,0.649803,0,0,0,1,0,0,0,1
556,0.029553,-0.271423,-1.334604,-0.254405,-0.645157,0.440488,-0.026690,1.100123,1.644346,0.649803,0,0,0,1,0,0,0,1
557,-0.663158,1.395943,0.219843,0.582242,0.998129,0.440488,-0.905873,-0.567181,0.784022,-1.356943,1,0,0,0,1,0,0,0
558,0.029553,-0.271423,-1.334604,-0.641625,-0.645157,0.440488,-0.026690,1.100123,-0.363077,-0.019112,0,0,0,1,0,0,0,1


In [136]:
X_train.columns.values

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'Day', 'group 1', 'group 2',
       'group 3', 'group 4', 'group 1', 'group 2', 'group 3', 'group 4'],
      dtype=object)

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [138]:
scaler = StandardScaler()

In [139]:
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train.iloc[:,:10]),columns= ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education','Children', 'Pets', 'Month', 'Day'])

In [140]:
X_train= pd.concat([X_train_scaled,X_train.iloc[:,10:].reset_index(drop=True)],axis=1)

In [141]:
X_train

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month,Day,group 1,group 2,group 3,group 4
0,-0.663158,1.395943,0.219843,0.890554,0.998129,0.440488,-0.905873,-0.567181,1.357572,0.649803,1,0,0,0
1,0.029553,-0.271423,-1.334604,-0.758513,-0.645157,0.440488,-0.026690,1.100123,0.784022,-1.356943,0,0,0,0
2,0.345791,-0.338118,0.686177,2.700691,-0.879913,0.440488,-0.905873,-0.567181,-0.076303,-1.356943,0,0,1,0
3,-0.663158,-0.271423,-1.023714,-0.636185,-1.818933,-2.270208,-0.905873,-0.567181,-0.649852,1.318719,0,0,0,1
4,-1.581753,-1.338538,0.064399,-0.071040,0.293863,0.440488,-0.905873,-0.567181,1.357572,-1.356943,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,1.023443,0.062050,0.530733,-0.453735,-0.410402,0.440488,-0.026690,0.266471,1.070797,0.649803,0,0,0,1
556,0.029553,-0.271423,-1.334604,-0.254405,-0.645157,0.440488,-0.026690,1.100123,1.644346,0.649803,0,0,0,1
557,-0.663158,1.395943,0.219843,0.582242,0.998129,0.440488,-0.905873,-0.567181,0.784022,-1.356943,1,0,0,0
558,0.029553,-0.271423,-1.334604,-0.641625,-0.645157,0.440488,-0.026690,1.100123,-0.363077,-0.019112,0,0,0,1


In [142]:
X_test_scaled = pd.DataFrame(scaler.transform(X_test.iloc[:,:10]),columns= ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education','Children', 'Pets', 'Month', 'Day'])
X_test= pd.concat([X_test_scaled,X_test.iloc[:,10:].reset_index(drop=True)],axis=1)

In [151]:
X_test.columns.values

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'Day', 'group 1', 'group 2',
       'group 3', 'group 4'], dtype=object)

# Model

In [144]:
from sklearn.linear_model import LogisticRegression

In [145]:
model =LogisticRegression(penalty='l2')
model.fit(X_train,y_train)
predictions =model.predict(X_test)



# Evaluation

In [146]:
from sklearn.metrics import classification_report,confusion_matrix

In [147]:
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.82      0.81        82
           1       0.74      0.72      0.73        58

    accuracy                           0.78       140
   macro avg       0.77      0.77      0.77       140
weighted avg       0.78      0.78      0.78       140

[[67 15]
 [16 42]]


In [148]:
model.score(X_train,y_train)

0.7696428571428572

In [149]:
model.coef_

array([[ 0.66652509, -0.05626649, -0.26465308, -0.02607705,  0.24324472,
         0.10749674,  0.42409304, -0.318727  ,  0.07407742, -0.15774086,
         2.74159925,  0.66154046,  2.89964295,  0.80164434]])

In [150]:
model.intercept_

array([-1.55974268])

In [173]:
result_table =pd.DataFrame(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'Day', 'group 1', 'group 2',
       'group 3', 'group 4'],columns=['Feature'])

In [174]:
result_table['Coefficients'] = np.transpose(model.coef_)

In [175]:
result_table

Unnamed: 0,Feature,Coefficients
0,Transportation Expense,0.666525
1,Distance to Work,-0.056266
2,Age,-0.264653
3,Daily Work Load Average,-0.026077
4,Body Mass Index,0.243245
5,Education,0.107497
6,Children,0.424093
7,Pets,-0.318727
8,Month,0.074077
9,Day,-0.157741


In [176]:
result_table.index = result_table.index + 1
result_table.loc[0] = ['Intercept',model.intercept_[0]]
result_table = result_table.sort_index()
result_table

Unnamed: 0,Feature,Coefficients
0,Intercept,-1.559743
1,Transportation Expense,0.666525
2,Distance to Work,-0.056266
3,Age,-0.264653
4,Daily Work Load Average,-0.026077
5,Body Mass Index,0.243245
6,Education,0.107497
7,Children,0.424093
8,Pets,-0.318727
9,Month,0.074077


# Calculate Odds Ratio to study actual statistical implication of each feature to the likelihood of excessive absenteeism

In [177]:
result_table['Odds Ratio'] = np.exp(result_table.Coefficients)

In [183]:
result_table = result_table.sort_values('Odds Ratio',ascending=False)


In [184]:
result_table

Unnamed: 0,Feature,Coefficients,Odds Ratio
13,group 3,2.899643,18.167657
11,group 1,2.741599,15.511772
14,group 4,0.801644,2.229203
1,Transportation Expense,0.666525,1.947458
12,group 2,0.66154,1.937775
7,Children,0.424093,1.528204
5,Body Mass Index,0.243245,1.275381
6,Education,0.107497,1.113487
9,Month,0.074077,1.07689
4,Daily Work Load Average,-0.026077,0.97426


# Running the model after removing non-contributing features

In [193]:
# RE-Run the model after removing features with low weights ['Day','Distance to Work','Daily Work Load Average']
reduced_data = data_final.copy()
reduced_data.drop(['Day','Distance to Work','Daily Work Load Average'],axis=1,inplace=True)

In [194]:
X = reduced_data.drop('Absenteeism Time in Hours',axis=1)
y=data['Absenteeism Time in Hours']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [199]:
X_train.columns.values

array(['Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'group 1', 'group 2', 'group 3',
       'group 4'], dtype=object)

In [200]:
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train.iloc[:,:7]),columns= ['Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'])

X_train= pd.concat([X_train_scaled,X_train.iloc[:,7:].reset_index(drop=True)],axis=1)

X_test_scaled = pd.DataFrame(scaler.transform(X_test.iloc[:,:7]),columns= ['Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'])
X_test= pd.concat([X_test_scaled,X_test.iloc[:,7:].reset_index(drop=True)],axis=1)

In [204]:
X_test.columns.values

array(['Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'group 1', 'group 2', 'group 3',
       'group 4'], dtype=object)

In [203]:
model =LogisticRegression(penalty='l2')
model.fit(X_train,y_train)
predictions =model.predict(X_test)

print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        82
           1       0.72      0.72      0.72        58

    accuracy                           0.77       140
   macro avg       0.76      0.76      0.76       140
weighted avg       0.77      0.77      0.77       140

[[66 16]
 [16 42]]




In [206]:
result_table =pd.DataFrame(['Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month', 'group 1', 'group 2', 'group 3',
       'group 4'],columns=['Feature'])

result_table['Coefficients'] = np.transpose(model.coef_)

result_table.index = result_table.index + 1
result_table.loc[0] = ['Intercept',model.intercept_[0]]
result_table = result_table.sort_index()
result_table['Odds Ratio'] = np.exp(result_table.Coefficients)
result_table = result_table.sort_values('Odds Ratio',ascending=False)
result_table

Unnamed: 0,Feature,Coefficients,Odds Ratio
10,group 3,2.930879,18.744107
8,group 1,2.729732,15.328777
11,group 4,0.778384,2.17795
9,group 2,0.688367,1.990462
1,Transportation Expense,0.643081,1.902334
5,Children,0.404565,1.49865
3,Body Mass Index,0.250972,1.285274
4,Education,0.096326,1.101118
7,Month,0.069732,1.072221
2,Age,-0.257689,0.772835
