In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [98]:
raw_data = pd.read_csv('Absenteeism_data.csv')
raw_data.head(10)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,3,23,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,10,22,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,20,23,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,14,19,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,1,22,13/07/2015,235,11,37,239.554,29,3,1,1,8


In [99]:
df = raw_data.copy()
df.describe()

Unnamed: 0,ID,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,17.951429,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,36.0,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


In [100]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Preprocessing

### Drop 'ID'

In [101]:
df = df.drop(['ID'], axis=1)
df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Dummies

In [102]:
reason = pd.get_dummies(df['Reason for Absence'], drop_first=True)

### Group the Reason for Absence

In [103]:
df = df.drop(['Reason for Absence'], axis=1)
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [104]:
reason_type_1 = reason.loc[:, 1:14].max(axis=1)
reason_type_2 = reason.loc[:, 15:17].max(axis=1)
reason_type_3 = reason.loc[:, 18:21].max(axis=1)
reason_type_4 = reason.loc[:, 22:].max(axis=1)

In [105]:
df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)

In [106]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [107]:
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [108]:
col = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Diseases Related Reason', 'Pregnancy Related Reason', 'Injury Related Reason', 'Minor Reason']

In [109]:
df.columns = col

In [110]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


### Reorder Columns

In [111]:
reorder = ['Diseases Related Reason', 'Pregnancy Related Reason', 'Injury Related Reason', 'Minor Reason', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [112]:
df = df[reorder]

In [113]:
df.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Checkpoint

In [114]:
df_mod = df.copy()
df_mod.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


### 'Date' Column

In [115]:
df_mod['Date'] = pd.to_datetime(df_mod['Date'], format = '%d/%m/%Y')

In [116]:
df_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Diseases Related Reason    700 non-null    uint8         
 1   Pregnancy Related Reason   700 non-null    uint8         
 2   Injury Related Reason      700 non-null    uint8         
 3   Minor Reason               700 non-null    uint8         
 4   Date                       700 non-null    datetime64[ns]
 5   Transportation Expense     700 non-null    int64         
 6   Distance to Work           700 non-null    int64         
 7   Age                        700 non-null    int64         
 8   Daily Work Load Average    700 non-null    float64       
 9   Body Mass Index            700 non-null    int64         
 10  Education                  700 non-null    int64         
 11  Children                   700 non-null    int64         
 12  Pets    

### Extract Month Value

In [117]:
df_mod['Date'][0]

Timestamp('2015-07-07 00:00:00')

In [118]:
df_mod['Date'][0].month

7

In [119]:
list_month = []

for i in range(df_mod.shape[0]):
    list_month.append(df_mod['Date'][i].month)

In [120]:
df_mod['Month Value'] =  list_month

In [121]:
df_mod.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7


### Extract Day of the Week

In [122]:
df_mod['Date'][0]

Timestamp('2015-07-07 00:00:00')

In [123]:
df_mod['Date'][0].weekday()

1

In [124]:
def to_weekday(value):
    return value.weekday()

In [125]:
df_mod['Day of the Week'] = df_mod['Date'].apply(to_weekday)

In [126]:
df_mod = df_mod.drop(['Date'], axis=1)

In [127]:
df_mod.columns

Index(['Diseases Related Reason', 'Pregnancy Related Reason',
       'Injury Related Reason', 'Minor Reason', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours',
       'Month Value', 'Day of the Week'],
      dtype='object')

In [128]:
column = ['Diseases Related Reason', 'Pregnancy Related Reason',
       'Injury Related Reason', 'Minor Reason', 'Month Value', 'Day of the Week','Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']

In [129]:
df_mod = df_mod[column]
df_mod.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


### map() 'Education'

In [130]:
df_mod['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [131]:
df_mod['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [132]:
df_mod['Education'] = df_mod['Education'].map({1:0, 2:1, 3:1, 4:1})

In [133]:
df_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

### Final Checkpoint

In [134]:
df_final = df_mod.copy()
df_final.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Logistic Regression

### Targets

In [135]:
df_final['Absenteeism Time in Hours'].median()

3.0

In [136]:
targets = np.where(df_final['Absenteeism Time in Hours'] > 
                   df_final['Absenteeism Time in Hours'].median(), 1, 0)

In [137]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [138]:
df_final['Excessive Absenteeism'] = targets

In [139]:
df_final.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [140]:
targets.sum() / targets.shape[0]
# 45-55 split of target variable are enough for logistic regression

0.45571428571428574

In [141]:
df_targets = df_final.drop(['Absenteeism Time in Hours'], axis=1)

In [142]:
df_targets.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


### Inputs

In [143]:
inputs =  df_targets.iloc[:, :-1]

In [144]:
inputs.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


In [145]:
input_dummi = inputs.iloc[:, :4]
input_dummi.head()

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason
0,0,0,0,1
1,0,0,0,0
2,0,0,0,1
3,1,0,0,0
4,0,0,0,1


In [146]:
input_no_dummi = inputs.iloc[:, 4:]
input_no_dummi.head()

Unnamed: 0,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,7,1,289,36,33,239.554,30,0,2,1
1,7,1,118,13,50,239.554,31,0,1,0
2,7,2,179,51,38,239.554,31,0,0,0
3,7,3,279,5,39,239.554,24,0,2,0
4,7,3,289,36,33,239.554,30,0,2,1


### Standardize

In [147]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(input_no_dummi)

StandardScaler()

In [148]:
scaled_inputs = scaler.transform(input_no_dummi)

In [149]:
scaled_inputs

array([[ 0.18272635, -0.68370352,  1.00584437, ..., -0.44798003,
         0.88046927,  0.26848661],
       [ 0.18272635, -0.68370352, -1.57468098, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [ 0.18272635, -0.00772546, -0.6541427 , ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [-0.3882935 ,  0.66825259,  1.62456682, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  0.19094163, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  1.03602595, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [151]:
scaled_input = pd.DataFrame(scaled_inputs)

In [152]:
scaled = pd.concat([input_dummi, scaled_input], axis=1)

In [153]:
scaled

Unnamed: 0,Diseases Related Reason,Pregnancy Related Reason,Injury Related Reason,Minor Reason,0,1,2,3,4,5,6,7,8,9
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.44798,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.44798,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.44798,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,-0.44798,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.44798,0.880469,0.268487
5,0,0,0,1,0.182726,1.344231,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.44798,-0.91903,-0.58969
6,0,0,0,1,0.182726,1.344231,2.092381,1.494345,-1.320435,-0.806331,0.061825,-0.44798,-0.01928,2.843016
7,0,0,0,1,0.182726,1.344231,0.568211,1.359154,-0.065439,-0.806331,-0.878984,-0.44798,2.679969,-0.58969
8,0,0,1,0,0.182726,-1.359682,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,-0.44798,0.880469,-0.58969
9,0,0,0,1,0.182726,-1.359682,0.190942,-1.277074,0.091435,-0.806331,0.532229,2.232242,-0.01928,0.268487


### Train Test Split

In [154]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(scaled, targets, test_size=0.20, random_state=42)

### Logistic Regression

In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
reg = LogisticRegression()

### Training the Model

In [156]:
reg.fit(x_train, y_train)

LogisticRegression()

In [157]:
reg.score(x_train, y_train)

0.7660714285714286

### Manually check the Accuracy

In [158]:
model_outputs = reg.predict(x_train)

In [159]:
model_outputs == y_train

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True, False, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [160]:
np.sum((model_outputs == y_train))

429

In [161]:
model_outputs.shape[0]

560

In [162]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7660714285714286

### Intercept and Coefficients

In [163]:
reg.intercept_

array([-1.7575216])

In [164]:
reg.coef_

array([[ 2.92993196,  0.73453473,  3.07517417,  0.99389691,  0.07923215,
        -0.15754023,  0.67570676, -0.05915557, -0.25914578, -0.02183528,
         0.24334102, -0.10857391,  0.41638209, -0.31223952]])

In [165]:
feature_name = inputs.columns.values

In [166]:
summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Diseases Related Reason,2.929932
1,Pregnancy Related Reason,0.734535
2,Injury Related Reason,3.075174
3,Minor Reason,0.993897
4,Month Value,0.079232
5,Day of the Week,-0.15754
6,Transportation Expense,0.675707
7,Distance to Work,-0.059156
8,Age,-0.259146
9,Daily Work Load Average,-0.021835


In [167]:
summary_table.index = summary_table.index +1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.757522
1,Diseases Related Reason,2.929932
2,Pregnancy Related Reason,0.734535
3,Injury Related Reason,3.075174
4,Minor Reason,0.993897
5,Month Value,0.079232
6,Day of the Week,-0.15754
7,Transportation Expense,0.675707
8,Distance to Work,-0.059156
9,Age,-0.259146


### Interpreting the Coefficients

In [168]:
summary_table['Odds ratio'] = np.exp(summary_table.Coefficient)

In [169]:
summary_table.sort_values('Odds ratio', ascending = False)

Unnamed: 0,Feature Name,Coefficient,Odds ratio
3,Injury Related Reason,3.075174,21.653653
1,Diseases Related Reason,2.929932,18.726356
4,Minor Reason,0.993897,2.701742
2,Pregnancy Related Reason,0.734535,2.084512
7,Transportation Expense,0.675707,1.965422
13,Children,0.416382,1.516465
11,Body Mass Index,0.243341,1.275504
5,Month Value,0.079232,1.082456
10,Daily Work Load Average,-0.021835,0.978401
8,Distance to Work,-0.059156,0.94256


### Testing the Model

In [170]:
reg.score(x_test, y_test)

0.7714285714285715

In [171]:
predicted_probability = reg.predict_proba(x_test)
predicted_probability

array([[0.82479129, 0.17520871],
       [0.87647778, 0.12352222],
       [0.77483419, 0.22516581],
       [0.58309947, 0.41690053],
       [0.51095095, 0.48904905],
       [0.08342333, 0.91657667],
       [0.66267711, 0.33732289],
       [0.3461606 , 0.6538394 ],
       [0.75267385, 0.24732615],
       [0.71080784, 0.28919216],
       [0.8873699 , 0.1126301 ],
       [0.68878666, 0.31121334],
       [0.25475123, 0.74524877],
       [0.51186407, 0.48813593],
       [0.74009104, 0.25990896],
       [0.45183015, 0.54816985],
       [0.92160616, 0.07839384],
       [0.25305153, 0.74694847],
       [0.88831271, 0.11168729],
       [0.59781606, 0.40218394],
       [0.67248635, 0.32751365],
       [0.78027914, 0.21972086],
       [0.67318629, 0.32681371],
       [0.66823202, 0.33176798],
       [0.87004581, 0.12995419],
       [0.18751338, 0.81248662],
       [0.60454451, 0.39545549],
       [0.56493446, 0.43506554],
       [0.79462288, 0.20537712],
       [0.62889024, 0.37110976],
       [0.

In [172]:
predicted_probability[:, 1]

array([0.17520871, 0.12352222, 0.22516581, 0.41690053, 0.48904905,
       0.91657667, 0.33732289, 0.6538394 , 0.24732615, 0.28919216,
       0.1126301 , 0.31121334, 0.74524877, 0.48813593, 0.25990896,
       0.54816985, 0.07839384, 0.74694847, 0.11168729, 0.40218394,
       0.32751365, 0.21972086, 0.32681371, 0.33176798, 0.12995419,
       0.81248662, 0.39545549, 0.43506554, 0.20537712, 0.37110976,
       0.10948246, 0.14912889, 0.57495084, 0.55143848, 0.23623228,
       0.70078502, 0.28178608, 0.15543437, 0.88706285, 0.17579459,
       0.55534605, 0.23744595, 0.54101124, 0.14896319, 0.19552432,
       0.75074335, 0.80366594, 0.90000061, 0.34375748, 0.13935571,
       0.22814911, 0.31879016, 0.4382348 , 0.94654415, 0.16119568,
       0.25067327, 0.97534083, 0.27678597, 0.87930574, 0.24399158,
       0.54749599, 0.13830599, 0.54078931, 0.65526552, 0.1126301 ,
       0.41648288, 0.66690325, 0.06577497, 0.2931184 , 0.48693582,
       0.28919216, 0.2267064 , 0.73718365, 0.31879016, 0.11730

In [173]:
predict = reg.predict(x_test)

In [174]:
summary = pd.DataFrame((predicted_probability[:, 1]), columns = ['Probability'])
summary['Prediction'] = predict
summary['Target'] = y_test

In [175]:
summary.sort_values(by= ['Probability'], ascending= False)

Unnamed: 0,Probability,Prediction,Target
56,0.975341,1,1
135,0.948692,1,1
53,0.946544,1,1
97,0.945666,1,1
5,0.916577,1,1
80,0.916577,1,1
117,0.912289,1,1
81,0.904462,1,1
47,0.900001,1,1
87,0.890441,1,1


## Overview of Test Results

In [176]:
overview = pd.DataFrame((predicted_probability[:, 1]), columns = ['Probability of being Absent'])
overview['Predicted Absent more than 3 hours'] = predict
overview['Observed Absent more than 3 hours'] = y_test


In [177]:
overview.sort_values(by= ['Probability of being Absent'], ascending= False)

Unnamed: 0,Probability of being Absent,Predicted Absent more than 3 hours,Observed Absent more than 3 hours
56,0.975341,1,1
135,0.948692,1,1
53,0.946544,1,1
97,0.945666,1,1
5,0.916577,1,1
80,0.916577,1,1
117,0.912289,1,1
81,0.904462,1,1
47,0.900001,1,1
87,0.890441,1,1
