In [48]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [25]:
data = pd.read_csv("data_preprocessed.csv")

In [26]:
data.head(10)

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week
0,2015-07-07,289,36,33,239.554,30,0,2,1,4,False,False,False,True,7,1
1,2015-07-14,118,13,50,239.554,31,0,1,0,0,False,False,False,False,7,1
2,2015-07-15,179,51,38,239.554,31,0,0,0,2,False,False,False,True,7,2
3,2015-07-16,279,5,39,239.554,24,0,2,0,4,True,False,False,False,7,3
4,2015-07-23,289,36,33,239.554,30,0,2,1,2,False,False,False,True,7,3
5,2015-07-10,179,51,38,239.554,31,0,0,0,2,False,False,False,True,7,4
6,2015-07-17,361,52,28,239.554,27,0,1,4,8,False,False,False,True,7,4
7,2015-07-24,260,50,36,239.554,23,0,4,0,4,False,False,False,True,7,4
8,2015-07-06,155,12,34,239.554,25,0,2,0,40,False,False,True,False,7,0
9,2015-07-13,235,11,37,239.554,29,1,1,1,8,False,False,False,True,7,0


In [27]:
data["Absenteeism Time in Hours"].median()

3.0

In [28]:
targets = np.where(data['Absenteeism Time in Hours']<data["Absenteeism Time in Hours"].median(),1,0)

In [30]:
data["Excessive Absenteeism"] = targets

In [31]:
data.head(10)

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Excessive Absenteeism
0,2015-07-07,289,36,33,239.554,30,0,2,1,4,False,False,False,True,7,1,0
1,2015-07-14,118,13,50,239.554,31,0,1,0,0,False,False,False,False,7,1,1
2,2015-07-15,179,51,38,239.554,31,0,0,0,2,False,False,False,True,7,2,1
3,2015-07-16,279,5,39,239.554,24,0,2,0,4,True,False,False,False,7,3,0
4,2015-07-23,289,36,33,239.554,30,0,2,1,2,False,False,False,True,7,3,1
5,2015-07-10,179,51,38,239.554,31,0,0,0,2,False,False,False,True,7,4,1
6,2015-07-17,361,52,28,239.554,27,0,1,4,8,False,False,False,True,7,4,0
7,2015-07-24,260,50,36,239.554,23,0,4,0,4,False,False,False,True,7,4,0
8,2015-07-06,155,12,34,239.554,25,0,2,0,40,False,False,True,False,7,0,0
9,2015-07-13,235,11,37,239.554,29,1,1,1,8,False,False,False,True,7,0,0


In [32]:
targets.sum() / targets.shape[0]

np.float64(0.39285714285714285)

In [33]:
data_with_targets = data.drop(['Absenteeism Time in Hours', 'Date'], axis=1)

In [34]:
data_with_targets

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Reason_1,Reason_2,Reason_3,Reason_4,Month Values,Day of the Week,Excessive Absenteeism
0,289,36,33,239.554,30,0,2,1,False,False,False,True,7,1,0
1,118,13,50,239.554,31,0,1,0,False,False,False,False,7,1,1
2,179,51,38,239.554,31,0,0,0,False,False,False,True,7,2,1
3,279,5,39,239.554,24,0,2,0,True,False,False,False,7,3,0
4,289,36,33,239.554,30,0,2,1,False,False,False,True,7,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,179,22,40,237.656,22,1,2,0,True,False,False,False,5,2,0
696,225,26,28,237.656,24,0,1,2,True,False,False,False,5,2,0
697,330,16,28,237.656,25,1,0,0,True,False,False,False,5,3,0
698,235,16,32,237.656,25,1,0,0,False,False,False,True,5,3,1


#### Select Inputs

In [35]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [36]:
scalar = sklearn.preprocessing.StandardScaler()

In [37]:
scalar.fit(unscaled_inputs)

0,1,2
,copy,True
,with_mean,True
,with_std,True


#### Standardize Inputs

In [40]:
scaled_inputs = scalar.transform(unscaled_inputs)

#### Make into dataset

In [43]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(scaled_inputs, targets, test_size=0.2, random_state=42)

In [45]:
X_train.shape, y_train.shape

((560, 14), (560,))

In [46]:
X_test.shape, y_test.shape

((140, 14), (140,))

### Logistic Regression

In [49]:
model = LogisticRegression()

In [50]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [51]:
model.score(X_train, y_train)

0.7607142857142857

#### Manually Checking Score

In [52]:
model_ouput = model.predict(X_train)

In [54]:
np.sum((model_ouput == y_train)) / model_ouput.shape[0]

np.float64(0.7607142857142857)

### Intercepts and Coeffecients

In [55]:
model.intercept_

array([-0.6648332])

In [56]:
model.coef_

array([[-0.44506876, -0.12423716,  0.31276019,  0.16783073, -0.31339497,
         0.1502103 , -0.46785063,  0.30348605, -2.34119336, -0.3077652 ,
        -1.58259014, -1.57348768, -0.01512182, -0.02571605]])

In [57]:
unscaled_inputs.columns.values

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Month Values', 'Day of the Week'], dtype=object)

In [58]:
summary_table = pd.DataFrame(columns=["Features"], data = unscaled_inputs.columns.values)
summary_table["Coefficients"] = np.transpose(model.coef_)

In [59]:
summary_table

Unnamed: 0,Features,Coefficients
0,Transportation Expense,-0.445069
1,Distance to Work,-0.124237
2,Age,0.31276
3,Daily Work Load Average,0.167831
4,Body Mass Index,-0.313395
5,Education,0.15021
6,Children,-0.467851
7,Pets,0.303486
8,Reason_1,-2.341193
9,Reason_2,-0.307765


In [60]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ["Intercept", model.intercept_[0]]
summary_table = summary_table.sort_index()

In [61]:
summary_table

Unnamed: 0,Features,Coefficients
0,Intercept,-0.664833
1,Transportation Expense,-0.445069
2,Distance to Work,-0.124237
3,Age,0.31276
4,Daily Work Load Average,0.167831
5,Body Mass Index,-0.313395
6,Education,0.15021
7,Children,-0.467851
8,Pets,0.303486
9,Reason_1,-2.341193


In [62]:
summary_table["Odds_ratio"] = np.exp(summary_table.Coefficients)

In [63]:
summary_table

Unnamed: 0,Features,Coefficients,Odds_ratio
0,Intercept,-0.664833,0.514359
1,Transportation Expense,-0.445069,0.64078
2,Distance to Work,-0.124237,0.88317
3,Age,0.31276,1.367194
4,Daily Work Load Average,0.167831,1.182736
5,Body Mass Index,-0.313395,0.730961
6,Education,0.15021,1.162079
7,Children,-0.467851,0.626347
8,Pets,0.303486,1.354573
9,Reason_1,-2.341193,0.096213


In [64]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Features,Coefficients,Odds_ratio
3,Age,0.31276,1.367194
8,Pets,0.303486,1.354573
4,Daily Work Load Average,0.167831,1.182736
6,Education,0.15021,1.162079
13,Month Values,-0.015122,0.984992
14,Day of the Week,-0.025716,0.974612
2,Distance to Work,-0.124237,0.88317
10,Reason_2,-0.307765,0.735088
5,Body Mass Index,-0.313395,0.730961
1,Transportation Expense,-0.445069,0.64078
