# import relevant libraries

In [6]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

# loading the data

In [7]:
data = pd.read_csv('df_preprocesed.csv')
data=data.copy()
data.head()

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Group 1,Group 2,Group 3,Group 4,Month value,Day value
0,289,36,33,239.554,30,0,2,1,4,0,0,0,1,7,2
1,118,13,50,239.554,31,0,1,0,0,0,0,0,0,7,2
2,179,51,38,239.554,31,0,0,0,2,0,0,0,1,7,3
3,279,5,39,239.554,24,0,2,0,4,1,0,0,0,7,4
4,289,36,33,239.554,30,0,2,1,2,0,0,0,1,7,4


# create  targets for logistic regression 

converting the 'Abesnteeism Time in Hours' values to a binary values: \
0 - 'Absenteeism Time in Hours' <u> <b> bellow </b> </u> the median\
1 - 'Absenteeism Time in Hours' <u> <b> above </b> </u> the median

In [8]:
median_Absenteeism_Time = np.median(data['Absenteeism Time in Hours'])
data['Absenteeism Time in Hours'] = data['Absenteeism Time in Hours'].map(lambda x:0 if (x<=median_Absenteeism_Time) else 1)


# checkpoint

In [9]:
data_with_targets = data.copy()
data_with_targets.head()

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Group 1,Group 2,Group 3,Group 4,Month value,Day value
0,289,36,33,239.554,30,0,2,1,1,0,0,0,1,7,2
1,118,13,50,239.554,31,0,1,0,0,0,0,0,0,7,2
2,179,51,38,239.554,31,0,0,0,0,0,0,0,1,7,3
3,279,5,39,239.554,24,0,2,0,1,1,0,0,0,7,4
4,289,36,33,239.554,30,0,2,1,0,0,0,0,1,7,4


# creating inputs and targets

In [11]:
X = data_with_targets.loc[:,data_with_targets.columns!='Absenteeism Time in Hours']
Y = data_with_targets['Absenteeism Time in Hours']
feature_names = X.columns.values

In [12]:
X

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Group 1,Group 2,Group 3,Group 4,Month value,Day value
0,289,36,33,239.554,30,0,2,1,0,0,0,1,7,2
1,118,13,50,239.554,31,0,1,0,0,0,0,0,7,2
2,179,51,38,239.554,31,0,0,0,0,0,0,1,7,3
3,279,5,39,239.554,24,0,2,0,1,0,0,0,7,4
4,289,36,33,239.554,30,0,2,1,0,0,0,1,7,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,179,22,40,237.656,22,1,2,0,1,0,0,0,5,3
696,225,26,28,237.656,24,0,1,2,1,0,0,0,5,3
697,330,16,28,237.656,25,1,0,0,1,0,0,0,5,4
698,235,16,32,237.656,25,1,0,0,0,0,0,1,5,4


# standardization

In [14]:
numeric_columns = ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index','Children','Pets','Month value','Day value']
binary_columns = ['Education','Group 1', 'Group 2', 'Group 3', 'Group 4']
scaler = StandardScaler()
scaler.fit(X[numeric_columns])
x_numeric_scaled =pd.DataFrame(data = scaler.transform(X[numeric_columns]), columns=numeric_columns)
x_scaled = pd.concat([x_numeric_scaled,X[binary_columns]], axis=1)
feature_names=x_scaled.columns.values
x_scaled

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Month value,Day value,Education,Group 1,Group 2,Group 3,Group 4
0,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0.182726,-0.683704,0,0,0,0,1
1,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.019280,-0.589690,0.182726,-0.683704,0,0,0,0,0
2,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.919030,-0.589690,0.182726,-0.007725,0,0,0,0,1
3,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.589690,0.182726,0.668253,0,1,0,0,0
4,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487,0.182726,0.668253,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,0.880469,-0.589690,-0.388293,-0.007725,1,1,0,0,0
696,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.019280,1.126663,-0.388293,-0.007725,0,1,0,0,0
697,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,-0.919030,-0.589690,-0.388293,0.668253,1,1,0,0,0
698,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,-0.919030,-0.589690,-0.388293,0.668253,1,0,0,0,1


# splitting the data to train and test

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled,Y,test_size=0.2)

# logistic Regression

### training the model

In [16]:
reg = LogisticRegression()
reg.fit(x_train,y_train)
y_hat_train = reg.predict(x_train)

### checking model score

In [17]:
#the score:
score = reg.score(x_train,y_train)
print('Regression score : ' + str(score))
#the accuracy:
accuracy= metrics.accuracy_score(y_train,y_hat_train)
print('Accuracy of train values : '+ str(accuracy))

Regression score : 0.7696428571428572
Accuracy of train values : 0.7696428571428572


# creating a summary table:

In [18]:
intercept = reg.intercept_[0]

In [19]:
coefficients = reg.coef_

In [20]:
summary_table = pd.DataFrame(columns= ['Feature name'], data= feature_names)

In [21]:
summary_table['Coefficent'] = np.transpose(coefficients)

In [22]:
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Inttercept', intercept]
summary_table=summary_table.sort_index()

In [23]:
summary_table['Odds ratio'] = np.exp(summary_table['Coefficent'] )

In [25]:
summary_table = summary_table.sort_values('Odds ratio',ascending=False)


In [27]:
summary_table

Unnamed: 0,Feature name,Coefficent,Odds ratio
13,Group 3,2.804515,16.519055
11,Group 1,2.708298,15.003723
12,Group 2,0.785218,2.192884
14,Group 4,0.771702,2.163446
1,Transportation Expense,0.634071,1.88527
6,Children,0.458961,1.582428
5,Body Mass Index,0.244268,1.276686
8,Month value,0.076317,1.079305
4,Daily Work Load Average,0.054846,1.056378
2,Distance to Work,-0.007044,0.992981


# testing the mode:


In [28]:
reg.score(x_test,y_test)

0.75

In [29]:
predicted_proba = reg.predict_proba(x_test)

# save the model

In [137]:
with open ('model','wb') as file:
    pickle.dump(reg,file)

### saving the scaler 

In [139]:
with open ('scaler','wb') as filee:
    pickle.dump(scaler,filee)

### saving the x test with probability and prediction:


In [30]:
x_test['Probability'] = predicted_proba[:,1]
x_test['Prediction'] = np.int64(np.round(predicted_proba[:,1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Probability'] = predicted_proba[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Prediction'] = np.int64(np.round(predicted_proba[:,1]))


In [31]:
x_test.to_csv('Absenteeism predictions.csv', index=False)