In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator , TransformerMixin

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
 data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
 data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [4]:
# Approach: turn the problem into a binary classification with two targets -
# observations that are excessively absent (1) and observations that are not (0).

# The cut-off is the median of 'Absenteeism Time in Hours', computed in this cell.

# Splitting at the median implicitly balances the data set:

# roughly half of the targets are zeros, while the other half are ones.

# Without a balanced split, a model could predict one of the two classes exclusively
# and still appear to do very well on accuracy.

# Everything at or below the median is considered normal -> class 0

# Everything strictly above the median is considered excessively absent -> class 1

data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 when the absenteeism hours are strictly above the column median, else 0.
# The threshold is derived from the data instead of being hard-coded, so it stays correct
# if the input file changes.
absenteeism_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absenteeism_hours > absenteeism_hours.median(), 1, 0)

targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [8]:
targets.sum()

# The sum of the targets is 319, i.e. 319 observations carry the value 1.
# 319 of the 700 observations are labelled excessively absent.

319

In [9]:
targets.shape[0]

700

In [10]:
# Build the modelling frame: remove the raw hours column (the binary target column
# 'Excessive Absenteeism' is used instead) together with three inputs left out of the model.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Distance to Work',
        'Day of the Week',
        'Daily Work Load Average',
    ]
)

In [11]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,33,30,0,2,1,7,1
1,0,0,0,0,2015-07-14,118,50,31,0,1,0,7,0
2,0,0,0,1,2015-07-15,179,38,31,0,0,0,7,0
3,1,0,0,0,2015-07-16,279,39,24,0,2,0,7,1
4,0,0,0,1,2015-07-23,289,33,30,0,2,1,7,0


In [12]:
data_with_targets is data_preprocessed

False

In [13]:
# Checkpoint: rebind the name to an explicit copy before making further changes.
data_with_targets = data_with_targets.copy()

In [14]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,33,30,0,2,1,7,1
1,0,0,0,0,2015-07-14,118,50,31,0,1,0,7,0
2,0,0,0,1,2015-07-15,179,38,31,0,0,0,7,0
3,1,0,0,0,2015-07-16,279,39,24,0,2,0,7,1
4,0,0,0,1,2015-07-23,289,33,30,0,2,1,7,0


In [15]:
data_with_targets.shape

(700, 13)

In [16]:
# Drop the raw 'Date' column. 'Month Value' is kept as a feature; 'Day of the Week' was
# already dropped above, so it is NOT available as a replacement for the date.
# NOTE(review): 'Month Value' is presumably derived from 'Date' in the preprocessing step - confirm.
data_with_targets = data_with_targets.drop(['Date'], axis =1 )

In [17]:
data_with_targets.shape

(700, 12)

In [18]:
#select the inputs for our regression

data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,5
696,1,0,0,0,225,28,24,0,1,2,5
697,1,0,0,0,330,28,25,1,0,0,5
698,0,0,0,1,235,32,25,1,0,0,5


In [19]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,5
696,1,0,0,0,225,28,24,0,1,2,5
697,1,0,0,0,330,28,25,1,0,0,5
698,0,0,0,1,235,32,25,1,0,0,5


In [20]:
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the others through.

    Wraps a ``StandardScaler`` that is fitted only on ``columns``. ``transform``
    returns a DataFrame with the same column order and the same index as its
    input, in which the selected columns are standardized and all remaining
    columns (e.g. dummy variables) are left untouched.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : the wrapped ``StandardScaler``.
    mean_ : per-column mean of the selected columns seen in ``fit`` (None before fitting).
    var_ : per-column population variance (ddof=0) of the selected columns
        seen in ``fit`` (None before fitting).
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Keep every constructor argument as a same-named attribute so that
        # BaseEstimator.get_params() (used by repr and clone) can read it back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass the options by keyword: StandardScaler's parameters are
        # keyword-only in current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self,X, y = None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use the DataFrame reductions directly so the statistics are always
        # per column, regardless of how np.mean/np.var dispatch on a DataFrame.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        # Returning self is required for fit(...).transform(...) chaining and
        # for TransformerMixin.fit_transform.
        return self

    def transform(self,X , y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for interface compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex
        # and concat would misalign rows (producing NaNs) for any input whose
        # index is not 0..n-1, e.g. a shuffled or filtered frame.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Restore the original column order after the side-by-side concat.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [21]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [22]:
#columns_to_scale = ['Transportation Expense', 'Age', 'Body Mass Index', 'Education', 'Children', 'Pets', 'Month Value']

columns_to_leave = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4','Education']

In [23]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_leave ]

In [24]:
#Omitting the dummy variables from the Standardization
absenteeism_scaler = CustomScaler(columns_to_scale) 



In [25]:
absenteeism_scaler.fit(unscaled_inputs)

In [26]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [27]:
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690,0.182726
2,0,0,0,1,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690,0.182726
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.589690,0.182726
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690,-0.388293
696,1,0,0,0,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663,-0.388293
697,1,0,0,0,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690,-0.388293
698,0,0,0,1,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690,-0.388293


In [28]:
# We do not want to overfit: the model has to be evaluated on data it was not trained on,
# so part of the data is set aside for testing.
# train_test_split(scaled_inputs, targets)

# train_test_split returns four arrays, in this order:
# array 1 = training inputs  = x_train
# array 2 = test inputs      = x_test
# array 3 = training targets = y_train
# array 4 = test targets     = y_test

# NOTE(review): the scaler was fitted on ALL rows before this split, so the test rows
# influenced the scaling statistics. Consider splitting first and fitting the scaler on
# the training rows only.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [29]:
print (x_train.shape,y_train.shape)

(560, 11) (560,)


In [30]:
print (x_test.shape, y_test.shape)

# our 80 - 20 split worked

(140, 11) (140,)


In [31]:
# MODELLING

reg = LogisticRegression()

In [32]:
reg.fit(x_train,y_train)

LogisticRegression()

In [33]:
reg.score(x_train,y_train)

0.7732142857142857

In [34]:
#### Manually check the accuracy of the model ######

# Accuracy is the share of model outputs that match the targets.
# The logistic regression is trained on the input patterns to produce outputs that are as close to the targets as possible.

# To find the accuracy manually, obtain the model outputs with predict() and compare them with the targets.

model_outputs = reg.predict(x_train)

In [35]:
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [36]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [37]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [38]:
# Number of matching predictions: True counts as 1, so summing the boolean array counts the matches.

np.sum(model_outputs == y_train)

433

In [39]:
model_outputs.shape[0]

560

In [40]:

#Accuracy = Correct predictions/Observations

np.sum(model_outputs == y_train)/model_outputs.shape[0]

0.7732142857142857

In [41]:
# Finding the intercept and coefficients

reg.intercept_

array([-1.6474549])

In [42]:
reg.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.60528415,
        -0.16989096,  0.27981088, -0.21053312,  0.34826214, -0.27739602,
         0.1589299 ]])

In [43]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [44]:
feature_name = unscaled_inputs.columns.values

In [45]:
# One row per input feature, paired with its fitted logistic-regression coefficient.
summary_table = pd.DataFrame({'Feature Name': feature_name})
summary_table['Coefficient'] = reg.coef_.T

summary_table


#The further away from zero (coefficient), no matter if positive or negative, the bigger the weight of this feature.

Unnamed: 0,Feature Name,Coefficient
0,Reason 1,2.800197
1,Reason 2,0.951884
2,Reason 3,3.115553
3,Reason 4,0.839001
4,Transportation Expense,0.605284
5,Age,-0.169891
6,Body Mass Index,0.279811
7,Education,-0.210533
8,Children,0.348262
9,Pets,-0.277396


In [46]:
# Insert the intercept as the first row of the summary table.
# NOTE: this cell is not idempotent - running it a second time shifts the index again
# and adds a second 'Intercept' row. Re-run the cell that builds summary_table first.
# Step 1: shift every feature row down by one so that index 0 becomes free.
summary_table.index = summary_table.index +1
# Step 2: place the intercept at index 0 (it is appended as the last physical row).
summary_table.loc[0]= ['Intercept', reg.intercept_[0]]
# Step 3: sort by index so the intercept row is displayed first.
summary_table = summary_table.sort_index()

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.647455
1,Reason 1,2.800197
2,Reason 2,0.951884
3,Reason 3,3.115553
4,Reason 4,0.839001
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [47]:
# A feature is NOT particularly important if
# - its coefficient is around 0: whatever is multiplied by (almost) 0 contributes (almost) nothing
# - its odds ratio is around 1: multiplying the odds by 1 leaves them unchanged


# Odds ratio = exp(coefficient)
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [48]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-1.647455,0.192539
1,Reason 1,2.800197,16.447892
2,Reason 2,0.951884,2.590585
3,Reason 3,3.115553,22.545903
4,Reason 4,0.839001,2.314054
5,Transportation Expense,0.605284,1.831773
6,Age,-0.169891,0.843757
7,Body Mass Index,0.279811,1.32288
8,Education,-0.210533,0.810152
9,Children,0.348262,1.416604


In [49]:
summary_table.sort_values('Odds_ratio', ascending = False)


# NOTE: 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' do not appear in this table because they were dropped before fitting.
# Presumably in an earlier fit their odds ratios were nearly 1, meaning they left the odds practically unchanged and were useless for the prediction model.

# From the coefficients, it seems that whenever a person has stated a reason (Reason 1 in particular, but it could be any of the four), the chance of excessive absence is much higher.

# Looking at the sorted table, the most strongly pronounced features are the four reasons for absence, the transportation expense and whether a person has children, together with the rows at the bottom of the table: rows at both ends (odds ratio far from 1) are the significant ones.



# Continuing in this way we would finish with 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week', which seemed to have the smallest impact: their weights were almost zero, placing them in the middle of the range.

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
3,Reason 3,3.115553,22.545903
1,Reason 1,2.800197,16.447892
2,Reason 2,0.951884,2.590585
4,Reason 4,0.839001,2.314054
5,Transportation Expense,0.605284,1.831773
9,Children,0.348262,1.416604
7,Body Mass Index,0.279811,1.32288
11,Month Value,0.15893,1.172256
6,Age,-0.169891,0.843757
8,Education,-0.210533,0.810152


In [50]:
#reason 0 = no reason given = baseline (all four reason dummies are 0)
#reason 1 = various diseases
#reason 2 = pregnancy and giving birth
#reason 3 = poisoning
#reason 4 = light diseases


#The most crucial reason for excessive absence is poisoning (Reason 3).

#Its odds ratio means the odds of someone being excessively absent after being poisoned are about 22 times the odds when no reason was reported.

#A person who reported Reason 1 (various diseases) has about 16 times the odds of being excessively absent compared with a person who didn't specify a reason.

#Reason 2 (pregnancy and giving birth) only raises the odds to around 2.6 times those of the baseline. +++++ report this reason +++

In [51]:
#TRANSPORTATION EXPENSE: its odds ratio (about 1.83) implies that for a one-standard-deviation (one standardized unit) increase in transportation expense, the odds of being excessively absent are close to twice as high.

In [52]:
#Pets is a continuous (standardized) variable. Its odds ratio is about 0.76, so for each additional standardized unit of Pets the odds are one minus the odds ratio, i.e. about 24% lower than in the base model.

#One explanation may be that if you have several pets, you're probably not taking care of them on your own.

#Not being solely responsible for them implies somebody else can take them to the doctor if something

#is wrong.


#the odds are 1 - 0.758 = about 24% lower than the base model (0.758 = exp(-0.2774), the Pets coefficient)

In [53]:
#Nevertheless, without an intercept, each prediction would be off the mark by precisely that value.



In [55]:
#Backward elimination - simplify our model by removing the features that contribute little or nothing to the model

#When we have p-values, we get rid of all coefficients with p-values > 0.05

#Checking the regression accuracy afterwards, we see only a very small difference in the model score

#This shows us that the three variables we dropped were useless: with or without them, we obtain practically the same results.

In [57]:
######TESTING ACCURACY #####

reg.score(x_test,y_test)


#So based on data that the model has NEVER seen before, we can say that in 75% of the cases the model will predict correctly if a person is going to be excessively absent.

0.75

In [61]:
# predict_proba returns two columns per observation: the first is the probability the model assigns to the observation being 0, the second the probability it assigns to the observation being 1.

predicted_proba = reg.predict_proba(x_test)


# That is why the two numbers in each row sum to 1.

In [62]:
predicted_proba.shape

(140, 2)

In [64]:
predicted_proba[:,1]

#if the proba is below 0.5 , it will be a 0
#if the proba is above 0.5 , it will be a 1

array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

In [None]:
####SAVE THE MODEL = save the reg object#

In [65]:
import pickle

In [67]:
# Pickle the fitted logistic regression model (reg) to the file 'model'
with open('model','wb')as file:
    pickle.dump(reg,file)
    
# To restore it later, open the file in 'rb' mode and call pickle.load (the counterpart of pickle.dump).
# Only unpickle files you trust: loading a pickle can execute arbitrary code.

In [68]:
# Pickle the fitted scaler (absenteeism_scaler) to the file 'scaler'
with open('scaler','wb')as file:
    pickle.dump(absenteeism_scaler,file)
    
    
# To restore it later, open the file in 'rb' mode and call pickle.load (the counterpart of pickle.dump).
# The CustomScaler class must be defined or importable in the process that unpickles it.

In [None]:
#Creating a MODULE for later use of the logistic model - getting the ML model ready for deployment
#storing code in a module will allow us to reuse it without trouble



###create a special class that we are going to use from here on to predict new data###
