# Business Case Study - Logistic Regression

In [1]:
import pickle
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
data_preprocessed = pd.read_csv("C:/Online_Courses/The Data Science Course Complete Data Science Bootcamp 2024/Excel Files/Absenteeism_preprocessed.csv")
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] >
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [5]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets

In [7]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


# A Comment on Targets

In [8]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [9]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week',
                                           'Daily Work Load Average', 'Distance to Work'], 
                                           axis=1) 

In [10]:
data_with_targets is data_preprocessed

False

In [11]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


# Selecting Inputs for Regression

In [12]:
data_with_targets.shape

(700, 12)

In [13]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [14]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

# Standardize the Data

In [15]:
# absenteeism_scaler = StandardScaler()

In [16]:
class CustomScaler(BaseEstimator, TransformerMixin): 
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, 
                                     with_std=self.with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = X[self.columns].mean()
        self.var_ = X[self.columns].var()
        return self

    def transform(self, X, y=None, copy=None):
        if copy is None: 
            copy = self.copy
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.drop(columns=self.columns)
        X_transformed = pd.concat([X_not_scaled, X_scaled], axis=1)
        X_transformed = X_transformed[X.columns]
        return X_transformed

In [17]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [18]:
columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 
                    'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 
                    'Children', 'Pet']

columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']

In [19]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [20]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [21]:
absenteeism_scaler.fit(unscaled_inputs)

In [22]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [23]:
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [24]:
scaled_inputs.shape

(700, 11)

# Splitting the data into Train, Test, & Shuffle

In [25]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 637  1.732051 -0.092981 -0.314485 -1.217485    -1.166834   
 283 -0.577350 -0.092981 -0.314485  0.821365     0.629611   
 475 -0.577350 -0.092981 -0.314485  0.821365     0.030796   
 428 -0.577350 -0.092981 -0.314485  0.821365     0.929019   
 560 -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 ..        ...       ...       ...       ...          ...   
 309 -0.577350 -0.092981 -0.314485  0.821365     0.929019   
 507  1.732051 -0.092981 -0.314485 -1.217485    -0.568019   
 515 -0.577350 -0.092981 -0.314485  0.821365     0.929019   
 298 -0.577350 -0.092981 -0.314485  0.821365     0.929019   
 257  1.732051 -0.092981 -0.314485 -1.217485     0.929019   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 637                0.040034 -1.320435        -0.643782          0 -0.019280   
 283                0.190942  1.032682         2.649049          0 -0.019280   
 475                1.0058

In [26]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, 
                                                    random_state = 20)

In [27]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [28]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


# Logistic Regression with sklearn

In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(x_train, y_train)

In [31]:
reg.score(x_train, y_train)

0.7892857142857143

## Manually Checking the Accuracy

In [32]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [33]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [34]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [35]:
np.sum((model_outputs == y_train))

442

In [36]:
model_outputs.shape[0]

560

In [37]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7892857142857143

# Finding the Intercepts and Coefficients

In [38]:
reg.intercept_

array([-0.15996908])

In [39]:
reg.coef_

array([[ 2.05299955,  0.32802547,  1.55064307,  1.29630439,  0.01968239,
         0.71304234, -0.20212992,  0.33611579, -0.34650064,  0.38055556,
        -0.31823261]])

In [40]:
feature_name = unscaled_inputs.columns.values
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [41]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.053
1,Reason_2,0.328025
2,Reason_3,1.550643
3,Reason_4,1.296304
4,Month Value,0.019682
5,Transportation Expense,0.713042
6,Age,-0.20213
7,Body Mass Index,0.336116
8,Education,-0.346501
9,Children,0.380556


In [42]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()

In [43]:
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.159969
1,Reason_1,2.053
2,Reason_2,0.328025
3,Reason_3,1.550643
4,Reason_4,1.296304
5,Month Value,0.019682
6,Transportation Expense,0.713042
7,Age,-0.20213
8,Body Mass Index,0.336116
9,Education,-0.346501


# Interpreting the Coefficients

In [44]:
type(summary_table)

pandas.core.frame.DataFrame

In [45]:
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)

In [46]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds Ratio
0,Intercept,-0.159969,0.85217
1,Reason_1,2.053,7.791236
2,Reason_2,0.328025,1.388224
3,Reason_3,1.550643,4.714501
4,Reason_4,1.296304,3.655761
5,Month Value,0.019682,1.019877
6,Transportation Expense,0.713042,2.040189
7,Age,-0.20213,0.816989
8,Body Mass Index,0.336116,1.399501
9,Education,-0.346501,0.707158


In [47]:
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds Ratio
1,Reason_1,2.053,7.791236
3,Reason_3,1.550643,4.714501
4,Reason_4,1.296304,3.655761
6,Transportation Expense,0.713042,2.040189
10,Children,0.380556,1.463097
8,Body Mass Index,0.336116,1.399501
2,Reason_2,0.328025,1.388224
5,Month Value,0.019682,1.019877
0,Intercept,-0.159969,0.85217
7,Age,-0.20213,0.816989


# Testing the Model

In [48]:
reg.score(x_test, y_test)

0.7285714285714285

In [49]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.75437874, 0.24562126],
       [0.59659057, 0.40340943],
       [0.44432779, 0.55567221],
       [0.76194183, 0.23805817],
       [0.06719286, 0.93280714],
       [0.28193702, 0.71806298],
       [0.29134233, 0.70865767],
       [0.07137203, 0.92862797],
       [0.74741948, 0.25258052],
       [0.75763979, 0.24236021],
       [0.47969455, 0.52030545],
       [0.15482956, 0.84517044],
       [0.03591105, 0.96408895],
       [0.72399614, 0.27600386],
       [0.22844071, 0.77155929],
       [0.50467688, 0.49532312],
       [0.47822389, 0.52177611],
       [0.48558043, 0.51441957],
       [0.36682921, 0.63317079],
       [0.03422621, 0.96577379],
       [0.74294353, 0.25705647],
       [0.76194183, 0.23805817],
       [0.47586714, 0.52413286],
       [0.46999143, 0.53000857],
       [0.15804602, 0.84195398],
       [0.74853037, 0.25146963],
       [0.49100882, 0.50899118],
       [0.89801678, 0.10198322],
       [0.16306032, 0.83693968],
       [0.76194183, 0.23805817],
       [0.

In [50]:
predicted_proba.shape

(140, 2)

In [51]:
predicted_proba[:, 1]

array([0.24562126, 0.40340943, 0.55567221, 0.23805817, 0.93280714,
       0.71806298, 0.70865767, 0.92862797, 0.25258052, 0.24236021,
       0.52030545, 0.84517044, 0.96408895, 0.27600386, 0.77155929,
       0.49532312, 0.52177611, 0.51441957, 0.63317079, 0.96577379,
       0.25705647, 0.23805817, 0.52413286, 0.53000857, 0.84195398,
       0.25146963, 0.50899118, 0.10198322, 0.83693968, 0.23805817,
       0.39774944, 0.72904052, 0.72633865, 0.52324638, 0.23805817,
       0.63749844, 0.25369465, 0.84591457, 0.45654633, 0.63004336,
       0.2369909 , 0.47484028, 0.24925761, 0.10807392, 0.8362488 ,
       0.68434073, 0.73379032, 0.24020261, 0.24439607, 0.23592693,
       0.48660833, 0.06672586, 0.71806298, 0.23899588, 0.84644022,
       0.40624919, 0.94656433, 0.25158387, 0.08082868, 0.08126759,
       0.71476961, 0.72162826, 0.25492727, 0.84583546, 0.23401401,
       0.24453097, 0.01179846, 0.25481203, 0.83683998, 0.28073898,
       0.24705862, 0.07730671, 0.91114837, 0.45362359, 0.63203

# Save the Model

In [52]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [53]:
with open('scaler', 'wb') as file: 
    pickle.dump(absenteeism_scaler, file)