In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator , TransformerMixin

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
 # Load the preprocessed absenteeism data from a CSV file in the working directory.
 data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [None]:
 data_preprocessed.head()

In [None]:
# Approach: create two classes (targets) - one for people who have been excessively absent and one for people who have not.

# We use the median of 'Absenteeism Time in Hours' as the cut-off between the two classes.

# Using the median implicitly balances the data set:

# roughly half of the targets will be zeros and the other half ones.

# Without a balanced split, a model could output one of the two classes exclusively and still appear to do very well.

# Everything at or below the median is considered normal - class 0.

# Everything above the median is considered excessively absent - class 1.

data_preprocessed['Absenteeism Time in Hours'].median()


In [None]:
# Label an observation 1 (excessively absent) when its absenteeism time is
# strictly above the median, otherwise 0. The cut-off is derived from the data
# instead of being hard-coded, so it stays correct if the input file changes.
absenteeism_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absenteeism_hours > absenteeism_hours.median(), 1, 0)

targets

In [None]:
# Store the binary targets as a new column (this modifies data_preprocessed in place).
data_preprocessed['Excessive Absenteeism'] = targets



In [None]:
data_preprocessed.head()

In [None]:
targets.sum()

# The targets are 0/1, so their sum is the number of ones: 319.
# That is, 319 of the 700 people are excessively absent.

In [None]:
targets.shape[0]

In [None]:
targets.sum()/targets.shape[0]

# The result is about 0.46, so roughly 46% of the targets are ones and roughly 54% are zeros.

# A 60-40 split usually works well enough for a logistic regression,

# so a 45-55 split is almost always sufficient.

In [None]:
# The raw hours are now encoded by the target column, so leave them out of the
# modelling frame. drop() returns a new DataFrame; data_preprocessed is untouched.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [None]:
data_with_targets.head()

In [None]:
data_with_targets is data_preprocessed

In [None]:
# Checkpoint: continue with an explicit, independent copy of the data.
data_with_targets = data_with_targets.copy()



In [None]:
data_with_targets.head()

In [None]:
data_with_targets.shape

In [None]:
# 'Date' is dropped because 'Day of the Week' is already available as a feature.
data_with_targets = data_with_targets.drop(columns=['Date'])

In [None]:
data_with_targets.head()

In [None]:
data_with_targets.shape

In [None]:
#select the inputs for our regression

data_with_targets.iloc[:,:-1]

In [None]:
# Inputs are every column except the last one; the target column
# 'Excessive Absenteeism' was appended last, so it is the one left out.
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs

In [None]:
#scaling function
#absenteeism_scaler = StandardScaler()

In [None]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All other columns (e.g. dummy variables) are
    returned unchanged, and the original column order is preserved.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the scaled
        columns, available after ``fit``; ``None`` before.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's options must be passed by keyword: current
        # scikit-learn versions declare them keyword-only and raise TypeError
        # on positional use.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep the constructor arguments as attributes so BaseEstimator's
        # get_params() / repr can find them.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis=0 keeps these per-column regardless of how
        # np.mean / np.var dispatch axis=None for the installed pandas.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        # Returning self is what makes fit_transform(), pipelines and
        # method chaining work.
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame like ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for interface compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index so the concat below aligns row by row even when X
        # does not carry a default RangeIndex (e.g. a shuffled or filtered frame).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
In [None]:
unscaled_inputs.columns.values

In [None]:
# Columns that CustomScaler will standardize; every other input column is passed through unscaled.
columns_to_scale = ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week']

In [None]:
#Omitting the dummy variables from the Standardization
absenteeism_scaler = CustomScaler(columns_to_scale) 

In [None]:
# Learn the scaling statistics of the selected columns.
# NOTE(review): the scaler is fitted on the full data set, before the train/test split
# further down, so the test rows influence the statistics - consider fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
scaled_inputs

In [None]:
#####This line will calculate the mean and standard deviation of each feature from unscalable inputs

#absenteeism_scaler.fit(unscaled_inputs)


In [None]:
#Transform unscaled inputs into Scaled inputs

# scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
# scaled_inputs




In [None]:
# scaled_inputs.shape

In [None]:
# We don't want to overfit the training data, so we set some data aside to test how the regression model handles observations it has not seen.
# train_test_split(scaled_inputs, targets)

#array 1 = a training dataset with inputs = x_train
#array 2 = a training dataset with targets =  y_train
#array 3 = a test dataset with inputs = x_test
#array 4 = a test dataset with targets = y_test

In [None]:
# Use 80% of the rows for training and keep the rest for testing; the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state =20)

In [None]:
print (x_train.shape,y_train.shape)

In [None]:
print (x_test.shape, y_test.shape)

# our 80 - 20 split worked

In [None]:
# MODELLING

# Logistic regression created with scikit-learn's default settings.
reg = LogisticRegression()

In [None]:
reg.fit(x_train,y_train)

In [None]:
reg.score(x_train,y_train)


In [None]:
#### Manually check the accuracy of the model ######

# Accuracy is the share of model outputs that match the targets.
# The logistic regression is trained so that its outputs are as close to the targets as possible.

# To compute the accuracy manually, get the model outputs with predict() and compare them with the targets.

model_outputs = reg.predict(x_train)

In [None]:
model_outputs

In [None]:
y_train

In [None]:
model_outputs == y_train

In [None]:
#total number of matching prediction = total number of true as in boolean true = 1

np.sum(model_outputs == y_train)

In [None]:
model_outputs.shape[0]

In [None]:

# Accuracy = correct predictions / number of observations.
# The mean of the boolean match array is exactly that ratio.

(model_outputs == y_train).mean()



In [None]:
# Finding the intercept and coefficients

reg.intercept_

In [None]:
reg.coef_

In [None]:
unscaled_inputs.columns.values

In [None]:
feature_name = unscaled_inputs.columns.values

In [None]:
# One row per input feature, paired with the coefficient the model learned for it.
summary_table = pd.DataFrame({'Feature Name': feature_name})
summary_table['Coefficient'] = reg.coef_.T

summary_table


# The further a coefficient is from zero, whether positive or negative, the bigger the weight of that feature.

In [None]:
# Put the intercept in row 0 by shifting the feature rows down by one.
# The guard makes this cell safe to re-run: without it, every additional
# execution shifts the index again and adds another 'Intercept' row.
if not (summary_table['Feature Name'] == 'Intercept').any():
    summary_table.index = summary_table.index + 1
    summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
    summary_table = summary_table.sort_index()

summary_table

In [None]:
# A feature is NOT particularly important if
# - its coefficient is around 0: whatever is multiplied by 0 contributes 0
# - its odds ratio is around 1: multiplying the odds by 1 leaves them unchanged


summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [None]:
summary_table



In [None]:
summary_table.sort_values('Odds_ratio', ascending = False)


# The odds ratios of 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' are nearly 1, so these features barely change the odds. We should consider dropping them, as they add little to our prediction model.

# Judging by the coefficients, whenever a person has stated a reason for absence (reason 1, or really any of the reasons), the chance of excessive absence is much higher.