# Logistic regression model on the improved dataset

This notebook repeats the steps from the previous notebook to build a logistic regression model, this time using the improved dataset.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the improved dataset from disk and preview its first ten rows.
dataset_final = pd.read_csv('dataset_improved.csv')
dataset_final.head(n=10)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day,Transportation Expense,Body Mass Index,Education,Children,Pets,Excessively Absent
0,0,0,0,1,7,1,289,30,0,2,1,1
1,0,0,0,0,7,1,118,31,0,1,0,0
2,0,0,0,1,7,2,179,31,0,0,0,0
3,1,0,0,0,7,3,279,24,0,2,0,1
4,0,0,0,1,7,3,289,30,0,2,1,0
5,0,0,0,1,7,4,179,31,0,0,0,0
6,0,0,0,1,7,4,361,27,0,1,4,1
7,0,0,0,1,7,4,260,23,0,4,0,1
8,0,0,1,0,7,0,155,25,0,2,0,1
9,0,0,0,1,7,0,235,29,1,1,1,1


In [3]:
# Every column except the last one is a model input; the last column is the target.
raw_features, targets = dataset_final.iloc[:, :-1], dataset_final.iloc[:, -1]

In [4]:
# Display the unscaled feature matrix (all columns of dataset_final except the last).
raw_features

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day,Transportation Expense,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,30,0,2,1
1,0,0,0,0,7,1,118,31,0,1,0
2,0,0,0,1,7,2,179,31,0,0,0
3,1,0,0,0,7,3,279,24,0,2,0
4,0,0,0,1,7,3,289,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,1,2,0
696,1,0,0,0,5,2,225,24,0,1,2
697,1,0,0,0,5,3,330,25,1,0,0
698,0,0,0,1,5,3,235,25,1,0,0


In [5]:
# Display the target vector (the last column of dataset_final).
targets

0      1
1      0
2      0
3      1
4      0
      ..
695    1
696    0
697    1
698    0
699    0
Name: Excessively Absent, Length: 700, dtype: int64

In [6]:
# The features should be standardized before they are used in the model. Standardization subtracts the mean from
# every value and divides the result by the standard deviation, i.e. it centers and scales the data. scikit-learn
# provides StandardScaler for this purpose. However, since the dataset also contains dummy variables, we define a
# custom scaler that standardizes only the selected columns and leaves the dummy variables untouched.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fitted only on ``columns``; every other
    column (e.g. dummy variables) is returned unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen during ``fit`` (``None`` before fit).
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of ``columns`` seen during
        ``fit`` (``None`` before fit).
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params()/set_params(), clone() and repr() work.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters are keyword-only in current scikit-learn,
        # so they must not be passed positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var applied to a DataFrame
        # reduce over both axes in recent pandas versions.
        self.mean_ = selected.mean()
        # ddof=0 matches the population variance that np.var computes.
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized, in the original column order.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index so that concat aligns rows correctly even when X does
        # not carry a default RangeIndex (e.g. a shuffled train/test subset).
        std_X = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        raw_X = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([raw_X, std_X], axis=1)[init_col_order]

In [7]:
# Columns to standardize; the 'Reason' dummies and 'Education' are not listed here.
columns_to_be_standardized = [
    'Month',
    'Day',
    'Transportation Expense',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [8]:
# Scaler that standardizes only the columns listed in columns_to_be_standardized.
absenteeism_scaler = CustomScaler(columns=columns_to_be_standardized)



In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Standardize the selected columns of the feature matrix.
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test rows contribute to the mean/std used for scaling (mild data leakage).
# Consider fitting the scaler on the training rows only.
absenteeism_scaler.fit(raw_features)
std_features = absenteeism_scaler.transform(raw_features)

# 80/20 train/test split. A fixed random_state makes the split, and therefore
# every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    std_features, targets, train_size=0.8, random_state=42
)

# Fit a logistic regression with scikit-learn's default settings.
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression()

In [10]:
# Accuracy of the fitted model on the training split.
reg.score(X_train, y_train)

0.7732142857142857

In [12]:
# Summarize the fitted model: one row per feature plus one row for the intercept.
# The names are read from the training frame so they always match the column
# order behind reg.coef_ (a hand-written list could silently fall out of sync).
feature_names = list(X_train.columns)
summary_table = pd.DataFrame(columns=['Feature Name'], data=feature_names)
# reg.coef_ is 2-D with a single row here; use that row as a 1-D vector
# rather than assigning a 2-D array to one column.
summary_table['Coefficient'] = reg.coef_[0]
# Shift the index by one so that label 0 is free for the intercept row.
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# NOTE: despite its name, 'Log h' holds exp(coefficient), i.e. the odds ratio
# for a feature row, not a logarithm. The name is kept for compatibility.
summary_table['Log h'] = np.exp(summary_table['Coefficient'])
summary_table = summary_table.sort_values('Log h', ascending=False)
summary_table

Unnamed: 0,Feature Name,Coefficient,Log h
3,Reason 3,2.966463,19.423107
1,Reason 1,2.79222,16.317211
4,Reason 4,0.968149,2.633067
2,Reason 2,0.930067,2.534678
7,Transportation Expense,0.62355,1.865538
10,Children,0.397365,1.487899
8,Body Mass Index,0.194248,1.214397
5,Month,0.112766,1.11937
9,Education,0.10469,1.110366
11,Pets,-0.2014,0.817585


In [15]:
# reg.score() returns the accuracy of the model on the data passed to it; below it is evaluated on the testing set.

In [16]:
# Accuracy of the fitted model on the held-out test split.
reg.score(X_test, y_test)

0.7642857142857142

In [14]:
# Predicted probability of being excessively absent for each test entry:
# keep only column 1 of the array returned by reg.predict_proba.
predict_proba = reg.predict_proba(X_test)[:, 1]
predict_proba

array([0.19142555, 0.76220765, 0.24623091, 0.22099619, 0.70573061,
       0.30347081, 0.58498705, 0.22099619, 0.40007702, 0.83291667,
       0.07079639, 0.29783628, 0.20884271, 0.13591449, 0.53168904,
       0.1104755 , 0.60363258, 0.75601698, 0.49421416, 0.14563498,
       0.50143678, 0.19902169, 0.09038612, 0.2078495 , 0.13006636,
       0.62995055, 0.50872192, 0.7123723 , 0.59494418, 0.57768169,
       0.78108823, 0.20420385, 0.79763521, 0.76615217, 0.40365576,
       0.53968447, 0.10735106, 0.65043198, 0.83707771, 0.57731619,
       0.56846332, 0.88802892, 0.30564693, 0.19902169, 0.11839772,
       0.16544445, 0.85505573, 0.86662072, 0.4455297 , 0.22449515,
       0.27333605, 0.71833866, 0.46183342, 0.26814114, 0.77491678,
       0.21011008, 0.60626782, 0.21824754, 0.51752747, 0.609854  ,
       0.20909976, 0.60855216, 0.34090818, 0.92746613, 0.38467233,
       0.30222857, 0.23890986, 0.23738866, 0.77715049, 0.18315409,
       0.23539821, 0.21096295, 0.75688323, 0.07268956, 0.67644