# OLS, Ridge, and Lasso Logistic Regression #

## by Lorenz Madarang ##

## Data: https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work ##

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
from string import punctuation
from collections import Counter
import operator
from sklearn import linear_model
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

  from pandas.core import datetools


### Data Ingestion and Data Cleaning ###
I ingested the data from an Excel flat file and then renamed the columns so that they would be easier to reference in Python functions and methods.  The assignment asks that we create a binary outcome, so I converted the initial target, "Absenteeism time in hours" (a continuous variable), into a binary one.  I did this by classifying the amount of absenteeism as either "Greater Than a Workday" (more than 8 hours, coded 1) or "A Workday or Less" (8 hours or less, coded 0).  I also removed columns of data that would not help the model, such as the "ID" column.  Finally, I removed the "Absenteeism time in hours" column from the model inputs, because the target stored in the Category dataframe was derived directly from it.  

In [2]:
# Load the raw absenteeism records from the Excel file in the working directory.
absentee = pd.read_excel(io='Absenteeism_at_work.xls')

In [3]:
# Preview the first five rows of the freshly loaded frame.
absentee.head(n=5)

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,2


In [4]:
# Rename the raw column headers to short snake_case identifiers.
# Only `columns=` is passed: the row index keeps its original labels instead of
# being converted to strings, so later integer-label lookups keep working.
absentee = absentee.rename(columns={
    'ID': 'id',
    'Reason for absence': 'reason_absence',
    'Month of absence': 'month_absence',
    'Day of the week': 'day',
    'Seasons': 'season',
    'Transportation expense': 'transport_expense',
    'Distance from Residence to Work': 'distance_to_work',
    'Service time': 'service_time',
    'Age': 'age',
    # NOTE: this key deliberately keeps its trailing space; rename() skips keys
    # that do not match a header exactly, so the space must stay.
    'Work load Average/day ': 'workload_average_day',
    'Hit target': 'hit_target',
    'Disciplinary failure': 'discipline_failure',
    'Education': 'education',
    'Son': 'num_children',
    'Social drinker': 'social_drinker',
    'Social smoker': 'social_smoker',
    'Pet': 'pet',
    'Weight': 'weight',
    'Height': 'height',
    'Body mass index': 'bmi',
    'Absenteeism time in hours': 'absentee_hours',
})

In [5]:
# Preview the first five rows to confirm the new column names.
absentee.head(n=5)

Unnamed: 0,id,reason_absence,month_absence,day,season,transport_expense,distance_to_work,service_time,age,workload_average_day,...,discipline_failure,education,num_children,social_drinker,social_smoker,pet,weight,height,bmi,absentee_hours
0,11,26,7,3,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239554,...,0,1,2,1,0,1,90,172,30,2


In [6]:
# Remove the 'id' column from the predictors.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer
# accepts, and the result is rebound instead of mutating the frame in place.
absentee = absentee.drop(columns=['id'])

In [7]:
# Start with an empty list that will hold one 0/1 label per row.
absentee_cat = list()

In [8]:
# Binary target: 1 when the absence lasted more than 8 hours, otherwise 0.
# The comparison runs over the whole column at once, so it does not depend on a
# hard-coded row count or on how the index is labelled, and re-running the cell
# rebuilds the list instead of appending a second copy of the labels.
absentee_cat = (absentee['absentee_hours'] > 8).astype(int).tolist()

In [9]:
# Empty frame that will receive the binary target column.
category = pd.DataFrame(data=None)

In [10]:
# Store the 0/1 labels as the 'absent_category' column of the target frame.
category = category.assign(absent_category=absentee_cat)

In [11]:
# Preview the first five rows of the predictor frame.
absentee.head(n=5)

Unnamed: 0,reason_absence,month_absence,day,season,transport_expense,distance_to_work,service_time,age,workload_average_day,hit_target,discipline_failure,education,num_children,social_drinker,social_smoker,pet,weight,height,bmi,absentee_hours
0,26,7,3,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,4
1,0,7,3,1,118,13,18,50,239554,97,1,1,1,1,0,0,98,178,31,0
2,23,7,4,1,179,51,18,38,239554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,5,1,279,5,14,39,239554,97,0,1,2,1,1,0,68,168,24,4
4,23,7,5,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,2


In [12]:
# Remove the raw hours column: the binary target was derived from it, so
# leaving it among the predictors would leak the answer into the model.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer
# accepts, and the result is rebound instead of mutating the frame in place.
absentee = absentee.drop(columns=['absentee_hours'])

In [13]:
# Plain Python list of the 0/1 target labels.
Y = category['absent_category'].tolist()

### OLS Logistic Regression ###
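Two OLS logistic models were created.  One model was created using the statsmodels package and the other was created using sklearn.  (Strictly speaking, both are fit by maximum likelihood rather than ordinary least squares; "OLS" here simply means that no meaningful regularization penalty is applied, and in the sklearn model this is achieved with a very large `C`.)  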

In [14]:
# Fit a logistic regression with statsmodels and print its summary table.

# Declare predictors.
# NOTE: this binds a second name to the same DataFrame rather than copying it,
# so the column added below is also visible through `absentee`.
X_statsmod = absentee

# The Statsmodels formulation requires a column with constant value 1 that
# will act as the intercept.
X_statsmod['intercept'] = 1 

# Declare and fit the model.
# Y holds the 0/1 labels; X_statsmod holds the predictors plus the constant.
logit = sm.Logit(Y, X_statsmod)
result = logit.fit()

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
# NOTE(review): check the "converged" field of the summary before relying on
# the reported coefficients.
print(result.summary())

         Current function value: 0.223719
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  740
Model:                          Logit   Df Residuals:                      720
Method:                           MLE   Df Model:                           19
Date:                Sun, 20 May 2018   Pseudo R-squ.:                  0.2316
Time:                        14:40:10   Log-Likelihood:                -165.55
converged:                      False   LL-Null:                       -215.44
                                        LLR p-value:                 5.881e-13
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
reason_absence          -0.1138      0.019     -5.859      0.000      -0.152      -0.076
month_absence           -0.0203    



In [15]:
# Calculate accuracy. First, get the predicted probability that each row
# belongs to class 1 (absence longer than 8 hours).
pred_statsmod = result.predict(X_statsmod)

# Code a row as 1 if its probability is at least .5, otherwise 0.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# NOTE(review): Y is rebound from a list to a DataFrame here; nothing else in
# this cell reads it.
Y = pd.DataFrame(Y)

# Accuracy table: rows are the actual category, columns the predicted label.
table = pd.crosstab(category['absent_category'], pred_y_statsmod)

print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
# Share of rows whose predicted label equals the actual label.  Comparing the
# labels directly (rather than reading fixed cells of the table) also works
# when the model predicts only one class and the table has a single column.
print((pred_y_statsmod == category['absent_category'].values).mean())


 Accuracy by admission status
col_0              0  1
absent_category        
0                670  7
1                 56  7

 Percentage accuracy
0.914864864865


In [16]:
# Remove the constant column that was added for statsmodels, so the sklearn
# models below (which fit their own intercept) see only the real predictors.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer
# accepts.
absentee = absentee.drop(columns=['intercept'])

In [26]:
# Declare a logistic regression classifier.
# C is the inverse of the regularization strength, so a very large C makes the
# penalty negligible (an effectively unregularized fit).
# The solver is pinned because scikit-learn's default solver changed between
# releases; 'liblinear' keeps the fit consistent across versions.
lr = LogisticRegression(C=1e12, solver='liblinear')
y = category['absent_category']
X = absentee

# Fit the model.
fit = lr.fit(X, y)

# Display the fitted coefficients and intercept.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

# Rows of this table are the predicted label, columns the actual category.
print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

# Mean accuracy on the same data the model was trained on.
print('\n Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[ -1.20631466e-02  -5.36826404e-04  -1.44745447e-03  -9.92832756e-05
   -1.72578860e-03  -7.91995460e-03  -5.70482610e-04  -3.15228527e-03
    7.79343817e-06  -1.02965468e-02  -1.19067043e-04  -2.87069118e-04
    5.12443982e-04   3.55569085e-04  -3.39389935e-05  -6.36865488e-04
   -6.41223247e-03  -1.14504947e-02  -4.18750647e-03]]
[-0.00010276]

 Accuracy by admission status
absent_category    0   1
row_0                   
0                677  63

 Percentage accuracy
0.914864864865


In [27]:
# Per-fold accuracy from 10-fold cross-validation of the unregularized model.
cross_val_score(estimator=lr, X=X, y=y, cv=10)

array([ 0.90666667,  0.90666667,  0.90666667,  0.91891892,  0.91891892,
        0.91891892,  0.91891892,  0.91780822,  0.91780822,  0.91780822])

In [28]:
# Average the ten fold accuracies into a single figure.
score = cross_val_score(estimator=lr, X=X, y=y, cv=10)
print("Mean accuracy is {}".format(np.mean(score)))

Mean accuracy is 0.9149100333209923


### Lasso Logistic Regression ###

In [20]:
# L1-penalized (lasso) logistic regression.
# The solver must be named explicitly: scikit-learn's current default solver
# ('lbfgs') does not support the L1 penalty and raises a ValueError, whereas
# 'liblinear' does support it.
lrlasso = LogisticRegression(penalty='l1', solver='liblinear')
y = category['absent_category']
X = absentee

# Fit the model.
fit = lrlasso.fit(X, y)

# Display the fitted coefficients and intercept; the L1 penalty can shrink
# some coefficients to exactly zero.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lrlasso.predict(X)

# Rows of this table are the predicted label, columns the actual category.
print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

# Mean accuracy on the same data the model was trained on.
print('\n Percentage accuracy')
print(lrlasso.score(X, y))

Coefficients
[[ -1.06052937e-01   4.07776796e-03  -2.62072719e-01   5.45547243e-02
   -5.65880370e-04  -1.66507845e-02  -7.49975626e-03  -6.90214331e-03
    6.59174812e-06  -8.83997596e-03  -3.46227377e+00   0.00000000e+00
    1.13921296e-01   1.25060822e+00  -8.55070896e-02  -2.16127460e-01
    1.36336998e-03   6.40738001e-03  -6.44614008e-02]]
[ 0.]

 Accuracy by admission status
absent_category    0   1
row_0                   
0                673  59
1                  4   4

 Percentage accuracy
0.914864864865


In [21]:
# Per-fold accuracy from 10-fold cross-validation of the lasso model.
cross_val_score(estimator=lrlasso, X=X, y=y, cv=10)

array([ 0.89333333,  0.89333333,  0.8       ,  0.90540541,  0.91891892,
        0.91891892,  0.91891892,  0.91780822,  0.91780822,  0.93150685])

In [22]:
# Average the ten fold accuracies into a single figure.
score = cross_val_score(estimator=lrlasso, X=X, y=y, cv=10)
print("Mean accuracy is {}".format(np.mean(score)))

Mean accuracy is 0.9029465630013576


### Ridge Logistic Regression ###

In [23]:
# L2-penalized (ridge) logistic regression with the default strength (C=1.0).
# The solver is pinned because scikit-learn's default solver changed between
# releases; 'liblinear' keeps the fit consistent across versions and matches
# the solver used for the other two models.
lrridge = LogisticRegression(penalty='l2', solver='liblinear')
y = category['absent_category']
X = absentee

# Fit the model.
fit = lrridge.fit(X, y)

# Display the fitted coefficients and intercept.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lrridge.predict(X)

# Rows of this table are the predicted label, columns the actual category.
print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

# Mean accuracy on the same data the model was trained on.
print('\n Percentage accuracy')
print(lrridge.score(X, y))

Coefficients
[[ -1.20629148e-02  -5.36818946e-04  -1.44742666e-03  -9.92817803e-05
   -1.72579948e-03  -7.91980879e-03  -5.70476536e-04  -3.15223505e-03
    7.79321482e-06  -1.02963705e-02  -1.19064620e-04  -2.87063791e-04
    5.12433532e-04   3.55561634e-04  -3.39382675e-05  -6.36852742e-04
   -6.41213434e-03  -1.14503182e-02  -4.18743286e-03]]
[-0.00010276]

 Accuracy by admission status
absent_category    0   1
row_0                   
0                677  63

 Percentage accuracy
0.914864864865


In [24]:
# Per-fold accuracy from 10-fold cross-validation of the ridge model.
cross_val_score(estimator=lrridge, X=X, y=y, cv=10)

array([ 0.90666667,  0.90666667,  0.90666667,  0.91891892,  0.91891892,
        0.91891892,  0.91891892,  0.91780822,  0.91780822,  0.91780822])

In [25]:
# Average the ten fold accuracies into a single figure.
score = cross_val_score(estimator=lrridge, X=X, y=y, cv=10)
print("Mean accuracy is {}".format(np.mean(score)))

Mean accuracy is 0.9149100333209923


## Overall Evaluation ##
The two models that performed the best were the OLS Logistic Regression and the Ridge Logistic Regression.  Essentially, the OLS Logistic Regression and the Ridge Logistic Regression were the same: the cross-validation scores for both models were identical.  Note, however, that both sklearn models predicted category 0 for every row, so their accuracy (677/740 ≈ 0.915) is simply the share of the majority class.  The Lasso Logistic Regression did not perform as well.  I believe this is because all the features in the model were most likely significant, and the Lasso removed features that were able to account for a bit of the variance.  

One of the strengths of regression is that it is not a black box model.  We are able to understand what goes into the formulation and prediction of a regression model.  One of the limitations of regression is the initial assumption that the residuals of the model will be normal.  It is pretty difficult to have normally distributed residuals in real life.  The validity of a regression model comes into question when the residuals do not follow a normal distribution.  (Strictly, this normality assumption belongs to ordinary linear regression; logistic regression models a binary outcome and does not require normally distributed residuals.)  