In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
preprocessed_data = pd.read_csv('preprocessed_data.csv')
preprocessed_data.columns

Index(['Unnamed: 0', 'reason_type_1', 'reason_type_2', 'reason_type_3',
       'reason_type_4', 'Month value', 'Weekday', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours'],
      dtype='object')

In [3]:
preprocessed_data.columns = ['Unnamed: 0', 'Reason_1', 'Reason_2', 'Reason_3',
       'Reason_4', 'Month value', 'Weekday', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']

In [4]:
preprocessed_data = preprocessed_data.iloc[:,1:]
preprocessed_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


In [5]:
preprocessed_data['Absenteeism Time in Hours'].unique()

array([  4,   0,   2,   8,  40,   1,   7,   3,  32,   5,  16,  24,  64,
        56,  80, 120, 112, 104,  48], dtype=int64)

In [6]:
preprocessed_data['Absenteeism Time in Hours'].median()

3.0

In [7]:
preprocessed_data['Excessive Absentees'] = np.where(preprocessed_data['Absenteeism Time in Hours'] > preprocessed_data['Absenteeism Time in Hours'].median(),
                                                   1, 0)

In [8]:
preprocessed_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absentees
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [9]:
preprocessed_data.drop(['Absenteeism Time in Hours'], axis=1, inplace=True)

In [10]:
preprocessed_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absentees
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,0


In [11]:
preprocessed_data['Excessive Absentees'].sum()

319

In [12]:
targets = pd.DataFrame(preprocessed_data.iloc[:,-1])

In [13]:
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Reason_1                 700 non-null    int64  
 1   Reason_2                 700 non-null    int64  
 2   Reason_3                 700 non-null    int64  
 3   Reason_4                 700 non-null    int64  
 4   Month value              700 non-null    int64  
 5   Weekday                  700 non-null    int64  
 6   Transportation Expense   700 non-null    int64  
 7   Distance to Work         700 non-null    int64  
 8   Age                      700 non-null    int64  
 9   Daily Work Load Average  700 non-null    float64
 10  Body Mass Index          700 non-null    int64  
 11  Education                700 non-null    int64  
 12  Children                 700 non-null    int64  
 13  Pets                     700 non-null    int64  
 14  Excessive Absentees      7

In [14]:
inputs = pd.DataFrame(preprocessed_data.iloc[:,:-1])

In [15]:
targets

Unnamed: 0,Excessive Absentees
0,1
1,0
2,0
3,1
4,0
...,...
695,1
696,0
697,1
698,0


In [16]:
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(inputs, targets, random_state=20)

In [18]:
X_train

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
266,0,0,0,1,8,2,118,13,50,265.615,31,0,1,0
424,1,0,0,0,4,2,118,10,37,239.409,28,0,0,0
290,0,0,0,1,10,2,235,11,37,265.017,29,1,1,1
397,0,0,0,1,3,1,378,49,36,244.387,21,0,2,4
398,0,0,0,1,3,0,179,51,38,244.387,31,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,1,0,0,0,5,2,118,13,50,378.884,31,0,1,0
223,0,0,0,1,6,3,291,31,40,377.550,25,0,1,1
271,0,0,0,1,9,4,179,51,38,294.217,31,0,0,0
474,0,0,0,1,7,4,361,52,28,230.290,27,0,1,4


In [19]:
y_train

Unnamed: 0,Excessive Absentees
266,0
424,0
290,1
397,1
398,1
...,...
218,1
223,1
271,0
474,1


In [20]:
525/700

0.75

In [21]:
reg1 = LogisticRegression()

In [22]:
reg1.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
reg1.score(X_train, y_train)

0.7295238095238096

## Standardize

In [25]:
inputs.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month value',
       'Weekday', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets'],
      dtype='object')

In [26]:
inputs.drop(['Daily Work Load Average', 'Weekday'], axis=1, inplace=True)

In [27]:
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Transportation Expense,Distance to Work,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,36,33,30,0,2,1
1,0,0,0,0,7,118,13,50,31,0,1,0
2,0,0,0,1,7,179,51,38,31,0,0,0
3,1,0,0,0,7,279,5,39,24,0,2,0
4,0,0,0,1,7,289,36,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,22,40,22,1,2,0
696,1,0,0,0,5,225,26,28,24,0,1,2
697,1,0,0,0,5,330,16,28,25,1,0,0
698,0,0,0,1,5,235,16,32,25,1,0,0


In [28]:
# scaled_data = StandardScaler()
# scaled_data.fit(preprocessed_data.iloc[:,:-1])
# standardized_data = scaled_data.transform(preprocessed_data.iloc[:,:-1])
# standardized_data

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self,columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.array(np.mean(X[self.columns]))
        self.var_ = np.array(np.var(X[self.columns]))
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

columns = ['Month value', 'Transportation Expense', 'Distance to Work',
       'Age', 'Body Mass Index',
       'Children', 'Pets']
scaler = CustomScaler(columns)

In [29]:
# standardized_data.shape
scaler.fit(inputs)
scaler.transform(inputs)
standardized_data = scaler.transform(inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [30]:
# standardized_data[0]
standardized_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Transportation Expense,Distance to Work,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,0.412816,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,-1.141882,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,1.426749,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,-1.682647,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,0.412816,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,-0.533522,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-0.263140,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-0.939096,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.939096,-0.692937,-0.408580,1,-0.919030,-0.589690


In [31]:
X_train, X_test, y_train, y_test = train_test_split(standardized_data, preprocessed_data.iloc[:,-1].values, random_state=20)

In [32]:
X_train

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Transportation Expense,Distance to Work,Age,Body Mass Index,Education,Children,Pets
266,0,0,0,1,0.468236,-1.574681,-1.141882,2.130803,1.002633,0,-0.019280,-0.589690
424,1,0,0,0,-0.673803,-1.574681,-1.344669,0.091435,0.297027,0,-0.919030,-0.589690
290,0,0,0,1,1.039256,0.190942,-1.277074,0.091435,0.532229,1,-0.019280,0.268487
397,0,0,0,1,-0.959313,2.348925,1.291558,-0.065439,-1.349389,0,0.880469,2.843016
398,0,0,0,1,-0.959313,-0.654143,1.426749,0.248310,1.002633,0,-0.919030,-0.589690
...,...,...,...,...,...,...,...,...,...,...,...,...
218,1,0,0,0,-0.388293,-1.574681,-1.141882,2.130803,1.002633,0,-0.019280,-0.589690
223,0,0,0,1,-0.102784,1.036026,0.074838,0.562059,-0.408580,0,-0.019280,0.268487
271,0,0,0,1,0.753746,-0.654143,1.426749,0.248310,1.002633,0,-0.919030,-0.589690
474,0,0,0,1,0.182726,2.092381,1.494345,-1.320435,0.061825,0,-0.019280,2.843016


In [33]:
reg1.fit(X_train, y_train)

In [34]:
reg1.score(X_train, y_train)

0.7866666666666666

In [35]:
reg1.predict(X_train)

array([0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,

In [36]:
reg1.coef_

array([[ 2.73261951,  0.97820841,  3.03311159,  0.70966746,  0.17570843,
         0.65119921, -0.01678255, -0.1826429 ,  0.3488031 , -0.34761946,
         0.28416011, -0.28651989]])

In [37]:
reg1.intercept_

array([-1.50938203])

In [38]:
reg1.coef_.size

12

In [39]:
# log(odds) = intercept + c1*a1 + c2*a2 + c3*a3 + ..... + c14*a14
reg_analysis = pd.DataFrame(reg1.coef_.T, columns=['Coefficients'])

In [40]:
reg_analysis

Unnamed: 0,Coefficients
0,2.73262
1,0.978208
2,3.033112
3,0.709667
4,0.175708
5,0.651199
6,-0.016783
7,-0.182643
8,0.348803
9,-0.347619


In [41]:
reg_analysis['Coefficients']

0     2.732620
1     0.978208
2     3.033112
3     0.709667
4     0.175708
5     0.651199
6    -0.016783
7    -0.182643
8     0.348803
9    -0.347619
10    0.284160
11   -0.286520
Name: Coefficients, dtype: float64

In [42]:
# reg_analysis = pd.concat([pd.DataFrame(preprocessed_data.iloc[:,:-1].columns, columns=['Feature names']), reg_analysis], axis=1)
reg_analysis = pd.concat([pd.DataFrame(inputs.columns, columns=['Feature names']), reg_analysis], axis=1)

In [43]:
reg_analysis

Unnamed: 0,Feature names,Coefficients
0,Reason_1,2.73262
1,Reason_2,0.978208
2,Reason_3,3.033112
3,Reason_4,0.709667
4,Month value,0.175708
5,Transportation Expense,0.651199
6,Distance to Work,-0.016783
7,Age,-0.182643
8,Body Mass Index,0.348803
9,Education,-0.347619


In [44]:
reg_analysis.sort_values(by=['Coefficients'], ascending=False)

Unnamed: 0,Feature names,Coefficients
2,Reason_3,3.033112
0,Reason_1,2.73262
1,Reason_2,0.978208
3,Reason_4,0.709667
5,Transportation Expense,0.651199
8,Body Mass Index,0.348803
10,Children,0.28416
4,Month value,0.175708
6,Distance to Work,-0.016783
7,Age,-0.182643


In [45]:
# log(odds) = intercept + c1*a1 + c2*a2 + c3*a3 + ..... + c14*a14

In [46]:
reg_analysis['Odd Ratio'] = np.exp(reg_analysis['Coefficients'])

In [47]:
reg_analysis

Unnamed: 0,Feature names,Coefficients,Odd Ratio
0,Reason_1,2.73262,15.373104
1,Reason_2,0.978208,2.659687
2,Reason_3,3.033112,20.761734
3,Reason_4,0.709667,2.033315
4,Month value,0.175708,1.19209
5,Transportation Expense,0.651199,1.917839
6,Distance to Work,-0.016783,0.983357
7,Age,-0.182643,0.833066
8,Body Mass Index,0.348803,1.41737
9,Education,-0.347619,0.706368


In [48]:
reg_analysis.sort_values(by=['Odd Ratio'], ascending=False)

Unnamed: 0,Feature names,Coefficients,Odd Ratio
2,Reason_3,3.033112,20.761734
0,Reason_1,2.73262,15.373104
1,Reason_2,0.978208,2.659687
3,Reason_4,0.709667,2.033315
5,Transportation Expense,0.651199,1.917839
8,Body Mass Index,0.348803,1.41737
10,Children,0.28416,1.328646
4,Month value,0.175708,1.19209
6,Distance to Work,-0.016783,0.983357
7,Age,-0.182643,0.833066


In [49]:
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month value,Transportation Expense,Distance to Work,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,36,33,30,0,2,1
1,0,0,0,0,7,118,13,50,31,0,1,0
2,0,0,0,1,7,179,51,38,31,0,0,0
3,1,0,0,0,7,279,5,39,24,0,2,0
4,0,0,0,1,7,289,36,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,22,40,22,1,2,0
696,1,0,0,0,5,225,26,28,24,0,1,2
697,1,0,0,0,5,330,16,28,25,1,0,0
698,0,0,0,1,5,235,16,32,25,1,0,0


In [50]:
import pickle

In [51]:
help(pickle.dump)

Help on built-in function dump in module _pickle:

dump(obj, file, protocol=None, *, fix_imports=True, buffer_callback=None)
    Write a pickled representation of obj to the open file object file.

    This is equivalent to ``Pickler(file, protocol).dump(obj)``, but may
    be more efficient.

    The optional *protocol* argument tells the pickler to use the given
    protocol; supported protocols are 0, 1, 2, 3, 4 and 5.  The default
    protocol is 4. It was introduced in Python 3.4, and is incompatible
    with previous versions.

    Specifying a negative protocol version selects the highest protocol
    version supported.  The higher the protocol used, the more recent the
    version of Python needed to read the pickle produced.

    The *file* argument must have a write() method that accepts a single
    bytes argument.  It can thus be a file object opened for binary
    writing, an io.BytesIO instance, or any other custom object that meets
    this interface.

    If *fix_import

In [52]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)

In [53]:
with open('model', 'wb') as file:
    pickle.dump(reg1, file)

In [54]:
reg1.predict_proba(X_test)

array([[0.69097637, 0.30902363],
       [0.57967688, 0.42032312],
       [0.38521162, 0.61478838],
       [0.76959396, 0.23040604],
       [0.09635386, 0.90364614],
       [0.30641527, 0.69358473],
       [0.30758632, 0.69241368],
       [0.1229497 , 0.8770503 ],
       [0.7940963 , 0.2059037 ],
       [0.73210992, 0.26789008],
       [0.47181079, 0.52818921],
       [0.22701991, 0.77298009],
       [0.08406132, 0.91593868],
       [0.74717129, 0.25282871],
       [0.33284903, 0.66715097],
       [0.54626469, 0.45373531],
       [0.5344344 , 0.4655656 ],
       [0.52193328, 0.47806672],
       [0.426902  , 0.573098  ],
       [0.04771569, 0.95228431],
       [0.70017496, 0.29982504],
       [0.76959396, 0.23040604],
       [0.38987063, 0.61012937],
       [0.38987063, 0.61012937],
       [0.24524278, 0.75475722],
       [0.75006587, 0.24993413],
       [0.51550504, 0.48449496],
       [0.87863156, 0.12136844],
       [0.17964474, 0.82035526],
       [0.76959396, 0.23040604],
       [0.

In [55]:
pred_scores = reg1.predict(X_test)

In [56]:
(pred_scores == y_test).sum()

131

In [57]:
len(y_test)

175

In [58]:
131/175*100

74.85714285714286

In [59]:
reg1.score(X_test,y_test)

0.7485714285714286

In [60]:
scaler