## ML with Pre-processed Data

### Import relavant libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
data_preprocessed = pd.read_csv('Preprocessed_Data.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
# Basically, Regression tells us which variables are more important for the analysis

### Creating Targets

In [5]:
# We will take the Median value of the 'Absenteeism Time in Hours' and use it as a cut-off line
#Everyone below Median is Moderately Absent
#Everyone above Median is Excessively Absent

In [6]:
# So, take Median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [7]:
# Moderately Absent(<=3)
# Excessively Absent(>=4)

In [8]:
# So, we make the 'Absenteeism Time in Hours' as 0 and 1
# If <=3 -->0
# if >=4 -->1

In [9]:
# Now convert all the data in 'Absenteeism Time in Hours' using np.where()
targets = np.where(data_preprocessed['Absenteeism Time in Hours']> data_preprocessed['Absenteeism Time in Hours'].median(),
                   1, 0)#(condition, True, False)

In [14]:
data_preprocessed['Excessive Absenteeism'] = targets

### A Note on targets here...

In [15]:
# Now, check the no. of 1s in the targets
targets.sum() / targets.shape[0]

0.45571428571428574

In [16]:
# It means 45% of data is having 1s and remaining 54% is 0s

In [17]:
# It's okay to have 50-50 data for Logistic Regression
# 60-40 split will usually work well for a Logistic Regression

In [18]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day', 'Daily Work Load Average', 'Distance to Work'], axis=1)

In [19]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [20]:
data_with_targets is data_preprocessed

False

### Selecting the inputs for Regression

In [21]:
data_with_targets.shape

(700, 12)

In [22]:
# Now we will select all columns except the Last(Excessive Absenteeism) column
unscaled_inputs = data_with_targets.iloc[:,:-1]

### Standardize the Data

In [23]:
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [27]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [28]:
# Columns to scale
# Except Reason_1, Reason_2, Reason_3, Reason4, Education. Remaining columns to be scaled
# columns_to_scale = ['Month', 'Day',
#        'Transportation Expense', 'Distance to Work', 'Age',
#        'Daily Work Load Average', 'Body Mass Index',
#        'Children', 'Pets']

columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']

columns_to_scale = [feature for feature in unscaled_inputs.columns.values if feature not in columns_to_omit]

In [29]:
#Now create an instance to StandardScaler
absenteeism_scaler = CustomScaler(columns_to_scale)

In [30]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [31]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [32]:
#To initialize the scaling mechanism and applied changes are in absenteeism_scaler object itself
# absenteeism_scaler.fit(unscaled_inputs)

In [33]:
# To Apply Standardization, we use transform() method
# Subtract the Mean and Divide by Standard Deviation(Standardization)
# scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [34]:
# Checking how the data is
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [35]:
# Wow!!! The data has been Standardized

In [36]:
scaled_inputs.shape

(700, 11)

### Splitting the data to Train and Test and Shuffling

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, test_size=0.33, random_state=42)

In [39]:
print(X_train.shape, y_train.shape)

(469, 11) (469,)


In [40]:
print(X_test.shape, y_test.shape)

(231, 11) (231,)


### Logistic Regression with sklearn

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [42]:
log = LogisticRegression()

In [43]:
log.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
log.score(X_train, y_train)

0.7782515991471215

### Checking Accuracy manually

In [45]:
outputs = log.predict(X_train)

In [46]:
outputs

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,

In [47]:
y_train

array([0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,

In [42]:
outputs == y_train

array([ True, False, False,  True,  True, False, False,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,

In [48]:
# We can sum it to check how many are True(as True = 1)
expected_correct = np.sum((outputs == y_train))

In [49]:
outputs.shape[0]

469

In [50]:
# Expected correctly / Total outputs
expected_correct / outputs.shape[0]

0.7782515991471215

### Finding an Intercept and coefficient

In [51]:
log.intercept_

array([-1.40817991])

In [52]:
log.coef_

array([[ 2.4636391 ,  0.61401849,  2.92963192,  0.66583001,  0.08016529,
         0.71146674, -0.18549707,  0.20562576, -0.12366107,  0.36456587,
        -0.26031316]])

In [53]:
# We should be aware that above coef_ array values corresponds to which Column Name...So
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [54]:
feature_name = unscaled_inputs.columns.values

In [55]:
summary_table = pd.DataFrame(columns=['Feature Name'], data=feature_name)

summary_table['Coefficient'] = np.transpose(log.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason 1,2.463639
1,Reason 2,0.614018
2,Reason 3,2.929632
3,Reason 4,0.66583
4,Month,0.080165
5,Transportation Expense,0.711467
6,Age,-0.185497
7,Body Mass Index,0.205626
8,Education,-0.123661
9,Children,0.364566


In [58]:
summary_table.index = summary_table.index + 1

summary_table.loc[0] = ['Intercept', log.intercept_[0]]

summary_table = summary_table.sort_index()

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.40818
1,Intercept,-1.40818
2,Intercept,-1.40818
3,Reason 1,2.463639
4,Reason 2,0.614018
5,Reason 3,2.929632
6,Reason 4,0.66583
7,Month,0.080165
8,Transportation Expense,0.711467
9,Age,-0.185497


In [59]:
# Coefficients are also called as Weights while Intercept is bias

In [61]:
# If weight is bigger, that feature is important

### Interpreting Coefficients

In [62]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [63]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-1.40818,0.244588
1,Intercept,-1.40818,0.244588
2,Intercept,-1.40818,0.244588
3,Reason 1,2.463639,11.747484
4,Reason 2,0.614018,1.847842
5,Reason 3,2.929632,18.720738
6,Reason 4,0.66583,1.946105
7,Month,0.080165,1.083466
8,Transportation Expense,0.711467,2.036977
9,Age,-0.185497,0.830691


In [64]:
# Sorting the summary_table based on Odds_ratio
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
5,Reason 3,2.929632,18.720738
3,Reason 1,2.463639,11.747484
8,Transportation Expense,0.711467,2.036977
6,Reason 4,0.66583,1.946105
4,Reason 2,0.614018,1.847842
12,Children,0.364566,1.439889
10,Body Mass Index,0.205626,1.228293
7,Month,0.080165,1.083466
11,Education,-0.123661,0.883679
9,Age,-0.185497,0.830691


In [65]:
# All features are Standardized(Including Dummies!!!:())

In [66]:
# So we just standardize only the columns which are not Dummy
# So go to Standardized code and do Standardization again and then fit model

In [67]:
# The intercept or the BIAS 'calibrates' the model

### Backward Elimination

#### We can simplify our model by removing all features which have close to no contribution to the model

#### When we have p-values, we get rid of all coefficients with p-values >0.05

In [68]:
# So, we start executing the code from our last checkpoint, where we created targets

In [69]:
# After Removing 'Day','Distance to Work', 'Work Load in Hours' columns also we got same Accuracy. So its perfect

## Testing the model

In [70]:
# Till now, we have Train Accuracy of 77% only

In [71]:
# Now, we use test data to Test our model

In [72]:
log.score(X_test, y_test)

0.7619047619047619

### Test Accuray is always < Train Accuracy

In [73]:
# Now, we calculate probability of being 0 or 1
predicted_proba = log.predict_proba(X_test)
predicted_proba

array([[0.78901521, 0.21098479],
       [0.83753351, 0.16246649],
       [0.780261  , 0.219739  ],
       [0.57832534, 0.42167466],
       [0.54096943, 0.45903057],
       [0.09549651, 0.90450349],
       [0.64140547, 0.35859453],
       [0.36240816, 0.63759184],
       [0.6658314 , 0.3341686 ],
       [0.75027863, 0.24972137],
       [0.85251061, 0.14748939],
       [0.64757634, 0.35242366],
       [0.32318321, 0.67681679],
       [0.41576861, 0.58423139],
       [0.73133658, 0.26866342],
       [0.50632074, 0.49367926],
       [0.87025969, 0.12974031],
       [0.21283988, 0.78716012],
       [0.8553653 , 0.1446347 ],
       [0.55231334, 0.44768666],
       [0.75533877, 0.24466123],
       [0.75454233, 0.24545767],
       [0.68406891, 0.31593109],
       [0.65278206, 0.34721794],
       [0.86352415, 0.13647585],
       [0.16334283, 0.83665717],
       [0.56360305, 0.43639695],
       [0.53090822, 0.46909178],
       [0.77110106, 0.22889894],
       [0.56922403, 0.43077597],
       [0.

In [74]:
predicted_proba.shape

(231, 2)

In [70]:
# The above has 2 columns. First col is Not Excessively Absent, Second cols is Excessively Absent
# So, we have to extract only the Second column
predicted_proba[:,1]

array([0.21098479, 0.16246649, 0.219739  , 0.42167466, 0.45903057,
       0.90450349, 0.35859453, 0.63759184, 0.3341686 , 0.24972137,
       0.14748939, 0.35242366, 0.67681679, 0.58423139, 0.26866342,
       0.49367926, 0.12974031, 0.78716012, 0.1446347 , 0.44768666,
       0.24466123, 0.24545767, 0.31593109, 0.34721794, 0.13647585,
       0.83665717, 0.43639695, 0.46909178, 0.22889894, 0.43077597,
       0.10690138, 0.13079925, 0.58878269, 0.59530597, 0.27318427,
       0.64023795, 0.31593109, 0.12318871, 0.85712932, 0.18820397,
       0.50512274, 0.24124336, 0.62694929, 0.13906311, 0.21314174,
       0.68920274, 0.78720567, 0.86056904, 0.3061224 , 0.14182606,
       0.24972137, 0.31100553, 0.45903057, 0.91955359, 0.12821902,
       0.21314174, 0.97312871, 0.27318427, 0.87014566, 0.2742811 ,
       0.66600591, 0.10910632, 0.48267275, 0.63759184, 0.14748939,
       0.4805062 , 0.63122933, 0.07511117, 0.26419019, 0.45971127,
       0.24972137, 0.22889894, 0.69115529, 0.31100553, 0.13117

### From here, We are almost done. So we have to save this model to our future uses for new datasets and pass that dataset through SQL and Analyze it by Tableau

## Save the Model

In [75]:
# Saving the model means saving LogisticRegression() object

In [76]:
# To save a model, we have many ways. But we use a very simple and general way: Python Pickling

In [77]:
# Pickle - is a Python module used to convert a Python object into a character stream
# Good thing is the file size is Less than 1KB!!!

In [78]:
import pickle

In [79]:
with open('Logistic Model', 'wb') as file:#open('filename', writebytes mode)
    pickle.dump(log, file)#log->The object which we want to store

In [80]:
# We should pickle absenteeism_scaler too. As it Standardized the features

In [81]:
with open('Absenteeism scaler', 'wb') as file2:
    pickle.dump(absenteeism_scaler,file2)

## Deployment

In [82]:
# The 2nd step of the deployment is about creating a mechanism to load the saved model and make predictions

### Deployment can be done in 2 ways...
#### 1. Clumsy(We just load the New data and execute all steps)
#### 2. Clever(We create a Module)