# PART-2 (Using a CLASS to Automate Model building): Saving the Model, &  object that was used for training data!

## Importing the Libraries, Data till Obtaining unscaled_inputs

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,1,0,0,5,3,235,16,32,237.656,25,1,0,0,2


In [4]:
# Label an observation 1 ("excessive") when its absence hours exceed the
# median of the column, otherwise 0.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [5]:
data_preprocessed['Excessive Absenteeism'] = targets

In [6]:
# The raw hours column is replaced by the binary target, so drop it here.
# drop() returns a new frame; data_preprocessed itself is left untouched.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [7]:
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,1,0,0,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,1,0,0,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,1,0,0,7,3,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,1
698,0,1,0,0,5,3,235,16,32,237.656,25,1,0,0,0


In [47]:
data_with_targets is data_preprocessed

False

In [8]:
unscaled_inputs = data_with_targets.iloc[ :, :-1]

# -> Coding the CustomScaler Class with 3 Methods (init, fit, transform) to Scale Only the Non-Dummy Variables

A big, big problem, which we are not going to like.

When we standardize the inputs, we also standardize the dummies.

This is bad practice because when we standardize, we lose the whole interpretability of a dummy.


In [9]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


In [10]:
# create the Custom Scaler class with 3 Methods: scaling, fitting, transforming

# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    The scaling itself is delegated to sklearn's StandardScaler, which is
    fitted only on ``columns``. All other columns (e.g. dummy variables) pass
    through ``transform`` unchanged, and the original column order is restored.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded unchanged to StandardScaler.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store the constructor arguments as-is; the inner StandardScaler is
        # only created in fit().
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the inner StandardScaler on ``X[self.columns]``.

        Also records the per-column mean and (population) variance of the
        selected columns in ``mean_`` and ``var_``. Returns ``self``.
        """
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        # Reduce explicitly along axis=0. np.mean / np.var on a DataFrame
        # forward axis=None, which raises a FutureWarning on older pandas and
        # collapses to a single scalar on newer pandas.
        self.mean_ = X[self.columns].mean(axis=0)
        self.var_ = X[self.columns].var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility but are
        not used. The result has the same columns, column order and row index
        as ``X``.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. The row index of X is reused so the
        # concat below aligns row-for-row even when X does not carry a default
        # 0..n-1 index (for example a subset produced by a train/test split).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling is passed through as-is
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# check what are all columns that we've got
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [12]:
# choose the columns to scale
# we later augmented this code and put it in comments
# NOTE: the hand-written list below is stale - 'Month Value' and 'Pet' do not match
# the actual column names ('Month_Value', 'Pets'); the omit-list approach replaces it
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']
    
# select the columns to omit first
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [13]:
# Derive the columns to scale: every input column that is not in the omit
# list, in the original column order.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [14]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)


In [15]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [16]:
# standardizes the data, using the transform method 
# in the last line, we fitted the data - in other words
# we found the internal parameters of a model that will be used to transform data. 
# transforming applies these parameters to our data
# note that when you get new data, you can just call 'scaler' again and transform it in the same way as now
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [17]:
# scaled_inputs is a pandas DataFrame (not an ndarray): CustomScaler.transform
# rebuilds a frame and restores the original column order
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,1,0,0,0.030796,-0.800950,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,1,0,0,0.030796,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,1,0,0,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.232900,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,-0.232900,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,0.335149,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,1,0,0,-0.568019,0.335149,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


# -> Splitting the Data then Model Building then evaluating Training Model Accuracy

In [18]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [19]:
# Split inputs and targets into train and test parts, holding out 20% for
# testing; the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [20]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(560, 14) (560,)


In [21]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [22]:
# create a logistic regression object
reg = LogisticRegression()

In [23]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

In [24]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7642857142857142

## Manually Cross Verifying the Training Accuracy

In [26]:
model_outputs = reg.predict(x_train)


In [27]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [28]:
# Accuracy = correct predictions / number of observations, i.e. the mean of
# the element-wise "prediction equals target" comparison.
# This reproduces the value reported by reg.score(x_train, y_train).
np.mean(model_outputs == y_train)

0.7642857142857142

We get exactly the same result as with the sklearn `score` method; however, this time we have a much better idea of what that result means.

# -> Extracting the Intercept and Coefficients from a Logistic Regression

In [29]:
reg.intercept_

array([-1.68075271])

In [30]:
reg.coef_

array([[ 2.86020799,  0.88489021,  1.69193557,  1.69193557,  0.01155936,
        -0.07965212,  0.64050779, -0.02735253, -0.16873439, -0.02869203,
         0.27821817, -0.26757397,  0.36892146, -0.289534  ]])

In [31]:
# This is the correct variable to extract the feature names from (it is a DataFrame)
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [32]:
# Store it in a new variable called ‘feature_name’
feature_name = unscaled_inputs.columns.values

## Creation of Summary Table:

In [33]:
# One row per input feature, paired with its fitted coefficient.
# reg.coef_ is a 2-D array with a single row, so row 0 holds the weights.
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['Coefficient'] = reg.coef_[0]

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.860208
1,Reason_2,0.884890
2,Reason_3,1.691936
3,Reason_4,1.691936
4,Month_Value,0.011559
...,...,...
9,Daily Work Load Average,-0.028692
10,Body Mass Index,0.278218
11,Education,-0.267574
12,Children,0.368921


In [34]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt from the feature rows only, so re-running this cell
# neither shifts the index again nor adds a duplicate 'Intercept' row.
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg.intercept_[0]]})
feature_rows = summary_table.loc[summary_table['Feature name'] != 'Intercept',
                                 ['Feature name', 'Coefficient']]
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)

summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.680753
1,Reason_1,2.860208
2,Reason_2,0.884890
3,Reason_3,1.691936
4,Reason_4,1.691936
...,...,...
10,Daily Work Load Average,-0.028692
11,Body Mass Index,0.278218
12,Education,-0.267574
13,Children,0.368921


## -> Interpreting the Logistic Regression Coefficients

In [35]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [36]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.680753,0.186234
1,Reason_1,2.860208,17.465159
2,Reason_2,0.884890,2.422718
3,Reason_3,1.691936,5.429981
4,Reason_4,1.691936,5.429981
...,...,...,...
10,Daily Work Load Average,-0.028692,0.971716
11,Body Mass Index,0.278218,1.320774
12,Education,-0.267574,0.765234
13,Children,0.368921,1.446174


In [44]:
# Un-Sorted 'Odds_ratio' Summary table
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (summary_table)

               Feature name  Coefficient  Odds_ratio
0                 Intercept    -1.680753    0.186234
1                  Reason_1     2.860208   17.465159
2                  Reason_2     0.884890    2.422718
3                  Reason_3     1.691936    5.429981
4                  Reason_4     1.691936    5.429981
5               Month_Value     0.011559    1.011626
6           Day of the Week    -0.079652    0.923438
7    Transportation Expense     0.640508    1.897444
8          Distance to Work    -0.027353    0.973018
9                       Age    -0.168734    0.844733
10  Daily Work Load Average    -0.028692    0.971716
11          Body Mass Index     0.278218    1.320774
12                Education    -0.267574    0.765234
13                 Children     0.368921    1.446174
14                     Pets    -0.289534    0.748612


In [45]:
# SORTED summary_table
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(summary_table.sort_values('Odds_ratio', ascending=False))

               Feature name  Coefficient  Odds_ratio
1                  Reason_1     2.860208   17.465159
3                  Reason_3     1.691936    5.429981
4                  Reason_4     1.691936    5.429981
2                  Reason_2     0.884890    2.422718
7    Transportation Expense     0.640508    1.897444
13                 Children     0.368921    1.446174
11          Body Mass Index     0.278218    1.320774
5               Month_Value     0.011559    1.011626
8          Distance to Work    -0.027353    0.973018
10  Daily Work Load Average    -0.028692    0.971716
6           Day of the Week    -0.079652    0.923438
9                       Age    -0.168734    0.844733
12                Education    -0.267574    0.765234
14                     Pets    -0.289534    0.748612
0                 Intercept    -1.680753    0.186234


# -> Interpreting the Important Predictors

So by looking at the coefficients table, we will notice that the most strongly pronounced features seem to be the four reasons for absence, the transportation expense and whether a person has children, pets and education

Note that pet and education are at the bottom of the table, but their weights are still far away from zero.

------------------------------------

Pets: The odds are 1 - 0.748612 ≈ 25% lower than for the base model (no pet).
A possible explanation: if you have several pets, you're probably not taking care of them on your own.
Not being solely responsible for them implies somebody else can take them to the doctor if something is wrong.


--------------------------------------------------
Daily Work Load Average, Distance to Work and Day of the Week seem to have the smallest impact: their weights are almost zero.
So regardless of their particular values, they will barely affect our model.

-------------------------------------------


Reason_1 Various diseases

Reason_2 Pregnancy and giving birth

Reason_3 Poisoning

Reason_4 Light diseases

---------------------------------------------------

ML Engineers: Prefer models with higher accuracy, so they normally go for standardization
    
Econometricians and Statisticians: Prefer less accurate but more interpretable models, because they care about the underlying reasons behind different phenomena.
    
Data Scientists: May be in either position.
    Sometimes they need higher accuracy; other times they must find the main drivers of a problem.

-------------------------------------
Intercept : It is used to get more accurate predictions, but there's no specific meaning attached to it.
That's why in machine learning, you can say that it calibrates the model and you can also call it a bias.

Nevertheless, without an intercept, each prediction would be off the mark by precisely that value.


# <font color=red> -> Simplifying the Model (Backward Elimination) Again From START

We analyzed the coefficients and noticed that Daily Work Load Average, Distance to Work and Day of the Week seemed to have the lowest impact, if we can even call their contribution an impact, as their weights are almost zero!

There is a concept called backward elimination.

The idea is that we can simplify our model by removing all features which have close to no contribution to the model.

**Usually when we have the P values of variables, we get rid of all coefficients with P values above 0.05**


Let's go back to the checkpoint when we created the targets.
i.e data_with_targets

DROP additional 3 features now:  daily workload average, distance to work and day of the week  

daily workload average, distance to work and day of the week 

In [2]:
import pandas as pd
import numpy as np

data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
# binary target: 1 when the absence hours exceed the column median, else 0
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

# store the target as a new column next to the inputs
data_preprocessed['Excessive Absenteeism'] = targets

In [4]:
# Checkpoint: remove the raw hours column together with the three features
# that were eliminated after inspecting the coefficient table.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)

In [5]:
unscaled_inputs = data_with_targets.iloc[ :, :-1]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    The scaling itself is delegated to sklearn's StandardScaler, which is
    fitted only on ``columns``. All other columns (e.g. dummy variables) pass
    through ``transform`` unchanged, and the original column order is restored.

    Parameters
    ----------
    columns : list of column labels to standardize.
    copy, with_mean, with_std : forwarded unchanged to StandardScaler.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store the constructor arguments as-is; the inner StandardScaler is
        # only created in fit().
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the inner StandardScaler on ``X[self.columns]``.

        Also records the per-column mean and (population) variance of the
        selected columns in ``mean_`` and ``var_``. Returns ``self``.
        """
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        # Reduce explicitly along axis=0. np.mean / np.var on a DataFrame
        # forward axis=None, which raises a FutureWarning on older pandas and
        # collapses to a single scalar on newer pandas.
        self.mean_ = X[self.columns].mean(axis=0)
        self.var_ = X[self.columns].var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility but are
        not used. The result has the same columns, column order and row index
        as ``X``.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. The row index of X is reused so the
        # concat below aligns row-for-row even when X does not carry a default
        # 0..n-1 index (for example a subset produced by a train/test split).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling is passed through as-is
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [7]:
# columns that must NOT be standardized
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

# every remaining input column gets standardized (original order preserved)
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [8]:
# declare a scaler object, specifying the columns you want to scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [9]:
# fit the data (calculate mean and standard deviation); they are automatically stored inside the object 
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [10]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [11]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

# Split inputs and targets into train and test parts, holding out 20% for
# testing; the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [12]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


reg = LogisticRegression()

# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

In [13]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7696428571428572

**From 0.764 ~ 76% we tweaked the accuracy slightly to 0.7696 ~ 0.77 i.e 77%**

This shows us that the three variables we dropped were useless: with or without them, we obtain practically the same results.

Either way, a simpler model is always preferable!


# -> Testing the Machine Learning Model (x_test, y_test)

So far, when referring to the model accuracy, we meant the train accuracy, at this stage, our train accuracy is around 77 percent

Our algorithm has seen this train data many times, in fact, thousands of times during the training process.
So it has learned to model it quite well.

However, it may fail miserably when provided with new data


**It is the time to use the test data.** 

That's because testing is done only once and at the very end of the machine learning process.

Let's finally test it. The first task is to find the accuracy in the same manner as before, with `reg.score`, but this time the inputs are `x_test` and the targets are `y_test`.

In [14]:
reg.score(x_test, y_test)

0.7285714285714285

what we get is a different number, around 73%

So, based on data that the model has never seen before, we can say that in 73 percent of the cases the model will correctly predict whether a person is going to be excessively absent.

Overfit:
    
By definition, the test accuracy is expected to be lower than the train accuracy; if we get a higher number, then we either got lucky or made a mistake.
Often it is dramatically lower than the train accuracy, something like 10 percent or even 20 percent lower; this would mean that our model has overfit.


**Outputs**

Apart from the accuracy, we can get the outputs themselves using the predict method, but we already did that earlier for the training data.

In [16]:
# NOTE: these are the predicted classes for the TRAINING inputs (x_train),
# not for the test set that this section otherwise evaluates
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

One method will prove much more useful for our analysis: instead of 0 and 1, we can get the probability of an output being 0 or 1.
There is an sklearn method called `predict_proba`.


**Let `predicted_proba` be equal to `reg.predict_proba` of `x_test`.**

In [18]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.7572248 , 0.2427752 ],
       [0.607044  , 0.392956  ],
       [0.48219999, 0.51780001],
       [0.759255  , 0.240745  ],
       [0.08056244, 0.91943756],
       [0.30673682, 0.69326318],
       [0.30751711, 0.69248289],
       [0.09554002, 0.90445998],
       [0.74431087, 0.25568913],
       [0.75809631, 0.24190369],
       [0.51172276, 0.48827724],
       [0.18770963, 0.81229037],
       [0.0494019 , 0.9505981 ],
       [0.70512716, 0.29487284],
       [0.25174209, 0.74825791],
       [0.51523828, 0.48476172],
       [0.51132743, 0.48867257],
       [0.51330394, 0.48669606],
       [0.36266739, 0.63733261],
       [0.05241224, 0.94758776],
       [0.74310459, 0.25689541],
       [0.759255  , 0.240745  ],
       [0.48540722, 0.51459278],
       [0.48382656, 0.51617344],
       [0.19499174, 0.80500826],
       [0.74461186, 0.25538814],
       [0.50442424, 0.49557576],
       [0.877875  , 0.122125  ],
       [0.20605558, 0.79394442],
       [0.759255  , 0.240745  ],
       [0.

the 1st  column shows the probability our model assigned to the observation being 0.

and the 2nd shows the probability, the model assigned to the observation being 1.


summing any two numbers horizontally will give you an output of 1.

**NOTE:** what we are interested in is a probability of excessive absenteeism! i.e 2nd column(Model assigned to the observation being 1.)

In [19]:
predicted_proba[ : , 1]

array([0.2427752 , 0.392956  , 0.51780001, 0.240745  , 0.91943756,
       0.69326318, 0.69248289, 0.90445998, 0.25568913, 0.24190369,
       0.48827724, 0.81229037, 0.9505981 , 0.29487284, 0.74825791,
       0.48476172, 0.48867257, 0.48669606, 0.63733261, 0.94758776,
       0.25689541, 0.240745  , 0.51459278, 0.51617344, 0.80500826,
       0.25538814, 0.49557576, 0.122125  , 0.79394442, 0.240745  ,
       0.39144738, 0.70872379, 0.69561325, 0.48906791, 0.240745  ,
       0.59956215, 0.25599035, 0.8139451 , 0.44180188, 0.61555195,
       0.24045592, 0.49630788, 0.25478686, 0.3997764 , 0.80649313,
       0.64233357, 0.7202229 , 0.24132387, 0.2477568 , 0.24016708,
       0.49947214, 0.28256963, 0.69326318, 0.24628538, 0.82187684,
       0.39371108, 0.9204872 , 0.26502086, 0.30809474, 0.30843211,
       0.70626951, 0.69427159, 0.26594643, 0.80624703, 0.24123189,
       0.24248446, 0.06961865, 0.2562918 , 0.79856697, 0.29619042,
       0.25418651, 0.31006922, 0.88461022, 0.44102166, 0.59804

In reality, logistic regression models calculate these probabilities in the background.

If the probability is below 0.5, it places a 0, otherwise a 1.

There are other considerations we can make and other ways to see how good our model actually is.

Instead, we want to get back to the integration. From this point on, we will take several important steps:
    
•	First, we will save our model so we can use it later on.

•	We don't need to train it every time.

•	We just need to determine the weights once and then save them for later use.

•	Second, we will create our own module so that our less technical colleagues can take advantage of this model too.

•	Finally, we will get completely new data, classify it, pass it through SQL, and then analyze it in TABLEAU!

> SAVE the Model

> Create a MODULE

> Get New Data, Classify it, pass it through SQL and analyze it in TABLEAU


# -> <font color=green>Save the ML Model and Scaler object to Prepare it for Future Deployment model

we want to create a file that will store the following information.

1.	This machine learning model is a logistic regression.

2.	It has these and these coefficients and intercept.

3.	The random state that was chosen for the shuffling was 20 and so on.


Now, if you paid close attention, you would quickly realize that the object “reg”, which was an instance of the SKLearn logistic regression class, contains all this information.

1. In fact, this is the object we used to find the intercept, the coefficients and the accuracy.

2. Therefore, it should not come as a surprise that saving the model is equivalent to saving the `reg` object.


Pickling is the process of converting a Python object into a byte stream.

- The main idea is that this byte stream will contain sufficient information to rebuild the object.

- Then later, when we would like to convert the byte stream back into a Python object in another notebook, we will unpickle it.

- This means we will save the REG variable into a file.

- This file will then be loaded in a new notebook and thus will be able to use the machine learning algorithm. Simple as that!


In [20]:
import pickle

In [21]:
# pickle the model file
# the fitted `reg` object is serialized to a file named 'model' in the current
# working directory; 'wb' opens it for writing in binary mode

with open('model', 'wb') as file:
    pickle.dump(reg, file)

All right, finally, there is one more subtlety: we must save the absenteeism scaler too.

The absenteeism scaler object was used to standardize all numerical variables.

What it did was store the columns to scale, as well as the mean and the standard deviation of each feature.


## Pickling the absenteeism scaler object to a file called scaler!

**Homework:**

So, use the same pickling code.
Pickle the absenteeism scaler object to a file called scaler!

-------------------------------------------
To further explain why we need to pickle the scaler:

Think about it: up until now our code was heavily dependent on the training data. Without the training data, the ML could not be executed at all!

But once the model is trained and we have obtained the coefficients, we can save it, as we just have. In this way, we are basically separating the model from the training data for good.

And logically, **the information in the absenteeism scaler is needed to pre-process any new data, using the same rules as the ones applied to the training data.**


In [24]:
# pickle the scaler file
# the fitted `absenteeism_scaler` object is serialized to a file named 'scaler'
# in the current working directory ('wb' = write, binary mode)
# NOTE(review): pickle stores instances by reference to their class, so the code
# that later loads 'scaler' presumably needs a CustomScaler definition in scope - confirm

with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)