# Implementing 'absenteeism_module' Part I

> - **1	Absenteeism-Integration.ipynb:** The first one is the notebook file we will use marked with the extension IPYNB.

> - **2	Absenteeism_new_data.csv:** We have the CSV file containing the information we will be working with.

> - **3	Absenteeism_module.py**: (A file containing Python definitions and statements)


> - **4 & 5: model and scaler**:
Finally, there are two supplementary files, called model and scaler.
They contain all the information our module needs in order to use the machine learning model prepared beforehand.


## Implementing the 'absenteeism_module' - Part II

**Class Scaler**
# import all libraries needed
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

## the custom scaler class 
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps scikit-learn's ``StandardScaler`` so that only ``columns`` are scaled
    while every other column of the input DataFrame passes through unchanged.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool
        Forwarded unchanged to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped ``StandardScaler`` on ``X[self.columns]`` and return ``self``."""
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce per column explicitly: np.mean / np.var on a DataFrame forward
        # axis=None to pandas, which newer pandas versions interpret as a
        # reduction over the whole frame (a single scalar).
        self.mean_ = np.array(selected.mean(axis=0))
        # ddof=0 keeps the population variance that np.var computed.
        self.var_ = np.array(selected.var(axis=0, ddof=0))
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame with ``self.columns`` standardized, in X's column order.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: the scaler returns a bare array, and a default
        # RangeIndex would make the column-wise concat below misalign rows
        # whenever X is not indexed 0..n-1.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

##  absenteeism_model Class
A dedicated class used to make predictions on new data.

class absenteeism_model():
    """Score new absenteeism records with a previously pickled model and scaler.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV file, then call one of the
    ``predicted_*`` methods.
    """

    # Names assigned positionally to the columns that remain once 'ID',
    # 'Reason for Absence' and any target column have been removed.
    _BASE_COLUMNS = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                     'Daily Work Load Average', 'Body Mass Index', 'Education',
                     'Children', 'Pets']

    # Final feature set, in order. 'Day of the Week', 'Daily Work Load Average'
    # and 'Distance to Work' are deliberately left out.
    _MODEL_COLUMNS = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month_Value',
                      'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
                      'Children', 'Pets']

    def __init__(self, model_file, scaler_file):
        """Load the pickled model and scaler.

        Parameters
        ----------
        model_file : path to the pickled model (stored as ``self.reg``).
        scaler_file : path to the pickled scaler (stored as ``self.scaler``).
        """
        # Open the paths that were actually passed in (previously the literal
        # names 'model' and 'scaler' were always used).
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at files you created yourself or otherwise trust.
        with open(model_file, 'rb') as model_handle, open(scaler_file, 'rb') as scaler_handle:
            self.reg = pickle.load(model_handle)
            self.scaler = pickle.load(scaler_handle)
            self.data = None

    def load_and_clean_data(self, data_file):
        """Read ``data_file`` (CSV) and turn it into scaled model inputs.

        Sets three attributes:
        ``df_with_predictions`` (copy of the raw CSV),
        ``preprocessed_data`` (cleaned, unscaled features) and
        ``data`` (the result of ``self.scaler.transform`` on those features).
        """
        # import the data
        df = pd.read_csv(data_file, delimiter=',')
        # store the raw data for later use
        self.df_with_predictions = df.copy()

        # drop the 'ID' column; also drop the target if the file happens to
        # carry one, since it is never used when scoring new data
        df = df.drop(['ID'], axis=1)
        df = df.drop(['Absenteeism Time in Hours'], axis=1, errors='ignore')

        # One dummy column per reason code present in the file. drop_first is
        # intentionally not used: it removes the lowest code *present*, which
        # is only the baseline code 0 when 0 actually occurs in the new data.
        # The label slices below start at 1, so code 0 is excluded anyway.
        reason_columns = pd.get_dummies(df['Reason for Absence'])

        # collapse the individual reasons into 4 groups
        reason_groups = {
            'Reason_1': reason_columns.loc[:, 1:14].max(axis=1),
            'Reason_2': reason_columns.loc[:, 15:17].max(axis=1),
            'Reason_3': reason_columns.loc[:, 18:21].max(axis=1),
            'Reason_4': reason_columns.loc[:, 22:].max(axis=1),
        }

        # to avoid multicollinearity, drop the original 'Reason for Absence' column
        df = df.drop(['Reason for Absence'], axis=1)

        # assign the expected names by position, then attach the 4 reason groups
        df.columns = self._BASE_COLUMNS
        for group_name, group_flags in reason_groups.items():
            df[group_name] = group_flags

        # extract the month from the day/month/year formatted 'Date' column
        dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month_Value'] = dates.dt.month

        # map 'Education' to a dummy: 1 -> 0, and 2, 3, 4 -> 1
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # keep only the model features, in order, and replace missing values
        # (e.g. a reason group with no matching codes in this file) with 0
        df = df[self._MODEL_COLUMNS].fillna(value=0)

        # kept so callers can inspect the preprocessed (unscaled) data
        self.preprocessed_data = df.copy()

        # scaled inputs used by the prediction methods
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the second column of ``predict_proba`` (class 1), or None if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)[:, 1]

    def predicted_output_category(self):
        """Return the model's predicted class per row, or None if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    def predicted_outputs(self):
        """Add 'Probability' and 'Prediction' columns to ``preprocessed_data`` and return it.

        Returns None if no data has been loaded yet.
        """
        if self.data is None:
            return None
        self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
        self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
        return self.preprocessed_data

# TESTING with new, unseen data using the 2 classes, which wrap the model and scaler objects and all preprocessing steps
### delivering the probability that a certain individual will be absent from work for more than three hours.

In [1]:
from absenteeism_module import *

In [2]:
model = absenteeism_model('model', 'scaler')


In [3]:
model.load_and_clean_data('Absenteeism_new_data.csv')


In [4]:
model.predicted_outputs()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,0,0.0,0,1,6,179,30,19,1,0,0,0.237107,0
1,1,0.0,0,0,6,361,28,27,0,1,4,0.871285,1
2,0,0.0,0,1,6,155,34,25,0,2,0,0.448511,0
3,0,0.0,0,1,6,179,40,22,1,2,0,0.360140,0
4,1,0.0,0,0,6,155,34,25,0,2,0,0.721178,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,1,0.0,0,0,7,289,33,30,0,2,1,0.913706,1
36,1,0.0,0,0,7,235,37,29,1,1,1,0.749982,1
37,0,0.0,0,0,7,118,40,34,0,1,8,0.016180,0
38,0,0.0,0,0,7,231,39,35,0,2,2,0.247648,0


Probability column:

The Probability column contains float values. Each value represents the probability that a given individual will be absent from work for more than three hours or, as we call it, excessively absent.

Prediction column:

The other column called Prediction, is like a continuation of the probability column, in it, we see the value of one if the probability obtained was 50 percent or higher.
And zero, if it wasn't.


My goal in this notebook was to walk through the programming steps needed to load and use the ‘absenteeism_module.py’ module. That walkthrough ends here!

Storing the four lines of code above is more than enough; from then on, all I have to do is insert the name of the file containing the data about new observations and run the cells.


Storing the predicted outputs returned by model.predicted_outputs() in a variable called df_new_obs, standing for “a DataFrame with new observations”.

In [6]:
df_new_obs = model.predicted_outputs()

In [7]:
df_new_obs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,0,0.0,0,1,6,179,30,19,1,0,0,0.237107,0
1,1,0.0,0,0,6,361,28,27,0,1,4,0.871285,1
2,0,0.0,0,1,6,155,34,25,0,2,0,0.448511,0
3,0,0.0,0,1,6,179,40,22,1,2,0,0.360140,0
4,1,0.0,0,0,6,155,34,25,0,2,0,0.721178,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,1,0.0,0,0,7,289,33,30,0,2,1,0.913706,1
36,1,0.0,0,0,7,235,37,29,1,1,1,0.749982,1
37,0,0.0,0,0,7,118,40,34,0,1,8,0.016180,0
38,0,0.0,0,0,7,231,39,35,0,2,2,0.247648,0


## Convert the 'Reason_2' column values from float to int

In [8]:
## from float to int using DataFrame.apply(np.int64)
df_new_obs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Reason_1                40 non-null     uint8  
 1   Reason_2                40 non-null     float64
 2   Reason_3                40 non-null     uint8  
 3   Reason_4                40 non-null     uint8  
 4   Month_Value             40 non-null     int64  
 5   Transportation Expense  40 non-null     int64  
 6   Age                     40 non-null     int64  
 7   Body Mass Index         40 non-null     int64  
 8   Education               40 non-null     int64  
 9   Children                40 non-null     int64  
 10  Pets                    40 non-null     int64  
 11  Probability             40 non-null     float64
 12  Prediction              40 non-null     int32  
dtypes: float64(2), int32(1), int64(7), uint8(3)
memory usage: 3.2 KB


In [9]:
df_new_obs['Reason_2'] = df_new_obs['Reason_2'].apply(np.int64)

In [10]:
df_new_obs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,0,0,0,1,6,179,30,19,1,0,0,0.237107,0
1,1,0,0,0,6,361,28,27,0,1,4,0.871285,1
2,0,0,0,1,6,155,34,25,0,2,0,0.448511,0
3,0,0,0,1,6,179,40,22,1,2,0,0.36014,0
4,1,0,0,0,6,155,34,25,0,2,0,0.721178,1


# Saving the data to a Feather file for further analysis and MySQL integration (data transfer!)

In [12]:
# Now save the DataFrame to a Feather file
df_new_obs.to_feather("df_new_obs.feather")

In [13]:
# my_df

df_new_obs = pd.read_feather("df_new_obs.feather")
## NOTE: the Feather file is smaller than the equivalent CSV file, and loading and saving it takes much less time


In [14]:
df_new_obs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month_Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,0,0,0,1,6,179,30,19,1,0,0,0.237107,0
1,1,0,0,0,6,361,28,27,0,1,4,0.871285,1
2,0,0,0,1,6,155,34,25,0,2,0,0.448511,0
3,0,0,0,1,6,179,40,22,1,2,0,0.360140,0
4,1,0,0,0,6,155,34,25,0,2,0,0.721178,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,1,0,0,0,7,289,33,30,0,2,1,0.913706,1
36,1,0,0,0,7,235,37,29,1,1,1,0.749982,1
37,0,0,0,0,7,118,40,34,0,1,8,0.016180,0
38,0,0,0,0,7,231,39,35,0,2,2,0.247648,0
