## Creating Logistic Regression to Predict Absenteeism from Work
Download Dataset from https://utdallas.box.com/s/9a3avd33k0nrjowatn3p114azsoqb2jx

### Import Data

In [1]:
import pandas as pd

In [2]:
# Load the raw absenteeism dataset; the relative path is resolved against the
# notebook's working directory.
raw_csv_data = pd.read_csv('Absenteeism_data.csv')

In [3]:
# Work on an independent (deep) copy so raw_csv_data itself is never modified.
df = raw_csv_data.copy(deep=True)

In [4]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


## Drop ID

In [5]:
# Remove the 'ID' column from the working frame.
df = df.drop(columns=['ID'])

In [6]:
# Distinct reason codes, in order of first appearance.
pd.unique(df['Reason for Absence'])

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16], dtype=int64)

In [7]:
# Number of distinct reason codes present in the data.
df['Reason for Absence'].unique().size

28

In [8]:
# One-hot encode the absence reason codes. drop_first=True omits the indicator
# for the lowest code (0), so a row with all indicators at 0 means reason 0.
# dtype=int pins the indicators to 0/1 integers: recent pandas versions default
# to bool, which would carry True/False into the grouped reason columns and the
# exported CSV.
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True, dtype=int)

In [9]:
# The raw reason code is now represented by the dummy columns, so drop it.
df = df.drop(columns=['Reason for Absence'])

### Group similar reasons to reduce the number of variables
- Reason_Type 1 corresponds to reasons related to various diseases
- Reason_Type 2 corresponds to pregnancy and giving birth
- Reason_Type 3 corresponds to poisoning
- Reason_Type 4 corresponds to light diseases

In [10]:
# get_dummies already yields a DataFrame; this call simply re-wraps it.
reason_columns = pd.DataFrame(reason_columns)
# Each group flag is the row-wise max over an inclusive, label-based slice of the
# reason-code columns, i.e. it is set when any reason code in that range is set.
reason_type1, reason_type2, reason_type3, reason_type4 = (
    reason_columns.loc[:, code_range].max(axis=1)
    for code_range in (slice(1, 14), slice(15, 17), slice(18, 21), slice(22, None))
)

In [11]:
# Append the four grouped reason indicators as new right-most columns. The
# Series are unnamed, so the new columns come out labelled 0, 1, 2 and 3.
df = pd.concat(
    [df, reason_type1, reason_type2, reason_type3, reason_type4],
    axis='columns',
)
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


In [12]:
# Current column labels as a NumPy array (handy for copy-pasting into a list).
df.columns.to_numpy()

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [13]:
# New column labels in the frame's current order: the original features first,
# then descriptive names for the four grouped reason indicators.
column_names = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
] + ['Reason_' + str(group) for group in range(1, 5)]

In [14]:
# Replace the column labels positionally with the descriptive names; the list
# must supply exactly one label per existing column.
df.columns = column_names
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [15]:
# Move the four reason indicators to the front; every other column keeps its
# relative position.
reorder_columns = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4'] + [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
df = df.loc[:, reorder_columns]
df.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [16]:
# Checkpoint: continue on an independent (deep) copy so df stays as it is here.
df_mod = df.copy(deep=True)

In [17]:
# Inspect the Python type of the 'Date' value in the row labelled 0.
type(df_mod.loc[0, 'Date'])

str

In [18]:
# Parse the day/month/year strings into datetime values; the explicit format
# keeps day and month from being swapped during parsing.
df_mod['Date'] = pd.to_datetime(df_mod['Date'],format='%d/%m/%Y')
df_mod['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [19]:
# Month number (1-12) of every date, extracted with the vectorized .dt accessor.
# This does not depend on the frame having a default 0..n-1 index and avoids a
# Python-level loop over rows. Requires df_mod['Date'] to be a datetime column.
list_month = df_mod['Date'].dt.month.tolist()

In [20]:
# Attach the extracted month numbers as a new 'Month' column, then display the frame.
df_mod['Month'] = list_month
df_mod

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5


In [21]:
def date_to_weekday(date_value):
    """Return the day-of-week index of ``date_value``.

    ``date_value`` must expose a ``weekday()`` method (e.g. ``datetime.date``,
    ``datetime.datetime`` or ``pandas.Timestamp``); for those types the result
    is 0 for Monday through 6 for Sunday.
    """
    weekday_index = date_value.weekday()
    return weekday_index

In [22]:
# Map every date to its weekday index with the helper defined for that purpose.
df_mod['Day of week'] = df_mod['Date'].map(date_to_weekday)

In [23]:
# Preview the first five rows, now including 'Month' and 'Day of week'.
df_mod.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


## Grouping Education
- Value 0 corresponds to a high school degree
- Value 1 corresponds to a postgraduate degree, a PhD, or any other degree higher than high school

In [24]:
# Collapse the education codes into a binary flag: 1 -> 0 and 2, 3, 4 -> 1.
# Codes outside 1-4 become NaN.
# The codes are read from df (the frame df_mod was copied from, whose
# 'Education' column is never modified) rather than from df_mod itself, so
# re-running this cell cannot remap values that were already binarized
# (which would turn 1 into 0 and 0 into NaN).
df_mod['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

In [25]:
# Frequency of each binarized education value.
df_mod.loc[:, 'Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [26]:
# Checkpoint: take an independent (deep) copy for the final clean-up steps.
df_preprocessed = df_mod.copy(deep=True)

In [27]:
# 'Month' and 'Day of week' have been derived from it, so drop the 'Date' column.
df_preprocessed = df_preprocessed.drop(columns=['Date'])

In [28]:
# Current column labels as a NumPy array (handy for copy-pasting into a list).
df_preprocessed.columns.to_numpy()


array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month',
       'Day of week'], dtype=object)

In [29]:
# Final column order: reason indicators, the two date-derived columns, then the
# remaining columns in their existing order.
columns_names_upd = (
    ['Reason_' + str(group) for group in range(1, 5)]
    + ['Month', 'Day of week']
    + [
        'Transportation Expense',
        'Distance to Work',
        'Age',
        'Daily Work Load Average',
        'Body Mass Index',
        'Education',
        'Children',
        'Pets',
        'Absenteeism Time in Hours',
    ]
)

In [30]:
# Apply the final column order listed in columns_names_upd.
df_preprocessed = df_preprocessed[columns_names_upd]
# A bare expression in the middle of a cell is never rendered, so print the
# shape explicitly; the trailing head() is shown as the cell's output.
print(df_preprocessed.shape)
df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [31]:
# Persist the preprocessed frame to the working directory; index=False omits
# the row index from the file.
df_preprocessed.to_csv('Absenteeism_preprocessed.csv',index=False)