In [None]:
# Dependencies

# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data Mining
!pip install apyori
from apyori import apriori

In [None]:
# Styling
# Shared look for every figure: ggplot theme, wide canvas, larger titles and x tick labels
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (16, 8),
    'axes.titlesize': 20,
    'xtick.labelsize': 'large',
})

## Load data

In [None]:
# Load dataset
# The file is read with ';' as the field separator
df = pd.read_csv(
    '../input/absenteeism-at-work-uci-ml-repositiory/Absenteeism_at_work.csv',
    sep=';',
)

In [None]:
# Rename columns - Kerstin Wagner [1]
# Replace the verbose headers with short identifiers used throughout the notebook
df = df.rename(
    columns={
        'Reason for absence': 'Reason',
        'Month of absence': 'Month',
        'Day of the week': 'Weekday',
        'Seasons': 'Season',
        'Disciplinary failure': 'Failure',
        'Transportation expense': 'Expense',
        'Distance from Residence to Work': 'Distance',
        'Service time': 'ServiceTime',
        'Work load Average/day ': 'Workload',
        'Hit target': 'HitTarget',
        'Son': 'Child',
        'Social drinker': 'Drinker',
        'Social smoker': 'Smoker',
        'Body mass index': 'BMI',
        'Absenteeism time in hours': 'AbsH',
    }
)

In [None]:
# Set attributes as category
# These columns hold codes/labels, so they are stored with the 'category' dtype
cat_attributes = ['ID', 'Reason', 'Month', 'Weekday', 'Season', 'Education', 'Failure', 'Drinker', 'Smoker']
df = df.astype({att: 'category' for att in cat_attributes})

## Exploratory Analysis & Data Cleaning

In [None]:
# Rows & Cols
# Print the frame summary (row count, columns, non-null counts, dtypes)
df.info()

In [None]:
# Missing data
# Number of missing values in each column
df.isna().sum()

In [None]:
# Numeric variables description
# Summary statistics restricted to the numeric columns
df.describe(include=np.number)

In [None]:
# Categorical variables description
# Summary statistics restricted to the columns cast to 'category'
df.describe(include='category')

In [None]:
# Update month
# Records with Month == 0 are recoded to month 1 (0 is treated as an invalid month in this notebook)
df.loc[df['Month'] == 0, 'Month'] = 1

In [None]:
# Hist of AbsH
# Distribution of absence durations, 120 bins, x ticks every 5 hours
g = df['AbsH'].plot(kind='hist', bins=120)
g.set_title('Histogram of Absence Duration')
g.set_xlabel('Time (h)')
g.legend()
g.set_xticks(range(0, 125, 5));

In [None]:
# Boxplot AbsH Reason
# One box per absence reason showing the spread of absence durations
box_ax = sns.boxplot(x='Reason', y='AbsH', data=df)
box_ax.set_title('Distribution of Absence Duration by Reason');

In [None]:
# Cleaning - Keep short absences
# Only records of at most 8 hours are retained (presumably one working day - TODO confirm)
df = df.loc[df['AbsH'].le(8)]

In [None]:
# Total missed hours per employee
# Only 'AbsH' is aggregated: summing the whole grouped frame would also try to
# add up the categorical columns, which recent pandas versions refuse to do.
plt.title('Total Missed Hours by Employees')
data_ = df.groupby('ID')['AbsH'].sum().reset_index()
sns.barplot(data=data_, x='ID', y='AbsH');

In [None]:
# Number of absence records per employee, ignoring rows whose Reason is 0
absent_rows = df[df['Reason'] != 0]
data_ = absent_rows.groupby('ID').apply(len).rename('AbsNumber').reset_index()
count_ax = sns.barplot(x='ID', y='AbsNumber', data=data_)
count_ax.set_title('Total Number of Absences by Employees');

In [None]:
# People
# Boolean table: True where an employee (ID) has exactly one distinct value in a column
personal_attributes_count = df.groupby('ID').nunique().eq(1)

def mapcol(val):
    """Return the CSS background rule for a cell: green if `val` is truthy, red otherwise."""
    if val:
        return 'background-color: green'
    return 'background-color: red'

# Colour every cell of the boolean table with mapcol (green for True, red for False)
personal_attributes_count.style.applymap(mapcol)

In [None]:
# Let's take a closer look at number 29
# Show the personal columns for every record of employee ID 29
df.loc[df['ID'] == 29, ['ID', 'Age', 'Education', 'Child', 'ServiceTime']]

In [None]:
# Delete row 51
# errors='ignore' keeps the cell re-runnable: once the row is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(51, errors='ignore')

## Processing

In [None]:
# Employee data
# Columns that describe the person rather than a single absence
personal_attributes = ['ID', 'Expense', 'Distance', 'ServiceTime', 'Age', 'Education', 'Child', 'Drinker', 'Smoker', 'Pet', 'Weight', 'Height', 'BMI']

# Skip the ID 29 / Age 28 records, keep the personal columns, collapse the
# repeated rows and index the result by employee ID
keep_rows = (df['ID'] != 29) | (df['Age'] != 28)
employees = (
    df.loc[keep_rows, personal_attributes]
    .drop_duplicates()
    .set_index('ID')
)

In [None]:
# Absences with Employees data

# Work on an independent copy so that cleaning `cdf` never modifies `df`
# (a plain `cdf = df` would only create a second name for the same frame).
cdf = df.copy()

# Fix records with invalid values

# Invalid Month: 0 is recoded to 1
cdf.loc[cdf['Month'] == 0, 'Month'] = 1

In [None]:
# Absence records

# Get absence attributes
record_attributes = ['ID', 'AbsH', 'Failure', 'HitTarget', 'Month', 'Reason', 'Season', 'Weekday', 'Workload']

# Delete present people: keep rows with a non-zero reason and a non-zero duration.
# The explicit copy makes `absences` an independent frame, so the columns that
# later cells assign to it do not trigger SettingWithCopyWarning.
is_absence = (cdf['Reason'] != 0) & (cdf['AbsH'] != 0)
absences = cdf.loc[is_absence, record_attributes].copy()

In [None]:
# Add absenteeism information to employees

# Number of absences and total missed hours per employee ID, computed in one
# grouped pass instead of updating `employees` one absence row at a time.
# IDs are compared as plain integers so the categorical dtype does not matter.
abs_stats = (
    absences
    .groupby(absences['ID'].astype(int))['AbsH']
    .agg(['size', 'sum'])
)

# Employees without any absence record get 0 for both figures
emp_ids = employees.index.astype(int)
employees['AbsNumber'] = abs_stats['size'].reindex(emp_ids, fill_value=0).to_numpy()
employees['AbsTime'] = abs_stats['sum'].reindex(emp_ids, fill_value=0).to_numpy()

## Data Analysis

In [None]:
# Total number and time of absences
# Top panel: absence count per employee; bottom panel: total missed hours.
# Each series is drawn on the axes whose title names it.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(17, 11))
ax1.set_title('Total Number of Absences by Employees')
ax2.set_title('Total Missed Hours by Employees')
sns.barplot(x=employees.index, y=employees['AbsNumber'], ax=ax1)
sns.barplot(x=employees.index, y=employees['AbsTime'], ax=ax2);

In [None]:
# Correlation
# Pearson correlation between the numeric employee attributes. The categorical
# columns are excluded explicitly so the result does not depend on how the
# installed pandas version treats non-numeric columns in corr().
plt.figure(figsize=(12, 10))
plt.title('Correlation of employees attributes')
employees_corr = employees.select_dtypes(include=np.number).corr()
sns.heatmap(employees_corr, annot=True, cmap="YlGnBu", fmt='.0%', square=True, linewidths =.5);

In [None]:
# Correlation
# Pearson correlation between the numeric absence attributes. The categorical
# columns are excluded explicitly so the result does not depend on how the
# installed pandas version treats non-numeric columns in corr().
plt.figure(figsize=(12, 10))
plt.title('Correlation of absences attributes')
absences_corr = absences.select_dtypes(include=np.number).corr()
sns.heatmap(absences_corr, annot=True, cmap="YlGnBu", fmt='.0%', square=True, linewidths =.5);

In [None]:
# Scatter plot
# One point per absence reason: x = number of absences, y = total missed hours
hours_by_reason = absences.groupby('Reason')['AbsH']

# The first group (first 'Reason' category) is left out of both aggregates
number = hours_by_reason.apply(len).iloc[1:]
time = hours_by_reason.sum().iloc[1:]

p = sns.scatterplot(x=number, y=time)
p.set_title('Total Missed Hours by Reason')

# Label each point with its reason code, slightly to the right of the marker
for reason, count in number.items():
    p.text(count + 0.5, time.loc[reason], reason, size='medium', weight='semibold')

p.set_xlabel('Total Number of Absences')
p.set_ylabel('Total Time of Absences');

In [None]:
# Reason & Time
# Rebuild Month as an integer-based category, then show the total missed hours
# (sum of AbsH) for every month / reason pair. The title names hours because
# the cells are summed durations, not record counts.
absences['Month'] = absences['Month'].astype(int).astype('category')
plt.title('Total Missed Hours by Month and by Reason')
hours_month_reason = absences.groupby(['Month', 'Reason'])['AbsH'].sum().unstack()
sns.heatmap(hours_month_reason, annot=True, cmap="YlGnBu", square=True, linewidths =.5);

In [None]:
# Reason & Number (normalised per reason)
# Each cell is the (Month, Reason) absence count divided by the total count of
# that reason, i.e. how a reason's absences are spread over the months.
plt.figure(figsize=(16, 8))
plt.title('Monthly Share of Absences for Each Reason')
data = absences.groupby(['Month', 'Reason']).apply(len) / absences.groupby('Reason').apply(len)
data = data.unstack()
data = data.loc[:, sorted(data.columns)]
sns.heatmap(data, annot=True, cmap="YlGnBu", square=True, linewidths =.5, fmt='.0%');

In [None]:
# Reason & Number (normalised per month)
# Each cell is the (Month, Reason) absence count divided by the total count of
# that month, i.e. which reasons make up a month's absences.
plt.figure(figsize=(16, 8))
plt.title('Share of Absences by Reason for Each Month')
data = absences.groupby(['Month', 'Reason']).apply(len) / absences.groupby('Month').apply(len)
data = data.unstack()
data = data.loc[:, sorted(data.columns)]
sns.heatmap(data, annot=True, cmap="YlGnBu", square=True, linewidths =.5, fmt='.0%');

## Data Mining - Rules

### Absences

In [None]:
# Additional Processing
# Turn every absence record into a list of 'Column-value' items for apriori.

# Keep only records with a positive duration; copy so the edits stay local
scdf = cdf[cdf['AbsH'] > 0].copy()

# Binarise numeric attributes around their mean: above -> 'SUP', otherwise 'INF'.
# Each column is rebuilt as a whole instead of writing strings into a numeric
# column through .loc, which is an incompatible-dtype assignment.
for col in ['Expense', 'Distance', 'ServiceTime', 'Age', 'Workload', 'HitTarget']:
    scdf[col] = np.where(scdf[col] > scdf[col].mean(), 'SUP', 'INF')

# BMI: three bands with cut points at 18 and 24
col = 'BMI'
bmi = scdf[col]
scdf[col] = np.select(
    [bmi <= 18, (18 < bmi) & (bmi < 24), 24 <= bmi],
    ['Too Less', 'Good', 'Too Much'],
    default=bmi.astype(str),  # only reached when no band matches (e.g. missing BMI)
)

# Columns not used as items
scdf = scdf.drop(['Height', 'Weight', 'ID', 'Month'], axis=1)

scdf = scdf.astype(str)

# Prefix every value with its column name so items from different columns stay distinct
for col in scdf.columns:
    scdf[col] = str(col) + '-' + scdf[col]

records = scdf.values.tolist()

In [None]:
# Mine
# NOTE(review): confirm that apyori honours `min_length`; its documented length
# option is `max_length`.
association_rules = apriori(records,
                            min_support=0.15,
                            min_confidence=0.6,
                            min_lift=3,
                            min_length=2)

# Print

rules_name = set()
rules = set()

for item in association_rules:

    # Support belongs to the whole itemset
    ssup = "Support: " + str(item.support)

    # Each ordered statistic is one directed rule (items_base -> items_add) with
    # its own confidence. Reading the direction from here, rather than from the
    # iteration order of the unordered itemset, keeps antecedent, consequent and
    # confidence consistent with each other.
    for stat in item.ordered_statistics:

        # A rule needs a non-empty antecedent
        if not stat.items_base:
            continue

        base = ', '.join(sorted(stat.items_base))
        add = ', '.join(sorted(stat.items_add))

        srule = "Rule: " + base + " -> " + add
        sconf = "Confidence: " + str(stat.confidence)

        rule = '\n'.join([srule, ssup, sconf])

        # Print each rule once
        if srule in rules_name:
            continue

        rules_name.add(srule)
        rules.add(rule)
        print(rule)
        print()

### Employees

In [None]:
# Additional Processing
# Turn every employee into a list of 'Column-value' items for apriori.

scdf = employees.copy()

# Binarise numeric attributes around their mean: above -> 'SUP', otherwise 'INF'.
# Each column is rebuilt as a whole instead of writing strings into a numeric
# column through .loc, which is an incompatible-dtype assignment.
for col in ['Expense', 'Distance', 'ServiceTime', 'Age']:
    scdf[col] = np.where(scdf[col] > scdf[col].mean(), 'SUP', 'INF')

# AbsTime / AbsNumber: 'No' (zero), 'Inf' (below mean), 'Sup' (above mean).
# np.select takes the first matching condition, so a zero stays 'No' even
# though it is also below the mean.
for col in ['AbsTime', 'AbsNumber']:
    values = scdf[col]
    mean = values.mean()
    scdf[col] = np.select(
        [values == 0, values < mean, values > mean],
        ['No', 'Inf', 'Sup'],
        default=values.astype(str),  # a value exactly equal to the mean keeps its number
    )

# BMI: three bands with cut points at 18 and 24
col = 'BMI'
bmi = scdf[col]
scdf[col] = np.select(
    [bmi <= 18, (18 < bmi) & (bmi < 24), 24 <= bmi],
    ['Too Less', 'Good', 'Too Much'],
    default=bmi.astype(str),  # only reached when no band matches (e.g. missing BMI)
)

# Columns not used as items
scdf = scdf.drop(['Height', 'Weight'], axis=1)

scdf = scdf.astype(str)

# Prefix every value with its column name so items from different columns stay distinct
for col in scdf.columns:
    scdf[col] = str(col) + '-' + scdf[col]

records = scdf.values.tolist()

In [None]:
# Mine
# NOTE(review): confirm that apyori honours `min_length`; its documented length
# option is `max_length`.
association_rules = apriori(records,
                            min_support=0.05,
                            min_confidence=0.6,
                            min_lift=3,
                            min_length=2)

# Print

rules_name = []
rules = []

for item in association_rules:

    # Support belongs to the whole itemset
    ssup = "Support: " + str(item.support)

    # Each ordered statistic is one directed rule (items_base -> items_add) with
    # its own confidence. Filtering on these fields, rather than on positions in
    # the unordered itemset, makes the selection deterministic.
    for stat in item.ordered_statistics:

        base_items = sorted(stat.items_base)
        add_items = sorted(stat.items_add)

        # Keep only rules going from a 'Service...' item to a 'Distance...' item.
        # (Alternative: keep rules whose consequent is an AbsTime / AbsNumber item.)
        has_service = any('Service' in name for name in base_items)
        has_distance = any('Distance' in name for name in add_items)
        if not (has_service and has_distance):
            continue

        srule = "Rule: " + ', '.join(base_items) + " -> " + ', '.join(add_items)
        sconf = "Confidence: " + str(stat.confidence)

        rule = '\n'.join([srule, ssup, sconf])

        # Report each rule once
        if srule in rules_name:
            continue

        rules_name.append(srule)
        rules.append(rule)

print('\n\n'.join(sorted(rules)))

## Time Series Forecasting -- Too little data to yield interesting insights

### Feature Engineering 

In [None]:
# Add year
# A new year starts on every row whose month is 1 directly after a row whose
# month is 12; the running count of those rows is added to the start year 2007.
new_year = (absences['Month'].shift(1) == 12) & (absences['Month'] == 1)
absences['Year'] = 2007 + new_year.cumsum()

### Time Series Creation: Monthly Total Missed Hours

In [None]:
# Create Month Index Column
# 'YYYY-MM' label built from Year and the zero-padded Month, parsed as a timestamp
month_label = absences['Year'].astype(str) + '-' + absences['Month'].astype(str).str.rjust(2, '0')
absences['MonthInd'] = pd.to_datetime(month_label, format='%Y-%m')

# Create Time Series
# Only 'AbsH' is aggregated: summing the whole grouped frame would also try to
# add up the categorical columns, which recent pandas versions refuse to do.
monthly_total = absences.groupby('MonthInd')['AbsH'].sum()
monthly_total.index = monthly_total.index.to_period(freq='M')

In [None]:
# Plot time series
# The marker is given once, through `marker`; the style string only carries the
# line style, so matplotlib does not warn about a redundantly defined marker.
plt.figure(figsize=(14, 8))
plt.title('Monthly Total Missed Hours', fontsize='18')
monthly_total.plot(style='-', color='lightcoral', marker='D', markeredgecolor='black');

### Split Training / Testing Sets : 2 Years / 1 Year

In [None]:
# Split
# Hold out the final 12 months for testing and train on everything before them.
# Slicing from the end guarantees a 12-month test set whatever the series
# length, which the same-month baseline relies on when it reuses the last
# 12 training values.
y_train = monthly_total.iloc[:-12]
y_test = monthly_total.iloc[-12:]

In [None]:
# Plot time series
# Training and test parts of the monthly series on one figure. The marker is
# given once, through `marker`; the style string only carries the line style,
# so matplotlib does not warn about a redundantly defined marker.
plt.figure(figsize=(14, 8))
plt.title('Monthly Total Missed Hours', fontsize='18')
y_train.plot(style='-', color='blue', marker='D', markeredgecolor='black', label='Train')
y_test.plot(style='-', color='green', marker='D', markeredgecolor='black', label='Test')
plt.legend();

### Forecasting

In [None]:
# Dummy Model : Last Value
# Every test month is predicted as the final value of the training series
last_observed = y_train.iloc[-1]
y_last_value = pd.Series(last_observed, index=y_test.index)
y_last_value

In [None]:
# Dummy Model : Last Value Same Month
# The last 12 training values are reused, in order, as the predictions for the test index
trailing_year = y_train.iloc[-12:].to_numpy()
y_monthly_last_value = pd.Series(trailing_year, index=y_test.index)
y_monthly_last_value

In [None]:
# Plot time series
# Test series against the two baseline forecasts. The marker is given once,
# through `marker`; the style string only carries the line style, so matplotlib
# does not warn about a redundantly defined marker.
plt.figure(figsize=(14, 8))
plt.title('Monthly Total Missed Hours', fontsize='18')
y_test.plot(style='-', color='green', marker='D', markeredgecolor='black', label='Test')

y_last_value.plot(style='-', color='grey', marker='D', markeredgecolor='black', label='Last Value')
y_monthly_last_value.plot(style='-', color='steelblue', marker='D', markeredgecolor='black', label='Monthly Last Value')
plt.legend();