# Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load plotly's offline helpers and switch plotly into notebook mode.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)  
# cufflinks supplies the DataFrame `.iplot()` calls used later; run it in offline mode as well.
import cufflinks as cf  
cf.go_offline() 
# Read the absenteeism dataset from the relative input folder and preview its first rows.
df = pd.read_csv('../input/Absenteeism_at_work.csv')
df.head()

Here we check whether any of the dataset's attributes/columns have missing values; if so, they could be replaced by a suitable statistic such as the mean or median.

In [None]:
# Count the missing entries in each column.
df.isna().sum()

So there are no missing values

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive summary statistics of the columns.
df.describe()

# EDA

In [None]:
# Interactive box plot of the DataFrame's columns (`.iplot` comes from cufflinks).
df.iplot(kind='box')

In [None]:
# Gather all column names, then discard the leading one (presumably the ID column — confirm)
# so that it is skipped by the per-column plots.
cols = list(df.columns)
cols.pop(0)

In [None]:
# Print each remaining column's name and draw that column's values as an interactive chart.
for column_name in cols:
    print(column_name)
    df[column_name].iplot()

In [None]:
# Horizontal bar chart of the per-column means, sorted in ascending order.
# NOTE(review): `cols` already lacks the first DataFrame column (popped earlier), so
# cols[1:] leaves out one more column — confirm that this is intended.
averaged_columns = cols[1:]
AvgR = df[averaged_columns].mean().sort_values()
bar_positions = np.arange(len(averaged_columns))
plt.figure(figsize=(10, 7))
plt.barh(bar_positions, AvgR.values, align='center')
plt.yticks(bar_positions, AvgR.index)
plt.ylabel('Categories')
plt.xlabel('Average')
plt.title('Average')

In [None]:
# Grid of pairwise plots across all columns of df.
sns.pairplot(df)

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on a large 20x20 canvas.
fig, ax = plt.subplots(figsize=(20, 20))
correlation_matrix = df.corr()
sns.heatmap(data=correlation_matrix, annot=True, ax=ax)

Here we save a copy of the dataset for the purpose of hierarchical clustering.

In [None]:
# Work on a copy so that columns added later (e.g. cluster labels) do not modify `df`.
DF = df.copy()
DF.head()

# Clustering
# 1.1 K-Means Clustering
For K-Means clustering we used the Elbow Method with WCSS (within-cluster sum of squares) to choose the number of clusters.

In [None]:
# Elbow method: fit K-Means for k = 1..29 on every column after the first one and
# record the within-cluster sum of squares (inertia) of each fit.
vals = DF.iloc[:, 1:].values

from sklearn.cluster import KMeans

k_values = range(1, 30)
wcss = []
for ii in k_values:
    # A fixed random_state makes the centroid initialisation (and so the curve) reproducible.
    kmeans = KMeans(n_clusters=ii, init="k-means++", n_init=10, max_iter=300, random_state=42)
    kmeans.fit(vals)
    wcss.append(kmeans.inertia_)

# Plot against the actual k values; plotting `wcss` alone would put k=1 at x=0 and
# shift the whole curve one cluster to the left.
plt.plot(k_values, wcss, 'ro-', label="WCSS")
plt.title("Computing WCSS for KMeans++")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.legend()
plt.show()

**Graph Description:** Observing the above graph, we chose k = 5.

In [None]:
# Feature matrix for clustering: every column except the ID; the IDs are kept separately in Y.
X = df.drop(columns=['ID']).values
Y = df['ID'].values

In [None]:
# Final K-Means model with the k = 5 chosen from the elbow plot.
# random_state is fixed so the cluster assignment is reproducible across runs.
km = KMeans(n_clusters=5, init="k-means++", n_init=10, max_iter=500, random_state=42)
# Fit and label with `km` itself: `kmeans` is the model left over from the elbow loop
# (its last iteration, n_clusters=29), not the 5-cluster model defined here.
y_pred = km.fit_predict(X)

In [None]:
# Attach each row's cluster label to the working copy, then draw pairwise plots of
# every column except ID, coloured by cluster.
DF["Cluster"] = y_pred
cols = [name for name in DF.columns if name != "ID"]

sns.pairplot(data=DF[cols], hue="Cluster")

# 1.2 Hierarchical Clustering

In [None]:
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import scale as s
from scipy.cluster.hierarchy import dendrogram, linkage

Below is a helper function that draws a horizontal cut-off line on the generated dendrogram, in order to bring out the number of clusters.

In [None]:
def fd(*args, **kwargs):
    """Draw a dendrogram with merge heights annotated and an optional cut-off line.

    All positional and keyword arguments are forwarded to ``dendrogram`` except
    the two extra keywords consumed here:

    max_d : number, optional
        Height at which a horizontal black line is drawn. When truthy and no
        ``color_threshold`` was supplied, it is also used as ``color_threshold``.
    annotate_above : number, default 0
        Only merges whose height is greater than this value are marked and labelled.

    Returns the dictionary produced by ``dendrogram``. When ``no_plot`` is passed
    as true, nothing is drawn and only that dictionary is returned.
    """
    cut_height = kwargs.pop('max_d', None)
    if cut_height and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = cut_height
    label_floor = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    # Nothing to decorate when the caller asked for the data only.
    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')

    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for x_coords, y_coords, link_colour in links:
        merge_height = y_coords[1]
        if not merge_height > label_floor:
            continue
        # Centre of the horizontal bar that joins the two merged branches.
        bar_centre = 0.5 * sum(x_coords[1:3])
        plt.plot(bar_centre, merge_height, 'o', c=link_colour)
        plt.annotate("%.3g" % merge_height, (bar_centre, merge_height),
                     xytext=(0, -5), textcoords='offset points',
                     va='top', ha='center')

    if cut_height:
        plt.axhline(y=cut_height, c='k')
    return ddata

We picked Ward linkage for the dendrogram because its minimum-variance criterion minimizes the total within-cluster variance.

In [None]:
# Ward-linkage hierarchy over the rows of df, drawn as a full dendrogram.
# NOTE(review): df is passed as-is here, i.e. it still contains the ID column and is
# unscaled, whereas the K-Means input dropped ID — confirm this is intended.
Z = sch.linkage(df, method='ward')
den = sch.dendrogram(Z)
# Hide the x-axis ticks and tick labels.
hidden_x_ticks = dict(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(**hidden_x_ticks)
plt.title('Hierarchical Clustering')

In [None]:
# Same Ward hierarchy, drawn through fd(): merges above 750 are labelled with their
# height and a cut-off line is drawn at 1250.
Z = linkage(df, method='ward')
fd(
    Z,
    leaf_rotation=90.,
    show_contracted=True,
    annotate_above=750,
    max_d=1250,
)
# Hide the x-axis ticks and tick labels.
plt.gca().tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

**Graph Description:** If we draw a horizontal line at a distance of 1250, it cuts the dendrogram into the 2 clusters generated by Ward hierarchical clustering.

In a very similar manner, we also performed hierarchical clustering with Complete Linkage. In Complete Linkage, the similarity of two clusters is the similarity of their most dissimilar members.

In [None]:
# Complete-linkage hierarchy over the rows of df, drawn as a full dendrogram.
Z = sch.linkage(df, method='complete')
den = sch.dendrogram(Z)
# Hide the x-axis ticks and tick labels.
hidden_x_ticks = dict(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(**hidden_x_ticks)

In [None]:
# Same complete-linkage hierarchy, drawn through fd(): merges above 160 are labelled
# with their height and a cut-off line is drawn at 280.
Z = linkage(df, method='complete')
fd(
    Z,
    leaf_rotation=90.,
    show_contracted=True,
    annotate_above=160,
    max_d=280,
)
# Hide the x-axis ticks and tick labels.
plt.gca().tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

We come across 2 clusters again with Complete Linkage.

# Predicting Absenteeism time in hours
Since our problem is to predict Absenteeism time in hours, which is a regression task, we will apply several machine-learning algorithms and a neural network. The data is normalized through standard scaling. Model performance is judged by the R2 score and the Mean Squared Error. Each algorithm and the neural network are implemented below:

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import scale as s
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split as t
import sklearn.metrics as mt

In [None]:
# Regression inputs: every column except the ID and the target; the target is the absenteeism time.
feature_frame = df.drop(columns=['ID', 'Absenteeism time in hours'])
x = feature_frame.values
y = df['Absenteeism time in hours'].values

In [None]:
# Standardise features and target with sklearn's `scale` (imported as `s`).
# NOTE(review): scaling is applied to the full data before the train/test split, so
# test-set statistics influence the training data; because y is scaled too, the MSE
# values reported below are in standardised units rather than hours.
# NOTE(review): re-running this cell rescales already-scaled arrays (x and y are overwritten).
x = s(x)
y = s(y)

A train/test split is needed to measure a model's performance on data it has not seen. The data is split in an 80:20 ratio.

In [None]:
# 80:20 train/test split. random_state is fixed so every model below is evaluated on the
# same held-out rows on each run; without it the reported scores change per execution.
train_x, test_x, train_y, test_y = t(x, y, test_size=0.2, random_state=42)

# 2.1 Random Forest 
We use 100 trees (estimators) for our Random Forest Regressor, with a maximum depth of 4.

In [None]:
# Random forest with 100 trees of depth <= 4. random_state is fixed because the forest's
# bootstrap sampling is random, which otherwise makes the printed metrics vary per run.
rfr = RandomForestRegressor(n_estimators=100, max_depth=4, random_state=42)
rfr.fit(train_x, train_y)
# Predict once and reuse the result for the error metric.
rfr_test_pred = rfr.predict(test_x)
print(f'Score = {rfr.score(test_x,test_y)}')
print(f'MSE = {mt.mean_squared_error(test_y,rfr_test_pred)}')

# 2.2 Linear Regression

In [None]:
# Ordinary least-squares baseline, scored on the held-out split with R2 and MSE.
lr = LinearRegression().fit(train_x, train_y)
lr_test_pred = lr.predict(test_x)
lr_r2 = mt.r2_score(test_y, lr_test_pred)
lr_mse = mt.mean_squared_error(test_y, lr_test_pred)
print(f'Score = {lr_r2}')
print(f'MSE = {lr_mse}')

# 2.3 K-Nearest Neighbors
Here we use 10 neighbors.

In [None]:
# k-nearest-neighbours regressor (k = 10), scored on the held-out split with R2 and MSE.
knr = KNeighborsRegressor(n_neighbors=10).fit(train_x, train_y)
knr_test_pred = knr.predict(test_x)
knr_r2 = mt.r2_score(test_y, knr_test_pred)
knr_mse = mt.mean_squared_error(test_y, knr_test_pred)
print(f'Score = {knr_r2}')
print(f'MSE = {knr_mse}')

# 2.4 Extreme Gradient Boosting

In [None]:
# Gradient-boosted trees with the library's default settings, scored on the held-out split.
xgbr = XGBRegressor()
xgbr.fit(train_x, train_y)
xgbr_score = xgbr.score(test_x, test_y)
xgbr_mse = mt.mean_squared_error(test_y, xgbr.predict(test_x))
print(f'Score = {xgbr_score}')
print(f'MSE = {xgbr_mse}')

# 2.5 Multi-Layer Perceptron
For our Multi-Layer Perceptron Regressor we built a network that starts with a layer of 100 nodes, followed by a hidden layer of 50 nodes, and ends with a single node since it's a regression problem. Training runs for a maximum of 500 iterations.

In [None]:
# Multi-layer perceptron with two hidden layers (100 and 50 neurons).
# `hidden_layer_sizes` lists hidden layers only; MLPRegressor adds the single-value
# output layer itself. A trailing `1` would therefore insert a one-neuron hidden
# bottleneck in front of the output instead of defining the output layer.
# random_state is fixed because weight initialisation is random.
mlpr = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlpr.fit(train_x, train_y)
# Predict once and reuse the result for the error metric.
mlpr_test_pred = mlpr.predict(test_x)
print(f'Score = {mlpr.score(test_x,test_y)}')
print(f'MSE = {mt.mean_squared_error(test_y,mlpr_test_pred)}')