### I. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
%matplotlib inline

### II. Dataset Preparation

#### Taking a peek at the dataset and its structure:

In [None]:
# Load the absenteeism CSV from the relative input directory into a DataFrame.
df = pd.read_csv('../input/absenteeism-at-work-an-uci-dataset/Absenteeism_at_work.csv')
# Show the column labels that were read from the file header.
df.columns

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the loaded columns.
df.describe()

In [None]:
# Column dtypes and non-null counts as loaded, before any conversion.
df.info()

#### Converting feature types:
- *Features such as Employee ID, Reason for absence, Month of absence, Day of the week, Seasons, Disciplinary Failure, Education, Son, Social Drinker, Social Smoker and Pet are all categorical features according to their description and should therefore be converted.*

In [None]:
# Columns that hold codes/labels rather than measurements.
cat_cols = [
    'ID', 'Reason for absence', 'Month of absence', 'Day of the week',
    'Seasons', 'Disciplinary failure', 'Education', 'Son',
    'Social drinker', 'Social smoker', 'Pet',
]
# Cast every listed column to the categorical dtype in a single astype call.
df = df.astype(dict.fromkeys(cat_cols, 'category'))

- *Checking if the conversion was successful*

In [None]:
# Re-inspect dtypes: the columns listed in cat_cols should now report 'category'.
df.info()

#### Data Cleaning
- *Let's check for some missing values*

In [None]:
sns.set_palette("GnBu_d")
# Set the default size BEFORE the figure exists; previously plt.title() created
# the figure first, so the new size was not applied to this plot.
plt.rcParams['figure.figsize'] = (8.0, 5.0) #Adjust values as necessary
fig, ax = plt.subplots()
# Each highlighted cell marks a missing value; a uniform map means no gaps.
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title("Missingness Map")

In [None]:
# Non-null counts per column, as a numeric cross-check of the heatmap above.
df.info()

#### Checking Validity of Data:
- *Upon investigation, we've noticed that there are some rows with values of "0" in the Month of absence which doesn't make sense*

In [None]:
# Distinct values present in 'Month of absence'.
pd.unique(df['Month of absence'])

In [None]:
# Rows whose 'Month of absence' is 0.
df[df['Month of absence'] == 0]

- *Deleting these 3 rows (Creating a subset where these 3 rows are filtered):*

In [None]:
# Keep only rows with a non-zero month. The explicit .copy() makes the result an
# independent frame, so later in-place assignments never act on a filtered slice.
df = df[df['Month of absence'] != 0].copy()
# Category 0 no longer has any rows; remove it so it does not appear as an
# empty level in later group-bys and plots.
df['Month of absence'] = df['Month of absence'].cat.remove_unused_categories()
df.info()

- *Upon investigation, we've noticed that there are 41 rows with values of "0" in the Absenteeism time in hours feature. We believe these are late cases, which is why the disciplinary failure values for almost all of these cases are 1.*

In [None]:
# First rows among those with zero recorded absenteeism hours.
df[(df['Absenteeism time in hours'] == 0)].head()

- *Of these 41 rows, 1 row seems suspicious because it is the only one that has a disciplinary failure value of 0 and it has a valid reason for absence.*

In [None]:
# Zero-hour rows that do NOT have a disciplinary failure flagged.
df[(df['Absenteeism time in hours'] == 0) & (df['Disciplinary failure'] == 0)]

- *To fix the 0 absenteeism time for this row (row 134), we'll replace it with the average absenteeism time for reason for absence 27*

In [None]:
# Average absenteeism hours for reason 27, selected by its label rather than by
# row position (a positional slice breaks as soon as the set of reasons changes).
# Only the target column is aggregated, so categorical columns are never averaged.
df.groupby("Reason for absence")[["Absenteeism time in hours"]].mean().loc[[27]]

- *We'll replace the value with 3 hours*

In [None]:
# Capture the affected row labels first: once the value is overwritten, the
# zero-hours condition no longer matches those rows.
zero_hours_reason_27 = (df['Reason for absence'] == 27) & (df['Absenteeism time in hours'] == 0)
fixed_rows = df.index[zero_hours_reason_27]
df.loc[fixed_rows, 'Absenteeism time in hours'] = 3
# Show the rows that were just updated, looked up by label instead of an
# unrelated hard-coded position.
df.loc[fixed_rows, 'Absenteeism time in hours']

### III. Exploratory Data Analysis

#### Count Plots for Categorical Features:

In [None]:
# Top figure: the first three categorical features, one panel each.
fig, axs = plt.subplots(nrows = 1, ncols = 3, figsize = (36,8.5))
for ax, feature in zip(axs, cat_cols[0:3]):
    # Bars are ordered from the most to the least frequent level.
    by_frequency = df[feature].value_counts().index
    sns.countplot(data = df, y = feature, order = by_frequency,
                  orient = "h", palette = "mako", ax = ax)
fig.suptitle("Total Count of Filed Absences", fontsize=30)

# Bottom figure: the remaining eight categorical features.
fig, axs = plt.subplots(nrows = 1, ncols = 8, figsize = (36,4))
for ax, feature in zip(axs, cat_cols[3:11]):
    by_frequency = df[feature].value_counts().index
    sns.countplot(data = df, y = feature, order = by_frequency,
                  orient = "h", palette = "mako", ax = ax)

- *Employee with employee ID 3 has the most number of filed absences.*
- *The top filed reason for absence is reason no. 23, which refers to Medical Consultations.*
- *The month with the most number of filed absences is 3 which refers to March.*
- *The day of the week with the most filed absences is 2, which refers to Monday. The other days are not far behind.*
- *The season which has the most number of filed absences is 4 which refers to the spring season.*
- *Almost all of the filed absences have a disciplinary failure value of 0.*
- *A large chunk of the filed absences are from employees with an "education value" of 1, which refers to high school graduates.*
- *Employees with a "son value" of 0, which refers to employees with no children, file more absences than employees with children.*
- *Social drinkers have more filed cases of absence than non-social drinkers*
- *Non-smokers have more filed cases of absence than social smokers*

In [None]:
from IPython.display import display_html 

def styler(feature):
    """Return a styled table of the five levels of `feature` with the most records.

    Records are counted per level via the non-null entries of
    'Absenteeism time in hours'; the table is rendered inline, without its index.
    """
    grouped = df.groupby([feature], as_index = False).count()
    counts = grouped[[feature, 'Absenteeism time in hours']]
    counts = counts.sort_values('Absenteeism time in hours', ascending = False)
    counts.columns = [feature, 'Total Count of Hours']
    top_five = counts.head(5).style
    top_five = top_five.set_table_attributes("style='display:inline'")
    top_five = top_five.set_caption("Top 5 " + str(feature) + "s -Counts" )
    return top_five.hide_index()

# Concatenate the three HTML tables so they render side by side.
tables = [styler(name)._repr_html_() for name in ("ID", "Reason for absence", "Month of absence")]
display_html("".join(tables), raw=True)

#### Looking at total Absenteeism time in hours:

- *Now let's look at the total Absenteeism time in hours for each categorical variable*

In [None]:
target = "Absenteeism time in hours"

# Top figure: total hours per level for the first three categorical features.
fig, axs = plt.subplots(ncols = 3,nrows = 1, figsize = (36,8.5))
for i,j in enumerate(cat_cols[0:3]):
    # Sum only the target column: summing the whole frame would also try to add
    # up the categorical columns, which newer pandas rejects with a TypeError.
    df_sum = df.groupby([j],as_index = False)[[target]].sum()
    sns.barplot(y = j, x = target, ax = axs[i],
                data = df_sum,orient = "h", 
                order=df_sum.sort_values(target,ascending = False)[j],
                palette = "mako")
fig.suptitle("Total Absenteeism time in hours", fontsize=30)

# Bottom figure: the remaining eight categorical features.
fig, axs = plt.subplots(ncols = 8,nrows = 1, figsize = (36,5))
for i,j in enumerate(cat_cols[3:11]):
    df_sum = df.groupby([j],as_index = False)[[target]].sum()
    sns.barplot(y = j, x = target, ax = axs[i],
                data = df_sum,orient = "h", 
                order=df_sum.sort_values(target,ascending = False)[j],
                palette = "mako")

In [None]:
from IPython.display import display_html 

def styler(feature):
    """Return a styled table of the five levels of `feature` with the most total hours.

    Hours are summed per level from 'Absenteeism time in hours' only, so
    categorical columns are never fed into the aggregation. The table is
    rendered inline and without its index.
    """
    hours = 'Absenteeism time in hours'
    counts = df.groupby([feature], as_index = False)[[hours]].sum().sort_values(hours, ascending = False)
    counts.columns = [feature, 'Sum of Hours']
    # The caption now says "Sum": this table shows totals, not counts.
    style = counts.head(5).style.set_table_attributes("style='display:inline'").set_caption("Top 5 " + str(feature) + "s - Sum" ).hide_index()
    return style

display_html(styler("ID")._repr_html_() + styler("Reason for absence")._repr_html_() + styler("Month of absence")._repr_html_(), raw=True)

- *Employee with ID 3 has the highest total absenteeism time in hours*
- *Reason 13, which refers to "Diseases of the musculoskeletal system and connective tissue", has the highest total absenteeism time in hours (800+ hours) among all reasons.*
- *The month with the highest total absenteeism time in hours is month 3 which refers to March.*
- *The day of the week which has the highest total number of absenteeism hours is 2 which refers to Monday.*
- *The season which has the highest total number of absenteeism hours is 3 which refers to the winter season.*
- *The total absenteeism time of employees with an "education value" of 1, which refers to high school graduates, is more than 4000 hours. This is the highest among all classes in the Education category.*
- *The total absenteeism time of employees with a "son value" of 2, which refers to employees with 2 children, is the highest among all classes in the Son category.*

#### Try - Average Absenteeism time in hours:

In [None]:
target = "Absenteeism time in hours"

# Top figure: average hours per level for the first three categorical features.
fig, axs = plt.subplots(ncols = 3,nrows = 1, figsize = (36,8.5))
for i,j in enumerate(cat_cols[0:3]):
    # Average only the target column: averaging the whole frame would also hit
    # the categorical columns, which newer pandas rejects with a TypeError.
    df_mean = df.groupby([j],as_index = False)[[target]].mean()
    sns.barplot(y = j, x = target, ax = axs[i],
                data = df_mean,orient = "h", 
                order=df_mean.sort_values(target,ascending = False)[j],
                palette = "mako")
fig.suptitle("Average Absenteeism time in hours", fontsize=30)

# Bottom figure: the remaining eight categorical features.
fig, axs = plt.subplots(ncols = 8,nrows = 1, figsize = (36,4))
for i,j in enumerate(cat_cols[3:11]):
    df_mean = df.groupby([j],as_index = False)[[target]].mean()
    sns.barplot(y = j, x = target, ax = axs[i],
                data = df_mean,orient = "h", 
                order=df_mean.sort_values(target,ascending = False)[j],
                palette = "mako")

In [None]:
from IPython.display import display_html 

def styler(feature):
    """Return a styled table of the five levels of `feature` with the highest average hours.

    The mean is taken per level over 'Absenteeism time in hours' only, so
    categorical columns are never fed into the aggregation. The table is
    rendered inline and without its index.
    """
    hours = 'Absenteeism time in hours'
    counts = df.groupby([feature], as_index = False)[[hours]].mean().sort_values(hours, ascending = False)
    counts.columns = [feature, 'Average Number of Hours']
    style = counts.head(5).style.set_table_attributes("style='display:inline'").set_caption("Top 5 " + str(feature) + "s - Average" ).hide_index()
    return style

display_html(styler("ID")._repr_html_() + styler("Reason for absence")._repr_html_() + styler("Month of absence")._repr_html_(), raw=True)

#### Numerical Features:

- *Check for correlations*

In [None]:
# Build the numeric column list in a deterministic order (frame order, target
# first). Deriving it from a set and then permuting by hard-coded positions gave
# a different layout from one kernel session to the next.
target = 'Absenteeism time in hours'
num_cols = [target] + [col for col in df.columns
                       if col not in cat_cols and col != target]
# Both pairplots include the target (num_cols[0]); the other numeric features
# are split between them.
first = num_cols[0:5]
second = [col for col in num_cols if col not in first[1:]]  # list, not set: valid column indexer
pairplot1 = sns.pairplot(data = df[first])
pairplot1.fig.set_size_inches(8,8)
pairplot1.fig.suptitle("Pairplot 1", y = 1.03, size = 20)
pairplot2 = sns.pairplot(data = df[second])
pairplot2.fig.set_size_inches(8,8)
pairplot2.fig.suptitle("Pairplot 2", y = 1.03, size = 20)

- *Correlation Plot*

In [None]:
plt.rcParams['figure.figsize'] = (10, 8)
# Pairwise correlations between the numeric columns.
numeric_corr = df[num_cols].corr()
plt.title("Correlation Plot")
sns.heatmap(numeric_corr, cmap = "YlGnBu")

- *Among all numerical features, nothing seems to be highly correlated with Absenteeism time in hours. Weight and BMI are, unsurprisingly, highly correlated, which is reflected in the heatmap.*

#### Looking at Sum, Average and Count using Numerical Variables

In [None]:
target = 'Absenteeism time in hours'
sel_numf = ['Age','Service time','Transportation expense','Distance from Residence to Work','Hit target', 'Pet', 'Son','Body mass index']
for f in sel_numf:
    fig, axs = plt.subplots(ncols = 3,nrows = 1, figsize = (35,4.5))
    # Aggregate only the target column; aggregating the whole frame would also
    # try to average/sum the categorical columns, which newer pandas rejects.
    grouped = df.groupby(f, as_index=False)[[target]]
    panels = [
        (grouped.mean(), 'Average Absenteeism Hours by '),
        (grouped.sum(), 'Total Absenteeism Hours by '),
        (grouped.count(), 'Count of Absences by '),
    ]
    # One panel per statistic, bars sorted from the largest to the smallest value.
    for ax, (summary, title) in zip(axs, panels):
        sns.barplot(x = f, y = target,
                    data = summary, orient = "v", ax = ax,
                    order = summary.sort_values(target, ascending = False)[f],
                    palette = "viridis").set_title(title + str(f))
    fig.suptitle(str(f) + " and Absenteeism Time", fontsize = 20)

### IV. Classification using XGB

- *Instead of regression, we'll apply classification to the dataset to predict the number of absenteeism hours of an employee given a set of features. We'll do this by segmenting the target variable into multiple classes.*

#### Mapping the Target Variable:

In [None]:
# Class labels, and the right-inclusive bin edges they correspond to.
names = ['<=4', '4-8', '8-16', '>16']
bin_edges = [-1, 4, 8, 16, np.inf]
# Replace the numeric hours with their class label.
df['Absenteeism time in hours'] = pd.cut(
    df['Absenteeism time in hours'],
    bins = bin_edges,
    labels = names,
)
df.head(6)

- *Checking:*

In [None]:
# Preview the first six rows again, now with the binned target column.
df.head(6)

In [None]:
# Number of records per target class (non-null 'ID' entries in each group).
df.groupby('Absenteeism time in hours')['ID'].count()

#### XGB Classifier

- *Encoding Categorical Features* 

In [None]:
# Check column dtypes before encoding the categorical features.
df.info()

In [None]:
# Remove the employee identifier column from the modelling frame.
df = df.drop(columns = ['ID'])

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode every categorical column except the target and the columns
# named in toexclude.
todummify = list(df.select_dtypes(include=['category']).columns)
toexclude = {'Absenteeism time in hours', 'Son', 'Pet', 'Education'}
todummify = [var for var in todummify if var not in toexclude]
enc = OneHotEncoder(drop = 'first')
encoded = enc.fit_transform(df[todummify]).toarray()
# get_feature_names was replaced by get_feature_names_out in newer scikit-learn;
# use whichever this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    dummy_names = enc.get_feature_names_out(todummify)
else:
    dummy_names = enc.get_feature_names(todummify)
# Reuse df's index so the join aligns row for row. With a fresh 0..n-1 index the
# inner join silently drops or mismatches rows whenever df's index has gaps.
enc_df = pd.DataFrame(encoded, columns = dummy_names, index = df.index)
df = df.join(enc_df,how='inner')
df.drop(todummify, axis = 1, inplace = True)
# Mark exactly the generated dummy columns as categorical (instead of guessing
# them from an underscore in the column name).
tocategorify = list(enc_df.columns)
df[tocategorify] = df[tocategorify].astype('category')
df.columns

- *Defining independent variables and target variable (X & y)*
- *We dropped the ID column as it definitely would not help in the prediction*

In [None]:
tv = 'Absenteeism time in hours'
# Target is the binned hours column; the predictors are everything else.
y = df[tv]
X = df.drop(columns = tv)
print("Independent Variables")
display(X.head(20))
print("Target Variable")
display(y.to_frame().head(20))

- *Removing MultiCollinearity Issues from Numeric Features using Variance Inflation Factor*

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Public API for "numeric columns only" (replaces the private _get_numeric_data).
X_numeric = X.select_dtypes(include='number')
# Add an intercept column before computing the VIFs.
X_numeric = add_constant(X_numeric)
VIF_frame = pd.DataFrame(
    {'VIF': [variance_inflation_factor(X_numeric.values, i)
             for i in range(X_numeric.shape[1])]},
    index=X_numeric.columns)
# The intercept's own VIF is not of interest.
VIF_frame = VIF_frame.drop(index='const')
# Hide rows whose VIF is NaN or infinite. axis is passed by keyword because
# newer pandas no longer accepts it positionally.
VIF_frame[~VIF_frame.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

- *Dropping Weight Feature and checking VIFs again* 

In [None]:
# errors='ignore' keeps this cell re-runnable once 'Weight' is already gone.
X = X.drop(columns='Weight', errors='ignore')
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Public API for "numeric columns only" (replaces the private _get_numeric_data).
X_numeric = X.select_dtypes(include='number')
# Add an intercept column before computing the VIFs.
X_numeric = add_constant(X_numeric)
VIF_frame = pd.DataFrame(
    {'VIF': [variance_inflation_factor(X_numeric.values, i)
             for i in range(X_numeric.shape[1])]},
    index=X_numeric.columns)
# The intercept's own VIF is not of interest.
VIF_frame = VIF_frame.drop(index='const')
# Hide rows whose VIF is NaN or infinite. axis is passed by keyword because
# newer pandas no longer accepts it positionally.
VIF_frame[~VIF_frame.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [None]:
# Convert every remaining categorical predictor to float.
all_cat = list(X.select_dtypes(include=['category']).columns)
X = X.astype(dict.fromkeys(all_cat, 'float'))

- *Splitting to Test & Train* 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, random_state = 823, test_size = 0.25)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

- *Overview of Feature Importance using xgbc's feature_importances_* 

In [None]:
import xgboost as xgb
# Baseline classifier with default hyperparameters, fitted on the training split.
xgbc = xgb.XGBClassifier(random_state = 823)
xgbc.fit(X_train,y_train)
# Pair every feature name with its importance score and rank from high to low.
features = list(X_train.columns)
feature_value = xgbc.feature_importances_
fi = pd.DataFrame({'Features' : features, 'Values' : feature_value})
fi = fi.sort_values('Values', ascending = False).reset_index()
plt.rcParams['figure.figsize'] = (20.0, 10.0)
ax = sns.barplot(x = fi['Values'], y = fi['Features'],
                 data = fi, orient = "h", palette="Blues_d")

- *Feature Scaling* 

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Public API for "numeric columns only" (replaces the private _get_numeric_data).
numeric_cols = list(X_train.select_dtypes(include='number').columns)
# Cast to float up front. astype returns new frames, so the split outputs are
# not mutated in place and integer columns are not silently upcast on assignment.
X_train = X_train.astype(dict.fromkeys(numeric_cols, 'float64'))
X_test = X_test.astype(dict.fromkeys(numeric_cols, 'float64'))
# Fit the scaler on the training split only, then apply the same statistics to
# the test split.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
display(X_train.head())
display(X_test.head())

- *Quick Peek at Train & Test Accuracy* 

In [None]:
import sklearn.metrics as metrics
# xgbc was originally fitted on the UNSCALED training features, but X_train and
# X_test have been standardised since. Refit on the scaled data so the model is
# evaluated on the same feature scale it was trained on.
xgbc.fit(X_train, y_train)
y_pred_test = xgbc.predict(X_test)
y_pred_train = xgbc.predict(X_train)
print("Test Accuracy:",round(metrics.accuracy_score(y_test, y_pred_test),4))
print("Train Accuracy:",round(metrics.accuracy_score(y_train, y_pred_train),4))

- *Hyperparameter Optimization with Cross Validation* 

In [None]:
from sklearn.model_selection import GridSearchCV
# Single-point grid: one candidate value per hyperparameter.
parameters = [dict(n_estimators = [5600],
                   max_depth = [1],
                   learning_rate = [0.005])]
# 10-fold cross-validation scored by balanced accuracy.
xgb_clf = GridSearchCV(estimator = xgbc, param_grid = parameters,
                       scoring = 'balanced_accuracy', cv = 10)
xgb_clf.fit(X_train,y_train)
print("Best Parameter Values: ")
pd.DataFrame.from_dict(xgb_clf.best_params_, orient = 'index', columns = ['Values'])

- *Predicting* 

In [None]:
# GridSearchCV (refit=True by default) has already refitted the best estimator
# on the full training split, so fitting it again here only repeats an
# expensive training run.
best_xgbc_model = xgb_clf.best_estimator_
predictions = best_xgbc_model.predict(X_test)
predictions[0:10]

### V. Model Evaluation

- *Evaluation - Classification Report and Confusion Matrix* 

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Use every class seen in either the truth or the predictions, and pass the same
# list to confusion_matrix so the matrix shape always matches the axis labels.
class_labels = sorted(set(y_test) | set(predictions))
data = confusion_matrix(y_test, predictions, labels=class_labels)
df_cm = pd.DataFrame(data, index=class_labels, columns=class_labels)
# scikit-learn puts the TRUE classes on the rows and the PREDICTED classes on
# the columns.
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.5)
ax = sns.heatmap(df_cm,cmap = 'Greens', annot=True,annot_kws={"size": 16}, fmt = 'g')
ax.set_title('Confusion Matrix')
print("Classification Report: ")
print(classification_report(y_test,predictions))

- *Evaluation - ROC Curve and Area Under the Curve* 

In [None]:
import scikitplot as skplt
y_true = y_test
# Per-class probabilities from the tuned model, as required by plot_roc.
y_probas = best_xgbc_model.predict_proba(X_test)
skplt.metrics.plot_roc(y_true, y_probas, 
                             title = 'ROC Curve',
                             figsize = (12,8))
# Turn the grid on. The old `b=` keyword was renamed in Matplotlib, and the
# string 'Whitegrid' only worked because any non-empty string is truthy.
plt.grid(True)

- *Evaluation - Check for Overfitting* 

In [None]:
from sklearn.metrics import accuracy_score
# Compare accuracy on seen (train) and unseen (test) data; a large gap points to
# overfitting. The function imported in this cell is called directly, so the
# cell does not depend on an alias imported elsewhere.
predictions2 = best_xgbc_model.predict(X_train)
print("Test Accuracy:",round(accuracy_score(y_test, predictions),4))
print("Train Accuracy:",round(accuracy_score(y_train, predictions2),4))