In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from math import sqrt
from sklearn.metrics import mean_squared_error
import os

In [2]:
os.chdir("F:\PRANAV\Project\Employee Absentism")

In [3]:
df = pd.read_excel("Absenteeism_at_work_Project.xls")

In [4]:
#Missing Value Analysis
missing_Val = pd.DataFrame(df.isnull().sum())
missing_Val = missing_Val.reset_index()
missing_Val = missing_Val.rename(columns = {'index':'Variables',0:'Missing_Percentage'})
missing_Val['Missing_Percentage'] = (missing_Val['Missing_Percentage']/len(df))*100
missing_Val = missing_Val.sort_values('Missing_Percentage', ascending = False).reset_index(drop=True)
df = df.fillna(df.median())

In [6]:
# removing outliers if any
df = df.rename(columns = {'Absenteeism time in hours':'Absenteeism_time_in_hours'})
q1 = df["Absenteeism_time_in_hours"].quantile(0.25)
q3 = df["Absenteeism_time_in_hours"].quantile(0.75)
iqr = q3-q1 #Interquartile range
fence_low  = q1-1.5*iqr
fence_high = q3+1.5*iqr
df = df.loc[(df["Absenteeism_time_in_hours"] > fence_low) & (df["Absenteeism_time_in_hours"] < fence_high)]

# Another way to detect outlier via 3 standard deviation rule
len(df[((df.Absenteeism_time_in_hours - df.Absenteeism_time_in_hours.mean()) / df.Absenteeism_time_in_hours.std()).abs() < 3])

678

In [7]:
# Converting categorical variables as dummies
y=df["Absenteeism_time_in_hours"].tolist()
df = pd.concat([df, pd.get_dummies(df['Reason for absence'],prefix="Reason for absence")], axis=1)
df = pd.concat([df, pd.get_dummies(df['Month of absence'],prefix="Month of absence")], axis=1)
df = pd.concat([df, pd.get_dummies(df['Day of the week'],prefix="Day of the week")], axis=1)
df = pd.concat([df, pd.get_dummies(df['Seasons'],prefix="Seasons")], axis=1)
df.drop( ['ID','Reason for absence','Month of absence','Day of the week', 'Seasons', 'Transportation expense','Son','Pet','Absenteeism_time_in_hours'], axis=1, inplace=True)
X=df.copy(deep=False)

In [8]:
# Split dataset into train-test as 80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Loading different models
clf1=RandomForestRegressor()
clf2=GradientBoostingRegressor()
clf3=LinearRegression()

In [10]:
# Training all 3 models
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# RMSE for all 3 models

y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

print("RMSE for Random Forest Regressor: ",sqrt(mean_squared_error(y_pred1,y_test)))
print("RMSE for Gradient Boosting Regressor: ",sqrt(mean_squared_error(y_pred2,y_test)))
print("RMSE for Linear Regression Model: ",sqrt(mean_squared_error(y_pred3,y_test)))

RMSE for Random Forest Regressor:  2.6753084861435603
RMSE for Gradient Boosting Regressor:  2.5847308173470864
RMSE for Linear Regression Model:  2.6135087587236496


In [12]:
param_grid = {
            'learning_rate': [0.1,0.01,0.001],
            'max_depth': [3,4, 6, 10, 15],
            'max_features': [0.8, 0.6,None,'log2','sqrt'],
            'n_estimators': [50,100, 200, 300, 1000]
        }
        # Create a based model
gbr = GradientBoostingRegressor()
#Instantiate the grid search model
clf = GridSearchCV(estimator = gbr, param_grid = param_grid,cv = 3)
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'max_depth': [3, 4, 6, 10, 15], 'max_features': [0.8, 0.6, None, 'log2', 'sqrt'], 'n_estimators': [50, 100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
y_pred=clf.predict(X_test)
print("RMSE for Gradient Boosting Regressor after HP tuning: ",sqrt(mean_squared_error(y_pred,y_test)))

RMSE for Gradient Boosting Regressor after HP tuning:  2.5489801166419244


In [14]:
# Example Input file
X_test.to_csv("sample_input.csv",index=False)

In [15]:
newDF=pd.DataFrame()
newDF["Predicted_values"]=y_pred

In [16]:
newDF.to_csv("sample_output.csv",index=False)