In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere beneath the Kaggle
# input directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV of the HackerEarth employee-burnout challenge into a DataFrame.
train_csv_path = '../input/hackerearth-employee-burnout-challenge/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
data.head(n=5)

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Report, for every column, the percentage of rows whose value is missing.
rows = len(data)
for column in data.columns:
    missing_count = data[column].isna().sum()
    missing_pct = 100 * missing_count / rows
    print('Column {} with missing value = {}%'.format(column, missing_pct))

In [None]:
# Remove, in place, every row that has a missing value in either of these two columns.
required_columns = ['Burn Rate', 'Mental Fatigue Score']
data.dropna(axis='index', subset=required_columns, inplace=True)

In [None]:
# Repeat the per-column missing-value percentage report on the current state of `data`.
rows = data.shape[0]
missing_template = 'Column {} with missing value = {}%'
for name in data.columns:
    print(missing_template.format(name, 100 * data[name].isna().sum() / rows))

In [None]:
# Replace the raw joining date with a numeric value: the number of days between
# each employee's joining date and the most recent joining date in the dataset.
joining_dates = pd.to_datetime(data['Date of Joining'])
recent_joining = joining_dates.max()
print(recent_joining)
# Vectorised subtraction reuses the maximum computed once above (the previous
# per-row lambda recomputed it for every row). Dividing by a one-day Timedelta
# yields float days directly, avoiding a platform-dependent integer cast and a
# hard-coded nanoseconds-per-day constant.
data['Date of Joining'] = (recent_joining - joining_dates) / pd.Timedelta(days=1)

In [None]:
# Pairwise correlation between the numeric columns of `data`.
# The numeric/bool columns are selected explicitly because recent pandas
# versions raise on non-numeric columns instead of silently skipping them;
# this selection behaves the same way on old and new pandas.
data.select_dtypes(include=['number', 'bool']).corr()
## Resource allocation is a big factor in burn rate and mental fatigue score

In [None]:
import seaborn as sns

# Draw the correlation matrix of the numeric columns as a heatmap and save it
# to 'corr_plot.png' in the working directory.
# The numeric/bool columns are selected explicitly because recent pandas
# versions raise on non-numeric columns instead of silently skipping them.
numeric_corr = data.select_dtypes(include=['number', 'bool']).corr()
corr_plot = sns.heatmap(numeric_corr)
figure = corr_plot.get_figure()
figure.savefig('corr_plot.png')

## Insights from the correlation heatmap
    1. Date of Joining is not relevant (at least according to this dataset)
    2. Resource Allocation plays a big part in both the Mental Fatigue Score and the Burn Rate

In [None]:
# Remove, in place, the two columns that are not carried forward into the feature set.
data.drop(columns=['Date of Joining', 'Employee ID'], inplace=True)

In [None]:
# Positional split: the last two columns of `data` are treated as the targets,
# and every column before them as a feature.
# NOTE(review): this relies on the column order of the CSV — confirm the last
# two columns really are the two score columns.
n_target_columns = 2
features = data.iloc[:, :-n_target_columns]
target_variables = data.iloc[:, -n_target_columns:]

In [None]:
# One-hot encode the categorical feature columns (pandas leaves numeric columns
# untouched), then display the resulting model input frame.
train_features = pd.get_dummies(data=features)
train_features

##### We will be using LightGBM, which handles missing (NA) values natively, so the NA values remaining in the features do not need to be imputed

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated grid search over LightGBM hyper-parameters,
# scored by negative mean absolute error (values closer to 0 are better).
burnrate_model = LGBMRegressor()
param_grid = dict(
    n_estimators=[30, 128, 200],
    colsample_bytree=[0.3, 0.7],
    max_depth=[15, 25],
    num_leaves=[50, 100, 120],
    reg_alpha=[1.1, 1.3],
    reg_lambda=[1.1, 1.3],
    min_split_gain=[0.4],
    subsample=[0.7, 0.9],
    subsample_freq=[20],
)

burnrate_gs = GridSearchCV(
    burnrate_model,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,  # use every available CPU core
    verbose=True,
)

# The last target column is the regression target for this search
# (presumably 'Burn Rate' — TODO confirm against the CSV column order).
burnrate_target = target_variables.iloc[:, -1]
# GridSearchCV.fit returns the fitted search object itself.
burnrate_fit = burnrate_gs.fit(train_features, burnrate_target)

In [None]:
# Best mean cross-validated score of the search held in `burnrate_fit`; the scoring
# is 'neg_mean_absolute_error', so this is a negative MAE (closer to 0 is better).
burnrate_fit.best_score_

In [None]:
# Exhaustive 5-fold cross-validated grid search over LightGBM hyper-parameters
# for the second-to-last target column, scored by negative mean absolute error.
# The grid is the same one used for the burn-rate search.
fatigue_model = LGBMRegressor()
param_grid = {
    'n_estimators': [30, 128, 200],
    'colsample_bytree': [0.3, 0.7],
    'max_depth': [15, 25],
    'num_leaves': [50, 100, 120],
    'reg_alpha': [1.1, 1.3],
    'reg_lambda': [1.1, 1.3],
    'min_split_gain': [0.4],
    'subsample': [0.7, 0.9],
    'subsample_freq': [20]
}

fatigue_gs = GridSearchCV(
    estimator=fatigue_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
    verbose=True
)
# Fit the fatigue search object itself. Previously `burnrate_gs.fit(...)` was
# called here, which left `fatigue_gs` unfitted and silently overwrote the
# burn-rate results (GridSearchCV.fit returns the same object it was called on).
# The second-to-last target column is presumably 'Mental Fatigue Score' —
# TODO confirm against the CSV column order.
fatigue_fit = fatigue_gs.fit(train_features, target_variables.iloc[:, -2])

In [None]:
# Best mean cross-validated score of the search held in `fatigue_fit`; the scoring
# is 'neg_mean_absolute_error', so this is a negative MAE (closer to 0 is better).
fatigue_fit.best_score_