In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackerearth-employee-burnout-challenge/sample_submission.csv
/kaggle/input/hackerearth-employee-burnout-challenge/train.csv
/kaggle/input/hackerearth-employee-burnout-challenge/test.csv


In [41]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import optuna
import xgboost as xgb

from sklearn.metrics import r2_score
import pickle

In [3]:
train_df = pd.read_csv('/kaggle/input/hackerearth-employee-burnout-challenge/train.csv', header = 0)
test_df = pd.read_csv('/kaggle/input/hackerearth-employee-burnout-challenge/test.csv', header = 0)
train_df.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [4]:
train_df['year'] = pd.DatetimeIndex(train_df['Date of Joining']).year

In [5]:
train_df['year'].value_counts()

2008    22750
Name: year, dtype: int64

In [6]:
train_df.drop(['Employee ID', 'Date of Joining', 'year'], axis = 1, inplace = True)
test_df.drop(['Employee ID', 'Date of Joining'], axis = 1, inplace = True)

In [7]:
train_df.head()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,Female,Service,No,2.0,3.0,3.8,0.16
1,Male,Service,Yes,1.0,2.0,5.0,0.36
2,Female,Product,Yes,2.0,,5.8,0.49
3,Male,Service,Yes,1.0,1.0,2.6,0.2
4,Female,Service,No,3.0,7.0,6.9,0.52


In [8]:
train_df.shape

(22750, 7)

In [9]:
train_df.columns

Index(['Gender', 'Company Type', 'WFH Setup Available', 'Designation',
       'Resource Allocation', 'Mental Fatigue Score', 'Burn Rate'],
      dtype='object')

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Gender                22750 non-null  object 
 1   Company Type          22750 non-null  object 
 2   WFH Setup Available   22750 non-null  object 
 3   Designation           22750 non-null  float64
 4   Resource Allocation   21369 non-null  float64
 5   Mental Fatigue Score  20633 non-null  float64
 6   Burn Rate             21626 non-null  float64
dtypes: float64(4), object(3)
memory usage: 1.2+ MB


In [11]:
train_df.isnull().sum()

Gender                     0
Company Type               0
WFH Setup Available        0
Designation                0
Resource Allocation     1381
Mental Fatigue Score    2117
Burn Rate               1124
dtype: int64

In [12]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Designation,22750.0,2.178725,1.135145,0.0,1.0,2.0,3.0,5.0
Resource Allocation,21369.0,4.481398,2.047211,1.0,3.0,4.0,6.0,10.0
Mental Fatigue Score,20633.0,5.728188,1.920839,0.0,4.6,5.9,7.1,10.0
Burn Rate,21626.0,0.452005,0.198226,0.0,0.31,0.45,0.59,1.0


In [13]:
train_df['Resource Allocation'].fillna(4.0, inplace = True)
train_df['Mental Fatigue Score'].fillna(5.7, inplace = True)
train_df['Burn Rate'].fillna(0.45, inplace = True)

In [14]:
train_df.isnull().sum()

Gender                  0
Company Type            0
WFH Setup Available     0
Designation             0
Resource Allocation     0
Mental Fatigue Score    0
Burn Rate               0
dtype: int64

In [15]:
test_df.isnull().sum()

Gender                  0
Company Type            0
WFH Setup Available     0
Designation             0
Resource Allocation     0
Mental Fatigue Score    0
dtype: int64

In [16]:
train_df['Gender'].value_counts()

Female    11908
Male      10842
Name: Gender, dtype: int64

In [17]:
train_df['WFH Setup Available'].value_counts()

Yes    12290
No     10460
Name: WFH Setup Available, dtype: int64

In [18]:
train_df['Company Type'].value_counts()

Service    14833
Product     7917
Name: Company Type, dtype: int64

In [19]:
cleanup_cols = {"Gender":     {"Female": 0, "Male": 1},
                "Company Type": {"Service": 0, "Product": 1},
                 "WFH Setup Available": {"Yes": 0, "No": 1}}

In [20]:
train_df = train_df.replace(cleanup_cols)
train_df.head()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,0,0,1,2.0,3.0,3.8,0.16
1,1,0,0,1.0,2.0,5.0,0.36
2,0,1,0,2.0,4.0,5.8,0.49
3,1,0,0,1.0,1.0,2.6,0.2
4,0,0,1,3.0,7.0,6.9,0.52


In [21]:
test_df = test_df.replace(cleanup_cols)
test_df.head()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,0,0,1,2.0,5.0,7.7
1,0,1,0,1.0,2.0,5.2
2,1,1,0,1.0,3.0,5.9
3,0,0,1,3.0,6.0,4.6
4,0,1,1,2.0,5.0,6.4


In [22]:
feature_cols = train_df.drop(['Burn Rate'], axis=1).columns
x = train_df[feature_cols]
y = train_df['Burn Rate']

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [33]:
clf = xgb.XGBRegressor(random_state = 42)
clf.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=2, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
predictions = clf.predict(x_test)
errors = abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 3))

Mean Absolute Error: 0.054


In [26]:
def objective(trial,data=x,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    
    # To select which parameters to optimize, please look at the XGBoost documentation:
    # https://xgboost.readthedocs.io/en/latest/parameter.html
    param = {
        'tree_method':'gpu_hist',  # Use GPU acceleration
        'lambda': trial.suggest_loguniform(
            'lambda', 1e-3, 10.0
        ),
        'alpha': trial.suggest_loguniform(
            'alpha', 1e-3, 10.0
        ),
        'colsample_bytree': trial.suggest_categorical(
            'colsample_bytree', [0.5,0.6,0.7,0.8,0.9,1.0]
        ),
        'subsample': trial.suggest_categorical(
            'subsample', [0.6,0.7,0.8,1.0]
        ),
        'learning_rate': trial.suggest_categorical(
            'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]
        ),
        'n_estimators': trial.suggest_categorical(
            "n_estimators", [150, 200, 300, 3000]
        ),
        'max_depth': trial.suggest_categorical(
            'max_depth', [4,5,7,9,11,13,15,17]
        ),
        'random_state': 42,
        'min_child_weight': trial.suggest_int(
            'min_child_weight', 1, 300
        ),
    }
    model = xgb.XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [35]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-05-15 10:54:14,918][0m A new study created in memory with name: no-name-d765944c-7814-4b22-a72a-442bcabf549b[0m
[32m[I 2021-05-15 10:54:23,010][0m Trial 0 finished with value: 0.07201447563605116 and parameters: {'lambda': 0.026251202097649265, 'alpha': 0.8832358551224228, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.009, 'n_estimators': 3000, 'max_depth': 13, 'min_child_weight': 215}. Best is trial 0 with value: 0.07201447563605116.[0m
[32m[I 2021-05-15 10:54:23,459][0m Trial 1 finished with value: 0.07827512638810963 and parameters: {'lambda': 0.00914156403527688, 'alpha': 3.085294911554924, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.012, 'n_estimators': 200, 'max_depth': 13, 'min_child_weight': 297}. Best is trial 0 with value: 0.07201447563605116.[0m
[32m[I 2021-05-15 10:54:23,908][0m Trial 2 finished with value: 0.07506079777795903 and parameters: {'lambda': 0.012065039532697408, 'alpha': 3.8373539909030008, 'colsample_byt

Number of finished trials: 50
Best trial: {'lambda': 0.001029078601455882, 'alpha': 0.4986209509535147, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.016, 'n_estimators': 300, 'max_depth': 13, 'min_child_weight': 29}


In [36]:
study.trials_dataframe().head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_subsample,state
0,0,0.072014,2021-05-15 10:54:14.923109,2021-05-15 10:54:23.009836,0 days 00:00:08.086727,0.883236,0.9,0.026251,0.009,13,215,3000,0.8,COMPLETE
1,1,0.078275,2021-05-15 10:54:23.013689,2021-05-15 10:54:23.458320,0 days 00:00:00.444631,3.085295,0.8,0.009142,0.012,13,297,200,0.7,COMPLETE
2,2,0.075061,2021-05-15 10:54:23.462102,2021-05-15 10:54:23.908087,0 days 00:00:00.445985,3.837354,0.9,0.012065,0.014,7,172,200,0.6,COMPLETE
3,3,0.071323,2021-05-15 10:54:23.912181,2021-05-15 10:54:27.755965,0 days 00:00:03.843784,0.772591,1.0,0.007229,0.012,11,136,3000,1.0,COMPLETE
4,4,0.090536,2021-05-15 10:54:27.757499,2021-05-15 10:54:28.320288,0 days 00:00:00.562789,0.917738,1.0,0.025628,0.008,7,1,150,0.6,COMPLETE


In [37]:
study.best_params

{'lambda': 0.001029078601455882,
 'alpha': 0.4986209509535147,
 'colsample_bytree': 1.0,
 'subsample': 0.8,
 'learning_rate': 0.016,
 'n_estimators': 300,
 'max_depth': 13,
 'min_child_weight': 29}

In [38]:
best_params = study.best_params
best_params['tree_method'] = 'gpu_hist'
best_params['random_state'] = 42

clf = xgb.XGBRegressor(**(best_params))

clf.fit(x_train, y_train)

XGBRegressor(alpha=0.4986209509535147, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
             gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', lambda=0.001029078601455882,
             learning_rate=0.016, max_delta_step=0, max_depth=13,
             min_child_weight=29, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=2, num_parallel_tree=1, random_state=42,
             reg_alpha=0.498620957, reg_lambda=0.00102907862,
             scale_pos_weight=1, subsample=0.8, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

In [39]:
predictions = clf.predict(x_test)
errors = abs(predictions - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 3))

Mean Absolute Error: 0.052


In [42]:
filename = 'model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [45]:
def predict(data):
    data = pd.DataFrame(data, columns = ['Gender', 'Company Type', 'WFH Setup Available', 'Designation','Resource Allocation', 'Mental Fatigue Score'])
    model = pickle.load(open('model.sav', 'rb'))
    prediction = model.predict(data)
    print('Burn Rate: ', prediction[0])

In [54]:
data = [[0, 1, 0, 2.0, 4.0, 6.8]]
predict(data)

Burn Rate:  0.508228
