<a href="https://colab.research.google.com/github/redwankarimsony/hackerearth_employee_burnout/blob/main/HackerEarth_(Employee_Burnout_Challenge_2020).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np, os
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import train_test_split


HOME_DIR = '/content/'
DATA_DIR = os.path.join(HOME_DIR, 'data')

In [2]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

## Utility Functions

In [3]:
def get_days(d0, d1):
    d0 = pd.to_datetime(d0)
    d1 = pd.to_datetime(d1)
    delta = d1 - d0
    return delta.days

In [4]:
train_df.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  float64
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1.6+ MB


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12250 entries, 0 to 12249
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           12250 non-null  object 
 1   Date of Joining       12250 non-null  object 
 2   Gender                12250 non-null  object 
 3   Company Type          12250 non-null  object 
 4   WFH Setup Available   12250 non-null  object 
 5   Designation           12250 non-null  float64
 6   Resource Allocation   12250 non-null  float64
 7   Mental Fatigue Score  12250 non-null  float64
dtypes: float64(3), object(5)
memory usage: 765.8+ KB


In [7]:
train_df.dropna(inplace=True)

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18590 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           18590 non-null  object 
 1   Date of Joining       18590 non-null  object 
 2   Gender                18590 non-null  object 
 3   Company Type          18590 non-null  object 
 4   WFH Setup Available   18590 non-null  object 
 5   Designation           18590 non-null  float64
 6   Resource Allocation   18590 non-null  float64
 7   Mental Fatigue Score  18590 non-null  float64
 8   Burn Rate             18590 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1.4+ MB


In [9]:
dataset = [train_df, test_df]

for data in dataset:
    data['Date of Joining'] = pd.to_datetime(data['Date of Joining'])
    data['Gender'] = [1 if (gender == 'Male') else 0  for gender in data.Gender]
    data['Company Type'] = [1 if (ctype == 'Service') else 0  for ctype in data['Company Type']]
    data['WFH Setup Available'] = [1 if (wfh == 'Yes') else 0  for wfh in data['WFH Setup Available']]
    data['JobDuration'] = [get_days(d, '2009-2-1') for d in data['Date of Joining']]


# train_df['Date of Joining'] = pd.to_datetime(train_df['Date of Joining'])
# train_df['Gender'] = [1 if (gender == 'Male') else 0  for gender in train_df.Gender]
# train_df['Company Type'] = [1 if (ctype == 'Service') else 0  for ctype in train_df['Company Type']]
# train_df['WFH Setup Available'] = [1 if (wfh == 'Yes') else 0  for wfh in train_df['WFH Setup Available']]
# train_df['JobDuration'] = [get_days(d, '2009-2-1') for d in train_df['Date of Joining']]

In [10]:
test_df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,JobDuration
0,fffe31003300390039003000,2008-12-10,0,1,0,2.0,5.0,7.7,53
1,fffe31003300310037003800,2008-08-14,0,0,1,1.0,2.0,5.2,171
2,fffe33003400380035003900,2008-11-13,1,0,1,1.0,3.0,5.9,80
3,fffe3100370039003200,2008-02-07,0,1,0,3.0,6.0,4.6,360
4,fffe32003600390036003700,2008-07-17,0,0,0,2.0,5.0,6.4,199
...,...,...,...,...,...,...,...,...,...
12245,fffe3900310034003700,2008-10-02,0,1,1,1.0,2.0,6.1,122
12246,fffe32003600330034003000,2008-03-31,0,0,1,2.0,4.0,5.9,307
12247,fffe31003800340039003000,2008-02-12,1,1,0,4.0,7.0,9.6,355
12248,fffe32003600380031003800,2008-02-06,1,1,0,3.0,6.0,6.7,361


In [11]:
train_df['Date of Joining'].min(), train_df['Date of Joining'].max()

(Timestamp('2008-01-01 00:00:00'), Timestamp('2008-12-31 00:00:00'))

In [12]:
test_df['Date of Joining'].min(), test_df['Date of Joining'].max()

(Timestamp('2008-01-01 00:00:00'), Timestamp('2008-12-31 00:00:00'))

In [13]:
train_df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,JobDuration
0,fffe32003000360033003200,2008-09-30,0,1,0,2.0,3.0,3.8,0.16,124
1,fffe3700360033003500,2008-11-30,1,1,1,1.0,2.0,5.0,0.36,63
3,fffe32003400380032003900,2008-11-03,1,1,1,1.0,1.0,2.6,0.20,90
4,fffe31003900340031003600,2008-07-24,0,1,0,3.0,7.0,6.9,0.52,192
5,fffe3300350037003500,2008-11-26,1,0,1,2.0,4.0,3.6,0.29,67
...,...,...,...,...,...,...,...,...,...,...
22743,fffe3300390030003600,2008-12-15,0,0,1,1.0,3.0,6.0,0.48,48
22744,fffe32003500370033003200,2008-05-27,1,0,0,3.0,7.0,6.2,0.54,250
22746,fffe33003000350031003800,2008-01-19,0,0,1,3.0,6.0,6.7,0.59,379
22748,fffe33003300320036003900,2008-01-10,0,1,0,2.0,5.0,5.9,0.52,388


In [14]:
train_df['Company Type'].value_counts()

1    12174
0     6416
Name: Company Type, dtype: int64

# Data Staging


In [15]:
# Selected features for training
features = ['Gender',   'JobDuration',  'Company Type',	'WFH Setup Available', 	
            'Designation', 'Resource Allocation',	'Mental Fatigue Score']

# Saving Normalizing Parameters for future use with train and test set
feature_min = train_df[features].min().values
feature_max = train_df[features].max().values

# Normalization 
train_df_norm = (train_df[features] - feature_min)/ (feature_max - feature_min)
X_test = (test_df[features] - feature_min)/ (feature_max - feature_min)

y = train_df['Burn Rate']
X_train, X_valid, y_train, y_valid = train_test_split( train_df_norm, y, test_size=0.2, random_state=1234)

# Modeling LightGBM


In [33]:
# LGBM Parameters
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2', 'auc'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 100000,
    "n_estimators": 1000
}


gbm = lgb.LGBMRegressor(**hyper_params)




In [None]:
gbm.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=1000)

In [24]:
y_test_predicted = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

In [19]:
submission = pd.DataFrame({'Employee ID' : test_df['Employee ID'], 'Burn Rate': y_test_predicted})
submission.to_csv('submission_auc_4.csv')

# Ensemble: 

In [34]:
import random 



FOLD = 20; 
for i in range(FOLD):
    X_train, X_valid, y_train, y_valid = train_test_split( train_df_norm, y, test_size=0.2, random_state=random.randint(10, 1000))

    # gbm = lgb.LGBMRegressor(**hyper_params)
    gbm.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=1000)
    
    y_test_predicted = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

    submission = pd.DataFrame({'Employee ID' : test_df['Employee ID'], 'Burn Rate': y_test_predicted})
    submission.to_csv(f'submission_auc_{i}.csv', index = False)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1685]	valid_0's l2: 0.00295326	valid_0's auc: 0.998269
[1686]	valid_0's l2: 0.00295327	valid_0's auc: 0.998269
[1687]	valid_0's l2: 0.00295328	valid_0's auc: 0.998282
[1688]	valid_0's l2: 0.00295337	valid_0's auc: 0.998275
[1689]	valid_0's l2: 0.00295341	valid_0's auc: 0.998269
[1690]	valid_0's l2: 0.00295347	valid_0's auc: 0.998269
[1691]	valid_0's l2: 0.00295347	valid_0's auc: 0.998269
[1692]	valid_0's l2: 0.00295349	valid_0's auc: 0.998262
[1693]	valid_0's l2: 0.00295346	valid_0's auc: 0.998262
[1694]	valid_0's l2: 0.00295347	valid_0's auc: 0.998262
[1695]	valid_0's l2: 0.00295348	valid_0's auc: 0.998262
[1696]	valid_0's l2: 0.00295357	valid_0's auc: 0.998262
[1697]	valid_0's l2: 0.00295353	valid_0's auc: 0.998255
[1698]	valid_0's l2: 0.00295355	valid_0's auc: 0.998255
[1699]	valid_0's l2: 0.0029536	valid_0's auc: 0.998255
[1700]	valid_0's l2: 0.00295363	valid_0's auc: 0.998255
[1701]	valid_0's l2: 0.00295368	valid_0'

In [36]:
submission = pd.read_csv('/content/submission_auc_0.csv')

for i in range(1, FOLD):
    submission['Burn Rate'] +=  pd.read_csv(f'/content/submission_auc_{i}.csv')['Burn Rate']

submission['Burn Rate'] = submission['Burn Rate']/20
submission.to_csv('submission_ens4.csv')