In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 

# Load data

In [None]:
# Both competition CSVs sit in the same folder and are indexed by employee ID.
data_dir = '/kaggle/input/hackerearth-employee-burnout-challenge'
train = pd.read_csv(data_dir + '/train.csv', index_col='Employee ID')
final_test = pd.read_csv(data_dir + '/test.csv', index_col='Employee ID')

# Report the number of rows in each frame.
print('Length of training set: ', len(train))
print('Length of final test set: ', len(final_test))

In [None]:
# Preview the first rows of the training frame.
train.head()

# Exploration 1

In [None]:
# Show the value counts of every discrete column in a single output tuple,
# separated by a divider string; the two ordinal columns are ordered by value.
divider = '----------------------'
(
    train['Gender'].value_counts(), divider,
    train['Company Type'].value_counts(), divider,
    train['WFH Setup Available'].value_counts(), divider,
    train['Designation'].value_counts().sort_index(), divider,
    train['Resource Allocation'].value_counts().sort_index(), divider,
)

In [None]:
# Inspect the column dtypes as loaded from the CSV, before any conversion.
train.dtypes

# Convert dtypes

In [None]:
def _encode_binary(column, mapping):
    """Encode the labels of `column` as integers according to `mapping`.

    Parameters: `column` is a pandas Series of labels, `mapping` a dict from
    label to integer code.
    Returns: an int64 Series with the same index.
    Raises: KeyError for the first value that has no entry in `mapping`
    (including missing values), like the dictionary lookup this replaces.
    """
    encoded = column.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        raise KeyError(column[unmapped].iloc[0])
    return encoded.astype('int64')


# Label -> code tables for the three two-valued columns.
gender = {'Female': 1, 'Male': 0}
company_type = {'Service': 1, 'Product': 0}
wfh_setup_avail = {'No': 0, 'Yes': 1}
binary_encodings = {
    'Gender': gender,
    'Company Type': company_type,
    'WFH Setup Available': wfh_setup_avail,
}

# Apply exactly the same conversions to the training and the final test frame.
for frame in (train, final_test):
    # convert `Date of Joining` to date type
    frame['Date of Joining'] = pd.to_datetime(frame['Date of Joining'])

    # convert the two-valued columns to binary type
    for column_name, mapping in binary_encodings.items():
        frame[column_name] = _encode_binary(frame[column_name], mapping)

    # convert `Designation` and `Resource Allocation` to the nullable integer
    # type 'Int64'
    for column_name in ('Designation', 'Resource Allocation'):
        frame[column_name] = frame[column_name].astype('Int64')

train.dtypes

# Handling Missing Values

In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask).
train.isna().mean()

## Compare averages and std. deviations with and without NaN records

In [None]:
# Columns whose mean and standard deviation are compared.
index = [ 'Gender','Company Type','WFH Setup Available','Designation','Resource Allocation','Mental Fatigue Score','Burn Rate']
columns = ['original', 'dropna']
df_ = pd.DataFrame(index=index, columns=columns)

# Drop the incomplete records once, instead of twice on every loop iteration.
train_complete = train.dropna()

# Each table entry is formatted as "mean (std)" with three decimals.
for i in index:
    df_.loc[i, 'original'] = '{:.3f} ({:.3f})'.format(train[i].mean(), train[i].std())
    df_.loc[i, 'dropna'] = '{:.3f} ({:.3f})'.format(train_complete[i].mean(), train_complete[i].std())

df_

The means and standard deviations are nearly the same with and without the NaN records, so we can drop every record that contains a NaN.

In [None]:
# Keep only the training records without any missing value; from here on
# `train` refers to the reduced frame.
train = train.dropna()

# Exploration 2

In [None]:
# Pairwise correlations between the numeric predictors and the target.
numeric_columns = ['Designation', 'Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']
train.loc[:, numeric_columns].corr()

In [None]:
# One scatter panel per numeric predictor, each plotted against the burn rate.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for panel, predictor in zip(axs, ['Designation', 'Resource Allocation', 'Mental Fatigue Score']):
    panel.scatter(train[predictor], train['Burn Rate'])
    panel.set_title(predictor)
# Only the left-most panel carries the y-axis label.
axs[0].set(ylabel='Burn Rate')
plt.show()

`Gender` and `Burn Rate`:

In [None]:
# Mean burn rate for each gender code.
train.groupby('Gender')[['Burn Rate']].mean()

In [None]:
# Burn rates split by gender code (1 = female, 0 = male).
female_burn_rates = train.loc[train['Gender'] == 1, 'Burn Rate']
male_burn_rates = train.loc[train['Gender'] == 0, 'Burn Rate']

# 20 bin edges spanning slightly beyond the [0, 1] range.
bins = np.linspace(-.05, 1.05, 20)

# Overlay both distributions semi-transparently.
for rates, label in ((female_burn_rates, 'female'), (male_burn_rates, 'male')):
    plt.hist(rates, bins, label=label, alpha=0.5)
plt.legend(loc='upper right')
plt.show()

`WFH Setup Available` and `Burn Rate`:

In [None]:
# Mean burn rate with and without a work-from-home setup.
train.groupby('WFH Setup Available')[['Burn Rate']].mean()

In [None]:
# Burn rates split by work-from-home availability (1 = available, 0 = not).
wfh_burn_rates = train.loc[train['WFH Setup Available'] == 1, 'Burn Rate']
nwfh_burn_rates = train.loc[train['WFH Setup Available'] == 0, 'Burn Rate']

# 20 bin edges spanning slightly beyond the [0, 1] range.
bins = np.linspace(-.05, 1.05, 20)

# Overlay both distributions semi-transparently.
for rates, label in ((wfh_burn_rates, 'wfh avail.'), (nwfh_burn_rates, 'NOT')):
    plt.hist(rates, bins, label=label, alpha=0.5)
plt.legend(loc='upper right')
plt.show()

`Company Type` and `Burn Rate`:

In [None]:
# Mean burn rate for each company type code.
train.groupby('Company Type')[['Burn Rate']].mean()

In [None]:
# Burn rates split by company type code (1 = Service, 0 = Product).
service_burn_rates = train.loc[train['Company Type'] == 1, 'Burn Rate']
product_burn_rates = train.loc[train['Company Type'] == 0, 'Burn Rate']

# 20 bin edges spanning slightly beyond the [0, 1] range.
bins = np.linspace(-.05, 1.05, 20)

# Overlay both distributions semi-transparently.
for rates, label in ((service_burn_rates, 'Service'), (product_burn_rates, 'Product')):
    plt.hist(rates, bins, label=label, alpha=0.5)
plt.legend(loc='upper right')
plt.show()

# Feature Engineering

Create `Month of Joining`, `Quarter of Joining`, and `Season of Joining`

In [None]:
# Derive the calendar month and quarter of the joining date for both frames.
for frame in (train, final_test):
    joining = frame['Date of Joining'].dt
    frame['Month of Joining'] = joining.month
    frame['Quarter of Joining'] = joining.quarter

from datetime import date, datetime

# Inclusive (start, end) bounds of every non-winter season. Only the month and
# day of these bounds are used by get_season; the year 2008 is a placeholder.
seasons = {'Summer':(datetime(2008,6,21), datetime(2008,9,22)),
           'Autumn':(datetime(2008,9,23), datetime(2008,12,20)),
           'Spring':(datetime(2008,3,21), datetime(2008,6,20))}

def get_season(row):
    """Return the season name for the 'Date of Joining' entry of `row`.

    Parameters: `row` is any mapping-like object (e.g. a DataFrame row) whose
    'Date of Joining' value exposes `.month` and `.day`.
    Returns: 'Summer', 'Autumn' or 'Spring' when the date's month/day lies
    within the corresponding bounds of `seasons`, otherwise 'Winter'.

    The comparison uses month and day only, so the result does not depend on
    the year of the date, and a time of day on the last day of a season no
    longer pushes the date into 'Winter'.
    """
    joined = row['Date of Joining']
    month_day = (joined.month, joined.day)
    for season, (season_start, season_end) in seasons.items():
        first_day = (season_start.month, season_start.day)
        last_day = (season_end.month, season_end.day)
        if first_day <= month_day <= last_day:
            return season
    # Everything outside the three listed ranges is winter.
    return 'Winter'
    
# Label every record of both frames with the season of its joining date.
train['Season of Joining'] = train.apply(get_season, axis=1)
final_test['Season of Joining'] = final_test.apply(get_season, axis=1)

Convert these to dummies

In [None]:
# Training data: one indicator column per designation, quarter and season,
# appended to the right of the existing columns.
designation_dummies_train = pd.get_dummies(train['Designation'], prefix='Designation')
quater_dummies_train = pd.get_dummies(train['Quarter of Joining'], prefix='Quarter')
season_dummies_train = pd.get_dummies(train['Season of Joining'], prefix='Season')

train = pd.concat(
    [train, designation_dummies_train, quater_dummies_train, season_dummies_train],
    axis='columns',
)

# Final test data: the same encoding, computed from the test frame's own values.
designation_dummies_final_test = pd.get_dummies(final_test['Designation'], prefix='Designation')
quater_dummies_final_test = pd.get_dummies(final_test['Quarter of Joining'], prefix='Quarter')
season_dummies_final_test = pd.get_dummies(final_test['Season of Joining'], prefix='Season')

final_test = pd.concat(
    [final_test, designation_dummies_final_test, quater_dummies_final_test, season_dummies_final_test],
    axis='columns',
)

# Exploration 3

In [None]:
# Number of training records per joining month (12 bins).
train['Month of Joining'].hist(bins=12)

In [None]:
# Distribution of the burn rate within each joining month.
train.boxplot(column=['Burn Rate'], by='Month of Joining')

In [None]:
# Number of training records per joining quarter (4 bins).
train['Quarter of Joining'].hist(bins=4)

In [None]:
# Distribution of the burn rate within each joining quarter.
train.boxplot(column=['Burn Rate'], by='Quarter of Joining')

In [None]:
# Number of training records per joining season (4 bins).
train['Season of Joining'].hist(bins=4)

In [None]:
# Distribution of the burn rate within each joining season.
train.boxplot(column=['Burn Rate'], by='Season of Joining')

These date features do not seem to carry any signal on their own, but since we have not yet looked at their interactions with the other features, we can't be sure.

# Create X and y for modelling

In [None]:
# The target is the burn rate; the predictors are all remaining columns except
# the raw joining date and the quarter/season labels.
y = train['Burn Rate']
X = train.drop(columns=['Burn Rate', 'Date of Joining', 'Quarter of Joining', 'Season of Joining'])

In [None]:
# List the predictor columns currently available in X.
X.columns

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Base columns that are expanded into pairwise interaction terms.
base_features = ['Gender', 'Company Type', 'WFH Setup Available', 'Designation',
                 'Resource Allocation', 'Mental Fatigue Score', 'Month of Joining']

# interaction_only=True yields the constant column '1', the base columns and
# the products of distinct columns (no squared terms).
poly = PolynomialFeatures(interaction_only=True)

# Note: the resulting frame has a fresh 0..n-1 index, not the employee IDs.
X_t = pd.DataFrame(poly.fit_transform(X[base_features]))

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older releases.
if hasattr(poly, 'get_feature_names_out'):
    f_list = list(poly.get_feature_names_out(base_features))
else:
    f_list = poly.get_feature_names(base_features)

X_t.columns = f_list

X_t.head()

# Modelling

## Choose features
### Linear Regression Model

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Hold out 25% of the interaction-feature matrix for evaluation; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_t, y, test_size=0.25, random_state=42)

# Columns of X_t used by the models. Commented-out entries are candidate
# features that are currently excluded; '1' is the constant column produced
# by PolynomialFeatures.
features = ['1',
 #'Gender',
 'Company Type',
 #'WFH Setup Available',
 #'Designation',
 'Resource Allocation',
 'Mental Fatigue Score',
 #'Month of Joining',
 'Gender Company Type',
 #'Gender WFH Setup Available',
 #'Gender Designation',
 #'Gender Resource Allocation',
 #'Gender Mental Fatigue Score',
 #'Gender Month of Joining',
 #'Company Type WFH Setup Available',
 #'Company Type Designation',
 #'Company Type Resource Allocation',
 #'Company Type Mental Fatigue Score',
 #'Company Type Month of Joining',
 #'WFH Setup Available Designation',
 #'WFH Setup Available Resource Allocation',
 'WFH Setup Available Mental Fatigue Score',
 #'WFH Setup Available Month of Joining',
 #'Designation Resource Allocation',
 'Designation Mental Fatigue Score',
 #'Designation Month of Joining',
 'Resource Allocation Mental Fatigue Score',
 #'Resource Allocation Month of Joining',
 #'Mental Fatigue Score Month of Joining'
               ]

# Restrict both splits to the selected feature columns.
X_train_lrm, X_test_lrm = X_train[features], X_test[features]

# Ordinary least squares fit on the training split, evaluated on the hold-out.
lrm = linear_model.LinearRegression().fit(X_train_lrm, y_train)
y_predict_lrm = lrm.predict(X_test_lrm)

# Print one (feature, coefficient) tuple per line, then the R-squared scores.
print('Coefficients:')
for name, coefficient in zip(X_train_lrm.columns, lrm.coef_):
    print((name, coefficient))
print('---------------------------------------------------')
print('Train R-squared: ', lrm.score(X_train_lrm, y_train))
print('Test R-squared: ', lrm.score(X_test_lrm, y_test))

In [None]:
import statsmodels.api as sm
from scipy import stats

# Refit the same linear model with statsmodels to get a full regression
# summary. No constant is added here: the '1' column in `features` already
# serves as the intercept term.
lrm1b_mod = sm.OLS(y_train.to_numpy(), X_train_lrm)
lrm1b_res = lrm1b_mod.fit()
print(lrm1b_res.summary(xname=features))

## Find Model Parameters
### Multi-Layer Perceptron Model

In [None]:
# Drop the constant column '1' for the models below. Filtering (rather than
# list.remove) keeps this cell safe to re-run: a second run is a no-op instead
# of raising ValueError.
features = [name for name in features if name != '1']

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler #MinMaxScaler

scaler = StandardScaler() #MinMaxScaler()

# Standardise the selected features before fitting the network.
X_mlpm = X_t[features]
X_mlpm = pd.DataFrame(scaler.fit_transform(X_mlpm), columns=features)

# Hyper-parameter grid: L2 penalty strengths and hidden-layer layouts.
alphas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
hlsizes= [[50,100,100,50],[50,100,100,100,50],[50,100,100,100,100,50]]

mlpm = MLPRegressor(random_state=42, max_iter=500,
                    activation='relu',
                    solver='adam')

# Exhaustive 4-fold cross-validated search over the grid.
mlpm_clf = GridSearchCV(mlpm, dict(alpha=alphas,hidden_layer_sizes=hlsizes), cv=4)
mlpm_clf.fit(X_mlpm,y)

# GridSearchCV enumerates the grid with the parameter names sorted and the
# last name varying fastest, so the flat score vector is laid out as
# rows = alpha, columns = hidden_layer_sizes.
mlpm_scores = mlpm_clf.cv_results_['mean_test_score']
mlpm_scores = np.array(mlpm_scores).reshape(len(alphas),len(hlsizes))

# Best mean cross-validation score and the parameters that achieved it.
(np.amax(mlpm_scores), mlpm_clf.best_params_)

### Random Forest Decision Tree Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler #MinMaxScaler

scaler = StandardScaler() #MinMaxScaler()

# Standardise the selected features (kept for parity with the other models).
X_rfdtm = X_t[features]
X_rfdtm = pd.DataFrame(scaler.fit_transform(X_rfdtm), columns=features)

# Hyper-parameter grid: tree depth limits and minimum leaf sizes.
max_depths = list(range(6,13))
min_samples_leaves = [1,10,50,100]

# Exhaustive 4-fold cross-validated search over the grid.
rfdtm = RandomForestRegressor(random_state=42)
rfdtm_reg = GridSearchCV(rfdtm, dict(max_depth=max_depths,min_samples_leaf=min_samples_leaves), cv=4)
rfdtm_reg.fit(X_rfdtm,y)

# GridSearchCV enumerates the grid with the parameter names sorted and the
# last name varying fastest, so the flat score vector is laid out as
# rows = max_depth, columns = min_samples_leaf.
rfdtm_scores = rfdtm_reg.cv_results_['mean_test_score']
rfdtm_scores = np.array(rfdtm_scores).reshape(len(max_depths),len(min_samples_leaves))

# Best mean cross-validation score and the parameters that achieved it.
(np.amax(rfdtm_scores), rfdtm_reg.best_params_)

### Nearest Neighbours Model

In [None]:
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler #MinMaxScaler

scaler = StandardScaler() #MinMaxScaler()

# The nearest-neighbour search runs on the unscaled features.
X_knnm = X_t[features]

# Hyper-parameter grid: neighbourhood sizes 15..45 and the search algorithms.
ks = list(range(15, 46))
algorithms = ['auto', 'ball_tree', 'kd_tree', 'brute']
knn_param_grid = {'algorithm': algorithms, 'n_neighbors': ks}

# Exhaustive 4-fold cross-validated search with uniform neighbour weights.
knnm = neighbors.KNeighborsRegressor(weights='uniform')
knnm_clf = GridSearchCV(knnm, knn_param_grid, cv=4).fit(X_knnm, y)

# Score matrix with one row per algorithm and one column per k.
knnm_scores = np.asarray(knnm_clf.cv_results_['mean_test_score'])
knnm_scores = knnm_scores.reshape(len(algorithms), len(ks))

# Best mean cross-validation score and the parameters that achieved it.
(knnm_scores.max(), knnm_clf.best_params_)

# Produce Final Predictions

## Prepare Final Test Data

In [None]:
# Expand the final test data with the transformer that was already fitted on
# the training data (transform only - the test data must not re-fit it).
final_feature_columns = ['Gender', 'Company Type', 'WFH Setup Available', 'Designation',
                         'Resource Allocation', 'Mental Fatigue Score', 'Month of Joining']
X_final_t = pd.DataFrame(poly.transform(final_test[final_feature_columns]))

# Reuse the column names generated for the training matrix.
X_final_t.columns = f_list

X_final_t.head()

## Create Submissions
### Create a function that computes the score-weighted average of predictions

In [None]:
def get_score_weighted_average(row, scores, cols):
    """Return the average of `row[col]` over `cols`, weighted by `scores`.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) indexed by the names
            in `cols`.
        scores: sequence of numeric weights; `scores[i]` belongs to `cols[i]`.
        cols: iterable of keys of `row` to average.
    Returns:
        sum(scores[i] / sum(scores) * row[cols[i]]), or 0 when `cols` is empty.

    The weights are the scores normalised by their total, so they are only
    meaningful when the scores are positive and do not sum to zero.
    """
    # The normaliser is the same for every column, so compute it only once.
    total = sum(scores)
    val = 0
    for i, col in enumerate(cols):
        w = scores[i] / total
        val += w * row[col]
    return val

### MLPM

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler #MinMaxScaler

scaler = StandardScaler() #MinMaxScaler()

# Fit the scaler on the training features only ...
X_mlpm = X_t[features]
X_mlpm = pd.DataFrame(scaler.fit_transform(X_mlpm), columns=features)

# ... and apply that same fitted scaling to the final test features.
# Re-fitting on the test data would standardise it with different means and
# variances than the ones the model was trained on.
X_final_t_mlpm = X_final_t[features]
X_final_t_mlpm = pd.DataFrame(scaler.transform(X_final_t_mlpm), columns=features)

n_splits = 4
kf = KFold(n_splits=n_splits)
cols = ['Fold {}'.format(i+1) for i in list(range(n_splits))]

# One column of final-test predictions and one validation score per fold.
y_predicts_mlpm = pd.DataFrame(columns=cols)
scores_mlpm = []

for col_name, (trn, tst) in zip(cols, kf.split(X_mlpm)):
    X_train_mlpm, X_test_mlpm = X_mlpm.iloc[trn], X_mlpm.iloc[tst]
    y_train_mlpm, y_test_mlpm = y.iloc[trn], y.iloc[tst]

    # Train a fresh network on this fold's training part.
    mlpm = MLPRegressor(random_state=42, max_iter=500,
                        hidden_layer_sizes=[50, 100, 100, 100, 50],
                        activation='relu',
                        alpha=0.0001,
                        solver='adam').fit(X_train_mlpm, y_train_mlpm)

    # R-squared on the held-out part; used later as the fold's weight.
    scores_mlpm.append(mlpm.score(X_test_mlpm, y_test_mlpm))

    y_predict_final_mlpm = mlpm.predict(X_final_t_mlpm)
    y_predicts_mlpm[col_name] = y_predict_final_mlpm

# Mean validation R-squared across the folds.
np.mean(scores_mlpm)

In [None]:
# Combine the per-fold predictions into one value per employee, weighting
# each fold by its validation score.
cols = ['Fold {}'.format(fold) for fold in range(1, len(scores_mlpm) + 1)]
y_predicts_mlpm['Burn Rate'] = y_predicts_mlpm.apply(
    lambda fold_row: get_score_weighted_average(fold_row, scores_mlpm, cols),
    axis=1,
)
# Label the rows with the employee IDs of the final test set.
y_predicts_mlpm.index = final_test.index
y_predicts_mlpm

Plot the training-set predictions against the true values (each curve sorted independently)

In [None]:
# `mlpm` is the model left over from the last fold; predict the whole
# training matrix with it.
y_predict_train_mlpm = mlpm.predict(X_mlpm)

# Plot the sorted predictions and the sorted true values as two curves.
for values in (y_predict_train_mlpm, y):
    plt.plot(sorted(values))
plt.legend(['predicted', 'true'])
plt.show()

Create submission

In [None]:
# Write the weighted MLP predictions, indexed by employee ID, to CSV
# (the index is written by default).
submission_mlpm = y_predicts_mlpm.loc[:, 'Burn Rate']
submission_mlpm.to_csv('submission_mlpm.csv')

### RFDT

In [None]:
# Re-fit the existing `scaler` object on the training features only ...
X_rfdtm = X_t[features]
X_rfdtm = pd.DataFrame(scaler.fit_transform(X_rfdtm), columns=features)

# ... and apply that same fitted scaling to the final test features.
# Re-fitting on the test data would standardise it with different means and
# variances than the ones the model was trained on.
X_final_t_rfdtm = X_final_t[features]
X_final_t_rfdtm = pd.DataFrame(scaler.transform(X_final_t_rfdtm), columns=features)

n_splits = 4
kf = KFold(n_splits=n_splits)
cols = ['Fold {}'.format(i+1) for i in list(range(n_splits))]

# One column of final-test predictions and one validation score per fold.
y_predicts_rfdtm = pd.DataFrame(columns=cols)
scores_rfdtm = []

for col_name, (trn, tst) in zip(cols, kf.split(X_rfdtm)):
    X_train_rfdtm, X_test_rfdtm = X_rfdtm.iloc[trn], X_rfdtm.iloc[tst]
    y_train_rfdtm, y_test_rfdtm = y.iloc[trn], y.iloc[tst]

    # Train a fresh forest on this fold's training part.
    rfdtm = RandomForestRegressor(max_depth=8, random_state=42, min_samples_leaf=1)
    rfdtm = rfdtm.fit(X_train_rfdtm, y_train_rfdtm)

    # R-squared on the held-out part; used later as the fold's weight.
    scores_rfdtm.append(rfdtm.score(X_test_rfdtm, y_test_rfdtm))

    y_predict_final_rfdtm = rfdtm.predict(X_final_t_rfdtm)
    y_predicts_rfdtm[col_name] = y_predict_final_rfdtm

# Mean validation R-squared across the folds.
np.mean(scores_rfdtm)

In [None]:
# Combine the per-fold predictions into one value per employee, weighting
# each fold by its validation score.
cols = ['Fold {}'.format(fold) for fold in range(1, len(scores_rfdtm) + 1)]
y_predicts_rfdtm['Burn Rate'] = y_predicts_rfdtm.apply(
    lambda fold_row: get_score_weighted_average(fold_row, scores_rfdtm, cols),
    axis=1,
)
# Label the rows with the employee IDs of the final test set.
y_predicts_rfdtm.index = final_test.index
y_predicts_rfdtm

Plot the training-set predictions against the true values (each curve sorted independently)

In [None]:
# `rfdtm` is the model left over from the last fold; predict the whole
# training matrix with it.
y_predict_train_rfdtm = rfdtm.predict(X_rfdtm)

# Plot the sorted predictions and the sorted true values as two curves.
for values in (y_predict_train_rfdtm, y):
    plt.plot(sorted(values))
plt.legend(['predicted', 'true'])
plt.show()

Create submission

In [None]:
# Write the weighted random-forest predictions, indexed by employee ID, to CSV
# (the index is written by default).
submission_rfdtm = y_predicts_rfdtm.loc[:, 'Burn Rate']
submission_rfdtm.to_csv('submission_rfdtm.csv')

### KNN

In [None]:
# Nearest neighbours uses the selected features unscaled, for both the
# training matrix and the final test matrix.
X_knnm = X_t[features]
X_final_t_knnm = X_final_t[features]

n_splits = 4
kf = KFold(n_splits=n_splits)
cols = ['Fold {}'.format(fold) for fold in range(1, n_splits + 1)]

# One column of final-test predictions and one validation score per fold.
y_predicts_knnm = pd.DataFrame(columns=cols)
scores_knnm = []

for col_name, (trn, tst) in zip(cols, kf.split(X_knnm)):
    X_train_knnm, y_train_knnm = X_knnm.iloc[trn], y.iloc[trn]
    X_test_knnm, y_test_knnm = X_knnm.iloc[tst], y.iloc[tst]

    # Train a fresh 32-neighbour regressor on this fold's training part.
    knnm = neighbors.KNeighborsRegressor(n_neighbors=32, weights='uniform', algorithm='ball_tree')
    knnm = knnm.fit(X_train_knnm, y_train_knnm)

    # R-squared on the held-out part; used later as the fold's weight.
    scores_knnm.append(knnm.score(X_test_knnm, y_test_knnm))

    y_predict_final_knnm = knnm.predict(X_final_t_knnm)
    y_predicts_knnm[col_name] = y_predict_final_knnm

# Mean validation R-squared across the folds.
np.mean(scores_knnm)

In [None]:
# Combine the per-fold predictions into one value per employee, weighting
# each fold by its validation score.
cols = ['Fold {}'.format(fold) for fold in range(1, len(scores_knnm) + 1)]
y_predicts_knnm['Burn Rate'] = y_predicts_knnm.apply(
    lambda fold_row: get_score_weighted_average(fold_row, scores_knnm, cols),
    axis=1,
)
# Label the rows with the employee IDs of the final test set.
y_predicts_knnm.index = final_test.index
y_predicts_knnm

Plot the training-set predictions against the true values (each curve sorted independently)

In [None]:
# `knnm` is the model left over from the last fold; predict the whole
# training matrix with it.
y_predict_train_knnm = knnm.predict(X_knnm)

# Plot the sorted predictions and the sorted true values as two curves.
for values in (y_predict_train_knnm, y):
    plt.plot(sorted(values))
plt.legend(['predicted', 'true'])
plt.show()

Create submission

In [None]:
# Write the weighted KNN predictions, indexed by employee ID, to CSV
# (the index is written by default).
submission_knnm = y_predicts_knnm.loc[:, 'Burn Rate']
submission_knnm.to_csv('submission_knnm.csv')

### Ensemble of Previous Three

In [None]:
def _fold_predictions(predictions, prefix):
    """Return the per-fold columns of `predictions`, renamed 'Fold k' -> '<prefix>k'.

    The combined 'Burn Rate' column is dropped. Renaming by pattern instead
    of a fixed four-entry map keeps this correct for any number of folds.
    """
    per_fold = predictions.drop(['Burn Rate'], axis=1)
    return per_fold.rename(columns=lambda name: name.replace('Fold ', prefix))


A = _fold_predictions(y_predicts_knnm, 'knn')
B = _fold_predictions(y_predicts_mlpm, 'mlp')
C = _fold_predictions(y_predicts_rfdtm, 'rfdt')

# All per-fold predictions side by side, joined once on the employee index.
ABC = A.join(B).join(C)
abc = ABC

# Validation scores in the same order as the columns of ABC (knn, mlp, rfdt).
slist = scores_knnm + scores_mlpm + scores_rfdtm

# The ensemble prediction is the score-weighted average over every fold of
# every model.
y_predicts_abc = pd.DataFrame()
y_predicts_abc['Burn Rate'] = abc.apply(lambda x: get_score_weighted_average(x, slist, ABC.columns), axis=1)
y_predicts_abc

Plot the training-set predictions against the true values (each curve sorted independently)

In [None]:
# Training-set predictions of the three last-fold models, one column each.
y_ = pd.DataFrame({
    'mlpm': y_predict_train_mlpm,
    'rfdtm': y_predict_train_rfdtm,
    'knnm': y_predict_train_knnm,
})

# Weight every model by its mean validation score across the folds.
slist2 = [np.mean(scores_mlpm), np.mean(scores_rfdtm), np.mean(scores_knnm)]
model_columns = ['mlpm', 'rfdtm', 'knnm']
y_['Burn Rate'] = y_.apply(
    lambda model_row: get_score_weighted_average(model_row, slist2, model_columns),
    axis=1,
)

# Plot the sorted ensemble predictions and the sorted true values as two curves.
for values in (y_['Burn Rate'], y):
    plt.plot(sorted(values))
plt.legend(['predicted', 'true'])
plt.show()

Create submission

In [None]:
# Write the ensemble predictions to CSV. The source frame must be
# y_predicts_abc; taking y_predicts_knnm here would silently submit the
# KNN-only predictions under the ensemble's file name.
submission_abc = y_predicts_abc['Burn Rate']
submission_abc.to_csv('submission_abc.csv', index=True)