In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Burning Out Employees

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the training and test CSV files into two separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/are-your-employees-burning-out/train.csv",
        "/kaggle/input/are-your-employees-burning-out/test.csv",
    )
)

In [None]:
# Show the first rows of the training frame as a quick sanity check of the load.
df_train.head()

In [None]:
# Print column dtypes and non-null counts of the training frame to spot missing values.
df_train.info()

In [None]:
# Print column dtypes and non-null counts of the test frame for comparison with the training frame.
df_test.info()

Train dataframe has missing values for "Resource Allocation", "Mental Fatigue Score" and "Burn Rate".

Test dataframe is complete.

In [None]:
# Remove every training row that contains at least one missing value.
# The frame is modified in place, so df_train itself shrinks; no new frame is created.
df_train.dropna(inplace=True)

In [None]:
def Encoder(data, columns=("Gender", "Company Type", "WFH Setup Available")):
    """Label-encode categorical columns of ``data`` in place.

    Every listed column is replaced by the integer codes produced by a fresh
    ``LabelEncoder`` that is fitted on that column alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place; nothing is returned.
    columns : iterable of str, optional
        Names of the columns to encode. Defaults to the three categorical
        columns this notebook has always encoded, so ``Encoder(frame)``
        behaves exactly as before.

    Notes
    -----
    A new encoder is fitted on each call, so the codes obtained for two
    different frames only agree when both frames contain the same set of
    categories in a given column.
    """
    for column in columns:
        # One independent encoder per column: codes are not shared between columns.
        data[column] = LabelEncoder().fit_transform(data[column])

In [None]:
# Encode the categorical columns of the training frame, then of the test frame.
for frame_to_encode in (df_train, df_test):
    Encoder(frame_to_encode)

## Visualization

In [None]:
# Overlaid histograms of Burn Rate, one per encoded Gender value.
# LabelEncoder numbers its classes in sorted order, so with raw values
# "Female"/"Male" the codes are Female -> 0 and Male -> 1. The legend used to
# label code 0 as "Male" and code 1 as "Female", i.e. the two were swapped.
# NOTE(review): assumes the raw Gender values were exactly "Female" and "Male" — confirm against the CSV.
plt.figure(figsize=(10,8))

plt.hist(df_train["Burn Rate"][df_train.Gender.isin([0])],color='pink',alpha=0.5,label="Female")
plt.hist(df_train["Burn Rate"][df_train.Gender.isin([1])],color='blue',alpha=0.5,label="Male")
plt.xlabel("Burn Rate")
plt.ylabel("Count")
plt.legend()
plt.title("Burn Rate by Gender")
plt.show()

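Female employees appear more burned out than male employees: their histogram is shifted to the right in this comparison. (This reading relies on the legend matching the encoding; `LabelEncoder` assigns codes in sorted order, so double-check which code corresponds to which gender.)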

In [None]:
# Overlaid histograms of Burn Rate, one per encoded Company Type value.
fig, ax = plt.subplots(figsize=(10,8))

burn_rate = df_train["Burn Rate"]
company_code = df_train["Company Type"]
for code, shade, legend_name in ((0, 'red', "Product"), (1, 'green', "Service")):
    ax.hist(burn_rate[company_code == code], color=shade, alpha=0.5, label=legend_name)

ax.set_xlabel("Burn Rate")
ax.set_ylabel("Count")
ax.legend()
ax.set_title("Burn Rate by Company Type")
plt.show()

We have less data from Service than from product but the shape is almost the same

In [None]:
# Overlaid histograms of Burn Rate, split by the encoded "WFH Setup Available" flag.
fig, ax = plt.subplots(figsize=(10,8))

without_wfh = df_train.loc[df_train["WFH Setup Available"] == 0, "Burn Rate"]
with_wfh = df_train.loc[df_train["WFH Setup Available"] == 1, "Burn Rate"]

ax.hist(without_wfh, color='red', alpha=0.5, label="No")
ax.hist(with_wfh, color='yellow', alpha=0.5, label="Yes")
ax.set(xlabel="Burn Rate", ylabel="Count", title="Burn Rate by WFH Availability")
ax.legend()
plt.show()

Clearly, not having a WFH setup available makes employees burn out more easily: the histogram for 'No' is shifted to the right.

In [None]:
# Mean Burn Rate for each Designation level.
# groupby() returns the means sorted by group key, whereas Series.unique()
# lists values in order of first appearance. Pairing the two attached bar
# heights to the wrong x positions, so both x and height are now taken from
# the same grouped Series.
mean_burn_by_designation = df_train.groupby('Designation')['Burn Rate'].mean()

plt.figure(figsize=(10,8))

plt.bar(x=mean_burn_by_designation.index,height=mean_burn_by_designation.values)
plt.xlabel("Designation")
plt.ylabel("Mean Burn Rate")
plt.title("Mean Burn Rate by Designation type")
plt.show()

In [None]:
# Mean Burn Rate for each Resource Allocation level.
# The x positions and the bar heights must come from the same grouped Series:
# groupby() sorts by key while Series.unique() keeps order of first
# appearance, so combining them mismatched bars and labels.
mean_burn_by_allocation = df_train.groupby('Resource Allocation')['Burn Rate'].mean()

plt.figure(figsize=(10,8))

plt.bar(x=mean_burn_by_allocation.index,height=mean_burn_by_allocation.values)
plt.xlabel("Resource Allocation")
plt.ylabel("Mean Burn Rate")
plt.title("Mean Burn Rate by Resource Allocation")
plt.show()

## Correlations

In [None]:
plt.figure(figsize=(10,6))

# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them instead, so the
# selection is made explicit here (works on older pandas versions as well).
numeric_corr = df_train.select_dtypes(include="number").corr()
heatmap = sns.heatmap(numeric_corr, vmin=-1,vmax=1, annot=True, cmap='viridis')

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
plt.show()

Three predictors are highly correlated with one another:
* Designation
* Resource Allocation
* Mental Fatigue Score

These three are also highly correlated with the target: *Burn Rate*

## Train Test Split

In [None]:
# Features are picked by position: the slice [2:7] covers column indices 2..6
# (the end index is exclusive) and the target is the column at index 8.
# NOTE(review): the column at index 7 is used neither as a predictor nor as the
# target — confirm that leaving it out is intentional.
SPLIT_SEED = 42  # fixed seed so the hold-out split is identical on every run
feature_columns = df_train.columns[2:7]

Predictors = df_train[feature_columns]
Target = df_train[df_train.columns[8]]

Predictors_tr, Predictors_test, Target_tr, Target_test = train_test_split(
    Predictors, Target, test_size=0.2, random_state=SPLIT_SEED
)

# Select the competition features by the training column names, so the model
# is guaranteed to receive the same columns in the same order (a KeyError is
# raised if the test file does not provide them).
Pred_test = df_test[feature_columns]

## Support Vector Regression

In [None]:
# Exhaustive 5-fold grid search over an RBF-kernel SVR, scored with R^2.
params = dict(
    kernel=['rbf'],
    gamma=[1E-3, 1E-2, 0.1],
    C=[10, 100, 500],
)

SVR_m = GridSearchCV(
    estimator=SVR(),
    param_grid=params,
    scoring='r2',
    cv=5,
)

SVR_m.fit(Predictors_tr, Target_tr)

In [None]:
# Print mean and standard deviation of the CV score for every grid candidate,
# followed by the best parameter combination and its score.
svr_cv_results = SVR_m.cv_results_
means, stds, params = (
    svr_cv_results[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)

for candidate_summary in zip(means, stds, params):
    print("%0.3f (+/-%0.3f) para %r" % candidate_summary)

print('------------------------------------------------------------------------------------------------')
print('Best params: ',SVR_m.best_params_)
print('Best R2 score: ',SVR_m.best_score_)

In [None]:
# Score the tuned SVR on the hold-out split.
# r2_score expects (y_true, y_pred) and R^2 is not symmetric in its arguments,
# so the observed targets must come first; they were previously swapped.
Ytr_pred=SVR_m.predict(Predictors_test)
# Despite its name, `error` holds the R^2 score (higher is better).
error = r2_score(y_true=Target_test, y_pred=Ytr_pred)
print("r2_score metric gives an error of r^2= {:.2f}".format(error))

In [None]:
# Predict the competition set with the tuned SVR and start the submission
# table: one row per employee, pairing the ID with its predicted burn rate.
Y_pred = SVR_m.predict(Pred_test)
id_prediction_pairs = list(zip(df_test["Employee ID"], Y_pred))
submit = pd.DataFrame(
    id_prediction_pairs,
    columns=["Employee ID","SVR Burn Rate"],
)
submit.head()

## Decision Tree Regressor

In [None]:
# Hyper-parameter space sampled by the randomized search.
# max_features=None replaces the former "auto": for DecisionTreeRegressor,
# "auto" meant "consider all features" (the same as None), and that spelling
# was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes the
# affected candidates fail to fit.
params = {
    'splitter':['best','random'] ,
    'min_samples_split':range(2,50,5),
    'min_samples_leaf':range(1,21,2),
    'max_features':[None,"sqrt","log2"]
}

# Seed both the candidate sampling and the tree itself (random splitter /
# feature sub-sampling) so that re-running the notebook gives the same result.
SEARCH_SEED = 42

DTR =  RandomizedSearchCV(
    DecisionTreeRegressor(random_state=SEARCH_SEED),
    param_distributions=params,
    cv=5,
    scoring='r2',
    random_state=SEARCH_SEED,
)

DTR.fit(Predictors_tr,Target_tr)

In [None]:
# Print mean and standard deviation of the CV score for every sampled
# candidate, followed by the best parameter combination and its score.
tree_cv_results = DTR.cv_results_
means, stds, params = (
    tree_cv_results[field]
    for field in ('mean_test_score', 'std_test_score', 'params')
)

for candidate_summary in zip(means, stds, params):
    print("%0.3f (+/-%0.3f) para %r" % candidate_summary)

print('------------------------------------------------------------------------------------------------')
print('Best params: ',DTR.best_params_)
print('Best R2 score: ',DTR.best_score_)

In [None]:
# Score the tuned decision tree on the hold-out split.
# r2_score expects (y_true, y_pred) and R^2 is not symmetric in its arguments,
# so the observed targets must come first; they were previously swapped.
Ytr_pred=DTR.predict(Predictors_test)
# Despite its name, `error` holds the R^2 score (higher is better).
error = r2_score(y_true=Target_test, y_pred=Ytr_pred)
print("r2_score metric gives an error of r^2= {:.2f}".format(error))

In [None]:
# Predict the competition set with the tuned tree and store the result as an
# extra column next to the SVR predictions.
Y_pred = DTR.predict(Pred_test)
submit.loc[:, "Tree Burn Rate"] = Y_pred
submit.head()