In [None]:
!pip install catboost 

import numpy as np, os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output 

# Machine learning libraries
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

"""
To run this notebook on your own machine, change DATA_DIR to the folder
that contains the CSV files (train.csv, test.csv and sample_submission.csv).
"""

# Folder that holds the competition CSV files.
DATA_DIR = '../input/hackerearth-employee-burnout-challenge/'

# Read the three CSVs in a fixed order and unpack them into their frames.
train_df, test_df, sub = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Wipe whatever this cell has printed so far.
clear_output()

## Utility Functions

In [None]:
def get_days(d0, d1):
    """
    Number of whole days elapsed from ``d0`` to ``d1``.

    d0 : earlier date -- anything ``pd.to_datetime`` accepts (a string, a
         Timestamp, or a Series / list of dates)
    d1 : later date -- same accepted types as ``d0``
    returns : ``d1 - d0`` expressed in days. An int for scalar inputs; an
              element-wise Series (or Index) of day counts when either
              input is array-like.
    """
    delta = pd.to_datetime(d1) - pd.to_datetime(d0)
    # A timedelta Series only exposes its components through the ``.dt``
    # accessor, whereas a scalar Timedelta or a TimedeltaIndex has ``.days``
    # directly.
    if isinstance(delta, pd.Series):
        return delta.dt.days
    return delta.days

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Print the column dtypes and non-null counts of the training data.
train_df.info()

# Handling Missing Values

In [None]:
# Disabled alternative, kept for reference: fill the gaps in these two columns
# with the column median instead of discarding the affected rows.
# train_df['Resource Allocation'].fillna(train_df['Resource Allocation'].median() , inplace = True)
# train_df['Mental Fatigue Score'].fillna(train_df['Mental Fatigue Score'].median() , inplace = True)

# Drop, in place, every row that has at least one missing value.
train_df.dropna(inplace=True)
# Print the summary again so the non-null counts after dropping can be checked.
train_df.info()

## Burn Rate vs (Designation and Mental Fatigue)

In [None]:
# Plot from a copy so the plotting frame is decoupled from train_df.
# The copy already carries the 'Burn Rate' column, so it does not need to be
# assigned again.
train_df2 = train_df.copy()

# Designation vs. Mental Fatigue Score, with marker size driven by Burn Rate.
sns.relplot(data=train_df2, x="Designation", y="Mental Fatigue Score", size='Burn Rate');

## Resource Allocation

In [None]:
# Print the dtype / non-null summary of train_df before looking at Resource Allocation.
train_df.info()

In [None]:
# How many rows fall on each distinct Resource Allocation value.
resource_counts = train_df['Resource Allocation'].value_counts()

# One bar per allocation value, bar height = number of rows.
fig = sns.barplot(x=resource_counts.index, y=resource_counts.values)
fig.set_title('Resource Allocation Distributions');

# <font color='blue'>Feature Engineering </font>

In [None]:
dataset = [train_df, test_df]

# Fixed date against which the length of employment is measured.
REFERENCE_DATE = pd.to_datetime('2009-2-1')


def _encode_flag(column, positive_label):
    """
    Encode a two-valued column as 0/1 integers.

    column : Series holding the raw labels
    positive_label : the label that maps to 1; every other value maps to 0
    returns : int64 Series of 0/1. A column that is already numeric is
              returned unchanged, so re-running this cell does not wipe a
              previously encoded column to all zeros.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return (column == positive_label).astype('int64')


# Both frames are modified in place so later cells see the new columns.
for data in dataset:
    data['Date of Joining'] = pd.to_datetime(data['Date of Joining'])
    data['Gender'] = _encode_flag(data['Gender'], 'Male')
    data['Company Type'] = _encode_flag(data['Company Type'], 'Service')
    data['WFH Setup Available'] = _encode_flag(data['WFH Setup Available'], 'Yes')
    # Days between the joining date and the reference date, computed on the
    # whole column at once instead of one Python call per row.
    data['JobDuration'] = (REFERENCE_DATE - data['Date of Joining']).dt.days
    # Approximate months, using a flat 30-day month.
    data['JobDurationMonth'] = data['JobDuration'] / 30
    

In [None]:
# Columns to store as plain integers, applied identically to both frames.
int_casts = dict.fromkeys(["Designation", "Resource Allocation", "JobDurationMonth"], int)

train_df = train_df.astype(int_casts)
test_df = test_df.astype(int_casts)

# Show the test frame after the cast.
test_df


In [None]:
# Persist both processed frames to the working directory, without the index column.
for frame, out_name in ((train_df, 'train_processed.csv'), (test_df, 'test_processed.csv')):
    frame.to_csv(out_name, index=False)


## Burn Rate vs Job Duration in Months:
There does not appear to be a clear relationship between job duration in months and the employees' burn rate.

In [None]:
# Pick the target column *before* aggregating. Averaging the whole frame first
# does needless work, and in pandas >= 2.0 it raises a TypeError if the frame
# holds any column that cannot be averaged (e.g. a string column).
fig = train_df.groupby('JobDurationMonth')['Burn Rate'].mean().plot.bar()
fig.set(title='Burn Rate vs Job Duration in Months', ylabel='Mean Burn Rate');

## Fatigue Reported vs Job Duration in Months:

In [None]:
# Pick the target column *before* aggregating. Averaging the whole frame first
# does needless work, and in pandas >= 2.0 it raises a TypeError if the frame
# holds any column that cannot be averaged (e.g. a string column).
fig = train_df.groupby('JobDurationMonth')['Mental Fatigue Score'].mean().plot.bar()
fig.set(title='Mental Fatigue Score vs Job Duration in Months', ylabel='Reported Mental Fatigue Score');

# Data Staging


In [None]:
# Selected features for training
features = ['Gender',   'JobDuration', 'JobDurationMonth',  'Company Type',	
            'WFH Setup Available', 'Designation', 'Resource Allocation',	'Mental Fatigue Score']

cat_features = ['Gender', 'JobDurationMonth', 'Company Type', 'WFH Setup Available', 
                'Designation', 'Resource Allocation']


![](https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcSDSH7J3HcXkjC0ftWuIL8gn2Tj4ZHMIUV8ZQ&usqp=CAU)