In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 10)
import seaborn as sns
from scipy import stats

# Data Description

Employee ID: The unique ID allocated for each employee (example: fffe390032003000)

Date of Joining: The date-time when the employee has joined the organization (example: 2008-12-30)

Gender: The gender of the employee (Male/Female)

Company Type: The type of company where the employee is working (Service/Product)

WFH Setup Available: Is the work from home facility available for the employee (Yes/No)

Designation: The employee's designation (job level) within the organization.

    In the range [0.0, 5.0]; a larger value means a higher designation.

Resource Allocation: The amount of resources allocated to the employee for work, i.e., the number of working hours.

    In the range of [1.0, 10.0] (higher means more resource) 

Mental Fatigue Score: The level of mental fatigue the employee is facing.

    In the range [0.0, 10.0], where 0.0 means no fatigue and 10.0 means complete fatigue.

Burn Rate: The value we need to predict for each employee, describing the rate of burnout while working.

    In the range [0.0, 1.0], where a higher value means more burnout.

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train_csv = '../input/are-your-employees-burning-out/train.csv'
test_csv = '../input/are-your-employees-burning-out/test.csv'
train = pd.read_csv(filepath_or_buffer=train_csv)
test = pd.read_csv(filepath_or_buffer=test_csv)

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Count rows that exactly repeat an earlier row (all columns compared).
train.duplicated().sum()

In [None]:
# Number of distinct values in the 'Employee ID' column of the training frame.
train['Employee ID'].nunique()

In [None]:
# Show the 'Date of Joining' column in ascending order. The CSV was read
# without date parsing, so this sorts the values as stored in the column.
train['Date of Joining'].sort_values()

In [None]:
# Gender distribution of each split, training frame first.
for frame in (train, test):
    print(frame['Gender'].value_counts())

In [None]:
# Missing 'Resource Allocation' values per split, training frame first.
for frame in (train, test):
    print(frame['Resource Allocation'].isna().sum())

In [None]:
# Missing 'Mental Fatigue Score' values per split, training frame first.
for frame in (train, test):
    print(frame['Mental Fatigue Score'].isna().sum())

In [None]:
# Missing values in the target column (only the training frame is checked here).
missing_burn_rate = train['Burn Rate'].isna().sum()
print(missing_burn_rate)

## The Test Set Has No Null Values; Drop Rows With Nulls From Train
### Alternatively, the null values could be imputed with the mean, or filled with bfill/ffill

In [None]:
# Keep only the training rows that have no missing value in any column.
train1 = train.dropna(axis=0, how='any')

In [None]:
# Print dtypes and non-null counts again, now for the frame without missing rows.
train1.info()

In [None]:
# The 50 rows with the highest Burn Rate, renumbered from 0.
(
    train1
    .sort_values('Burn Rate', ascending=False)
    .head(50)
    .reset_index(drop=True)
)

In [None]:
# Derive 'month' and 'day' columns from the joining date for both splits.
#
# Each date column is parsed once, vectorized, and the parts are read through
# the `.dt` accessor instead of calling pd.to_datetime on every cell via
# `.apply` (twice per frame). `assign` returns new frames rather than writing
# columns into `train1` (the result of `dropna()`), which can raise
# SettingWithCopyWarning, and it leaves the originally loaded `test` frame
# unmodified.
# NOTE(review): assumes every date uses one consistent format (the data
# description shows e.g. 2008-12-30) — confirm.
train_joined = pd.to_datetime(train1['Date of Joining'])
train1 = train1.assign(month=train_joined.dt.month, day=train_joined.dt.day)
# TEST
test_joined = pd.to_datetime(test['Date of Joining'])
test = test.assign(month=test_joined.dt.month, day=test_joined.dt.day)

In [None]:
# The raw date string is no longer needed once month/day have been extracted.
train2 = train1.drop(columns='Date of Joining')
test2 = test.drop(columns='Date of Joining')

In [None]:
# Number of training rows per joining month, split by gender.
sns.countplot(data=train2, hue='Gender', x='month')

In [None]:
# Number of training rows per joining day-of-month, split by gender.
sns.countplot(data=train2, hue='Gender', x='day')

### Anomaly check: does any employee have a low 'Mental Fatigue Score' but a 'Burn Rate' above 0.5?
### *Data note — Burn Rate: in the range [0.0, 1.0], where a higher value means more burnout
### *Data note — Mental Fatigue Score: in the range [0.0, 10.0], where 0.0 means no fatigue and 10.0 means complete fatigue

In [None]:
# Remove only the anomalous rows: employees who report low mental fatigue yet
# have a high burn rate while carrying a high resource allocation.
#
# The previous mask negated each condition separately and AND-ed the results
# (~A & ~B & ~C). That keeps only rows with fatigue >= 4.0 AND burn rate <= 0.5
# AND allocation <= 5.0, so every row with Burn Rate > 0.5 was discarded and
# the target was truncated. Negating the conjunction drops just the rows where
# all three conditions hold together.
low_fatigue = train2['Mental Fatigue Score'] < 4.0
high_burn = train2['Burn Rate'] > 0.5
high_allocation = train2['Resource Allocation'] > 5.0
# NOTE(review): the section heading names only the fatigue and burn-rate
# conditions; if Resource Allocation is not meant to be part of the anomaly
# definition, use `~(low_fatigue & high_burn)` instead — confirm intent.
is_anomaly = low_fatigue & high_burn & high_allocation
train3 = train2.loc[~is_anomaly]

In [None]:
# (rows, columns) of the filtered training frame.
train3.shape

In [None]:
# Drop the identifier and the date-part columns from both splits.
non_feature_cols = ['Employee ID', 'month', 'day']
train4 = train3.drop(columns=non_feature_cols)
test4 = test2.drop(columns=non_feature_cols)

In [None]:
# (rows, columns) of the training frame after dropping 'Employee ID', 'day' and 'month'.
train4.shape

In [None]:
# One-hot encode the three categorical columns of the training frame
# and display the resulting indicator columns.
categorical_cols = ['Gender', 'WFH Setup Available', 'Company Type']
train4_dum = pd.get_dummies(data=train4[categorical_cols])
train4_dum

In [None]:
# Append the indicator columns to the right of the existing training columns.
train5 = pd.concat(objs=[train4, train4_dum], axis='columns')

In [None]:
# Drop each raw categorical column together with one of its indicator columns
# (Female, Product, No), so a single indicator per category remains.
redundant_cols = [
    'Gender', 'Gender_Female',
    'Company Type', 'Company Type_Product',
    'WFH Setup Available', 'WFH Setup Available_No',
]
train6 = train5.drop(columns=redundant_cols)

In [None]:
# Summary statistics of the encoded training frame (describe() defaults).
train6.describe()

In [None]:
# Histogram of the target column using 30 bins.
burn_rate = train6['Burn Rate']
plt.hist(burn_rate, bins=30)