# Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Data 

This notebook uses the "Are Your Employees Burning Out?" dataset from Kaggle. You can find the dataset [here](https://www.kaggle.com/datasets/blurredmachine/are-your-employees-burning-out?select=sample_submission.csv).

## Context
The goal is to understand what the Burn Rate will be for employees working in an organization during the current pandemic, in which working from home is both a boon and a bane. How is an employee's Burn Rate affected by the various conditions provided?

In [13]:
# Absolute, machine-specific location of the training CSV; adjust before running elsewhere.
train_csv_path = r"C:\Users\asafa\OneDrive\Masaüstü\patika\employee burnout\train.csv"
emp_data = pd.read_csv(train_csv_path)

In [14]:
emp_data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [15]:
def copy_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to duplicate; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.copy()``, so changes made to the returned frame
        do not show up in ``df``.
    """
    return df.copy()

In [16]:
# Work on a copy so that `emp_data` keeps the data exactly as it was loaded
data = copy_df(emp_data)

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  float64
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(4), object(5)
memory usage: 1.6+ MB


In [18]:
# Parse the joining date and expand it into numeric year / month / day columns,
# then discard the raw date column.
# The vectorised `.dt` accessor replaces the per-row `apply(lambda ...)` calls.
join_date = pd.to_datetime(data['Date of Joining'])
data = data.assign(**{
    'Join year': join_date.dt.year,
    'Join Month': join_date.dt.month,
    'Join Day': join_date.dt.day,
}).drop(columns='Date of Joining')

In [19]:
data

Unnamed: 0,Employee ID,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Join year,Join Month,Join Day
0,fffe32003000360033003200,Female,Service,No,2.0,3.0,3.8,0.16,2008,9,30
1,fffe3700360033003500,Male,Service,Yes,1.0,2.0,5.0,0.36,2008,11,30
2,fffe31003300320037003900,Female,Product,Yes,2.0,,5.8,0.49,2008,3,10
3,fffe32003400380032003900,Male,Service,Yes,1.0,1.0,2.6,0.20,2008,11,3
4,fffe31003900340031003600,Female,Service,No,3.0,7.0,6.9,0.52,2008,7,24
...,...,...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,Female,Service,No,1.0,3.0,,0.41,2008,12,30
22746,fffe33003000350031003800,Female,Product,Yes,3.0,6.0,6.7,0.59,2008,1,19
22747,fffe390032003000,Male,Service,Yes,3.0,7.0,,0.72,2008,11,5
22748,fffe33003300320036003900,Female,Service,No,2.0,5.0,5.9,0.52,2008,1,10


In [20]:
# Remove the 'Employee ID' column; it is not used as a model feature
data = data.drop(columns=['Employee ID'])

In [21]:
data

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Join year,Join Month,Join Day
0,Female,Service,No,2.0,3.0,3.8,0.16,2008,9,30
1,Male,Service,Yes,1.0,2.0,5.0,0.36,2008,11,30
2,Female,Product,Yes,2.0,,5.8,0.49,2008,3,10
3,Male,Service,Yes,1.0,1.0,2.6,0.20,2008,11,3
4,Female,Service,No,3.0,7.0,6.9,0.52,2008,7,24
...,...,...,...,...,...,...,...,...,...,...
22745,Female,Service,No,1.0,3.0,,0.41,2008,12,30
22746,Female,Product,Yes,3.0,6.0,6.7,0.59,2008,1,19
22747,Male,Service,Yes,3.0,7.0,,0.72,2008,11,5
22748,Female,Service,No,2.0,5.0,5.9,0.52,2008,1,10


In [22]:
# Number of distinct values in every column, as a {column: count} dict.
# dropna=False keeps NaN counted as a value, as `len(Series.unique())` does.
data.nunique(dropna=False).to_dict()

{'Gender': 2,
 'Company Type': 2,
 'WFH Setup Available': 2,
 'Designation': 6,
 'Resource Allocation': 11,
 'Mental Fatigue Score': 102,
 'Burn Rate': 102,
 'Join year': 1,
 'Join Month': 12,
 'Join Day': 31}

In [23]:
# 'Join year' has only one unique value, so the column is removed
data = data.drop(columns='Join year')

In [24]:
data

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Join Month,Join Day
0,Female,Service,No,2.0,3.0,3.8,0.16,9,30
1,Male,Service,Yes,1.0,2.0,5.0,0.36,11,30
2,Female,Product,Yes,2.0,,5.8,0.49,3,10
3,Male,Service,Yes,1.0,1.0,2.6,0.20,11,3
4,Female,Service,No,3.0,7.0,6.9,0.52,7,24
...,...,...,...,...,...,...,...,...,...
22745,Female,Service,No,1.0,3.0,,0.41,12,30
22746,Female,Product,Yes,3.0,6.0,6.7,0.59,1,19
22747,Male,Service,Yes,3.0,7.0,,0.72,11,5
22748,Female,Service,No,2.0,5.0,5.9,0.52,1,10


# Missing Values

In [58]:
emp_data.isna().sum()  # Checking for missing values

Employee ID                0
Date of Joining            0
Gender                     0
Company Type               0
WFH Setup Available        0
Designation                0
Resource Allocation     1381
Mental Fatigue Score    2117
Burn Rate               1124
dtype: int64

In [29]:
# Drop the rows whose target ('Burn Rate') is missing and renumber the index.
# The previous version built its mask from `df`, a name that no visible cell of this
# notebook defines, so the cell raised NameError on a fresh kernel; `dropna` works
# on `data` directly.
data = data.dropna(subset=['Burn Rate']).reset_index(drop=True)

# Fill the remaining gaps in the two incomplete feature columns with the column mean.
# NOTE: the means are computed over all remaining rows, i.e. before the train/test split.
feature_means = data[['Resource Allocation', 'Mental Fatigue Score']].mean()
data = data.fillna(feature_means)

In [30]:
data

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Join Month,Join Day
0,Female,Service,No,2.0,3.000000,3.800000,0.16,9,30
1,Male,Service,Yes,1.0,2.000000,5.000000,0.36,11,30
2,Female,Product,Yes,2.0,4.482047,5.800000,0.49,3,10
3,Male,Service,Yes,1.0,1.000000,2.600000,0.20,11,3
4,Female,Service,No,3.0,7.000000,6.900000,0.52,7,24
...,...,...,...,...,...,...,...,...,...
20544,Female,Service,No,1.0,3.000000,5.730181,0.41,12,30
20545,Female,Product,Yes,3.0,6.000000,6.700000,0.59,1,19
20546,Male,Service,Yes,3.0,7.000000,5.730181,0.72,11,5
20547,Female,Service,No,2.0,5.000000,5.900000,0.52,1,10


In [31]:
data.isna().sum()

Gender                  0
Company Type            0
WFH Setup Available     0
Designation             0
Resource Allocation     0
Mental Fatigue Score    0
Burn Rate               0
Join Month              0
Join Day                0
dtype: int64

# Categoricals

In [32]:
# Encode the three two-valued text columns as 0/1 using one lookup table per column
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Company Type': {'Service': 0, 'Product': 1},
    'WFH Setup Available': {'No': 0, 'Yes': 1},
}
for column, encoding in binary_encodings.items():
    data[column] = data[column].replace(encoding)

In [33]:
data

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Join Month,Join Day
0,1,0,0,2.0,3.000000,3.800000,0.16,9,30
1,0,0,1,1.0,2.000000,5.000000,0.36,11,30
2,1,1,1,2.0,4.482047,5.800000,0.49,3,10
3,0,0,1,1.0,1.000000,2.600000,0.20,11,3
4,1,0,0,3.0,7.000000,6.900000,0.52,7,24
...,...,...,...,...,...,...,...,...,...
20544,1,0,0,1.0,3.000000,5.730181,0.41,12,30
20545,1,1,1,3.0,6.000000,6.700000,0.59,1,19
20546,0,0,1,3.0,7.000000,5.730181,0.72,11,5
20547,1,0,0,2.0,5.000000,5.900000,0.52,1,10


# Split and Standardization

In [46]:
# Separate the target column from the predictors
x = data.drop(columns=['Burn Rate'])
y = data['Burn Rate']

# Keep 80% of the rows for training; the fixed seed makes the shuffle repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=2
)

# Fit the scaler on the training rows only, then transform both parts,
# preserving each part's row index and column labels
scaler = StandardScaler().fit(x_train)
x_train, x_test = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (x_train, x_test)
)


`StandardScaler()` standardizes features by removing the mean and scaling to unit variance.

The standard score of a sample x is calculated as:

z = (x - u) / s

where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation 
of the training samples or one if with_std=False.
[More information about StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

In [47]:
x_train

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Join Month,Join Day
17149,-1.061567,1.375741,0.922174,-1.036240,-0.246984,0.143535,0.435950,0.598807
17657,-1.061567,-0.726881,0.922174,-0.158296,-0.749992,0.636663,-1.592765,-0.655251
14071,0.942004,-0.726881,-1.084394,-1.036240,-1.756007,-0.459178,1.305399,-0.427241
14386,0.942004,-0.726881,0.922174,0.719648,0.759032,-1.007098,0.435950,1.054828
15549,-1.061567,-0.726881,-1.084394,-0.158296,0.759032,1.622921,0.146133,0.256791
...,...,...,...,...,...,...,...,...
1099,0.942004,-0.726881,0.922174,2.475537,2.771062,2.335217,0.725766,1.396844
18898,0.942004,-0.726881,0.922174,-1.036240,-1.252999,-0.568762,-1.013132,0.712812
11798,-1.061567,1.375741,-1.084394,-0.158296,-0.246984,0.143535,1.595215,-1.453288
6637,0.942004,-0.726881,0.922174,0.719648,0.759032,0.088743,0.146133,1.510849


In [48]:
x_test

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Join Month,Join Day
11805,0.942004,-0.726881,0.922174,-0.158296,-1.252999,-0.240010,-1.592765,1.168833
3424,-1.061567,-0.726881,0.922174,-0.158296,-0.246984,-0.294802,-1.592765,-1.567294
4391,0.942004,-0.726881,-1.084394,-0.158296,0.256024,0.855832,1.595215,1.738860
18329,0.942004,1.375741,0.922174,0.719648,-0.246984,-0.787930,-0.433499,0.712812
12202,-1.061567,-0.726881,-1.084394,0.719648,0.759032,-0.075633,0.146133,0.256791
...,...,...,...,...,...,...,...,...
12867,-1.061567,1.375741,0.922174,-1.036240,-0.246984,-0.294802,-0.723316,-1.567294
18548,-1.061567,1.375741,0.922174,1.597592,0.759032,-0.004304,1.305399,0.598807
8023,0.942004,-0.726881,0.922174,-0.158296,-0.246984,-0.349594,0.435950,-1.111272
18885,0.942004,-0.726881,-1.084394,0.719648,1.262039,0.910624,-0.723316,-0.655251


# Models and Results

In [54]:
# Candidate regressors, keyed by the label that is printed next to each score.
# The padding inside the labels is kept unchanged because it is part of the printed text.
# The random forest is seeded so that its fit, and therefore its score, is repeatable
# from run to run; the seed matches the one used for the train/test split.
models = {
    "         Linear Regression": LinearRegression(),
    "         Random Forest    ": RandomForestRegressor(random_state=2),
}

# Fit every candidate on the scaled training data
for model in models.values():
    model.fit(x_train, y_train)

In [57]:
# Report the score of each fitted model on the held-out test rows
for label, estimator in models.items():
    test_r2 = estimator.score(x_test, y_test)
    print(label + " R^2 Score: {:.5f}".format(test_r2))

         Linear Regression R^2 Score: 0.87357
         Random Forest     R^2 Score: 0.90182
