### **1.1 Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

# Notebook-wide pandas settings, applied in one call (option/value pairs):
# remove the column, column-width and row display limits, and switch off
# the chained-assignment warning.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
    'display.max_rows', None,
    'mode.chained_assignment', None,
)

### **1.2 Import employee_data, employee_details_data and department_data and validate**

In [5]:
# Folder containing the raw CSV files (absolute, machine-specific location).
path = 'C:\\Users\\sande\\Documents\\INSAID Project\\CapstoneProject\\'

# Read each table and use its identifier column as the index.
employee = pd.read_csv(f'{path}employee_data.csv', index_col='employee_id')
department = pd.read_csv(f'{path}department_data.csv', index_col='dept_id')
employee_details = pd.read_csv(f'{path}employee_details_data.csv', index_col='employee_id')

**Check shape of tables**

In [6]:
# Report the (rows, columns) shape of each loaded table.
print(f'department_data: {department.shape}')
print(f'employee_data: {employee.shape}')
print(f'employee_details_data: {employee_details.shape}')

department_data: (11, 2)
employee_data: (14150, 10)
employee_details_data: (14245, 3)


### **1.3 Merge employee_data, employee_details_data and department_data in single file**

#### 1.3.1 Check data quality for index cols in department

In [22]:
# Frequency of each department code, used to spot malformed codes before joining.
employee['department'].value_counts()

D00-SS     3905
D00-ENG    2575
D00-SP     2113
D00-IT     1157
D00-PD      855
D00-MT      815
D00-FN      725
D00-MN      593
-IT         207
D00-AD      175
D00-PR      173
D00-TP      150
Name: department, dtype: int64

**Observation**
- We can see the '-IT' is incorrect and needs to be replaced with 'D00-IT'

In [23]:
# Normalise the malformed '-IT' department code to 'D00-IT'.
# The result is assigned back to the column rather than calling an in-place
# method on the `employee['department']` selection: with pandas Copy-on-Write
# the in-place form operates on a temporary object and leaves `employee`
# unchanged, and the chained-assignment warning is disabled in this notebook.
employee['department'] = employee['department'].replace(to_replace='-IT', value='D00-IT')

**After correcting employee['department'] values**

In [24]:
# Re-check the department code frequencies after the replacement above.
employee['department'].value_counts()

D00-SS     3905
D00-ENG    2575
D00-SP     2113
D00-IT     1364
D00-PD      855
D00-MT      815
D00-FN      725
D00-MN      593
D00-AD      175
D00-PR      173
D00-TP      150
Name: department, dtype: int64

- **Merge employee_data and employee_details_data, using employee_id as the index**

In [25]:
# Join the details table onto the employee table via their shared index
# (both frames were loaded with employee_id as index), then turn the index
# back into a regular column.
data = employee.join(employee_details).reset_index()

- **Rename department to dept_id to sync with col name in department.**
- **Merge with Department table**

In [26]:
# Rename `department` to `dept_id`, index by it, attach the department
# table through that index, and restore a flat column layout.
data = (
    data
    .rename(columns={'department': 'dept_id'})
    .set_index('dept_id')
    .join(department)
    .reset_index()
)

In [27]:
# (rows, columns) of the merged frame.
data.shape

(14150, 16)

In [28]:
# Preview the first rows of the merged frame.
data.head()

Unnamed: 0,dept_id,employee_id,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure,age,gender,marital_status,dept_name,dept_head
0,D00-AD,100360,274.0,,0.810377,4,,high,0.872978,Employed,3.0,41.0,Female,Married,Admin,Evelyn Tolson
1,D00-AD,100852,260.0,,0.955429,4,,low,0.615781,Employed,4.0,46.0,Male,Married,Admin,Evelyn Tolson
2,D00-AD,100876,254.0,1.0,0.971611,7,,medium,0.15528,Left,4.0,45.0,Male,Married,Admin,Evelyn Tolson
3,D00-AD,100909,151.0,,0.974578,5,,medium,0.873819,Employed,3.0,46.0,Female,Married,Admin,Evelyn Tolson
4,D00-AD,101075,236.0,,0.451697,3,,medium,0.370081,Employed,2.0,24.0,Male,Unmarried,Admin,Evelyn Tolson


#### 1.3.2 Create copy of data

In [29]:
# Import retained as-is; the copy below does not need it.
from copy import deepcopy
# Independent deep copy of the merged frame, kept under a separate name.
data_backup = data.copy(deep=True)

In [30]:
# Preview the first rows of the backup copy.
data_backup.head()

Unnamed: 0,dept_id,employee_id,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure,age,gender,marital_status,dept_name,dept_head
0,D00-AD,100360,274.0,,0.810377,4,,high,0.872978,Employed,3.0,41.0,Female,Married,Admin,Evelyn Tolson
1,D00-AD,100852,260.0,,0.955429,4,,low,0.615781,Employed,4.0,46.0,Male,Married,Admin,Evelyn Tolson
2,D00-AD,100876,254.0,1.0,0.971611,7,,medium,0.15528,Left,4.0,45.0,Male,Married,Admin,Evelyn Tolson
3,D00-AD,100909,151.0,,0.974578,5,,medium,0.873819,Employed,3.0,46.0,Female,Married,Admin,Evelyn Tolson
4,D00-AD,101075,236.0,,0.451697,3,,medium,0.370081,Employed,2.0,24.0,Male,Unmarried,Admin,Evelyn Tolson


In [37]:
# Output folder (same absolute location the raw files were read from).
path = 'C:\\Users\\sande\\Documents\\INSAID Project\\CapstoneProject\\'
# Write the merged frame to data.csv without the row index.
data.to_csv(f'{path}data.csv', index=False)

### **1.4 Identification & Handling of Missing Data**

In [None]:
# Report whether `employee` has fully duplicated rows, and how many.
print('Contains Redundant Records?:', employee.duplicated().any())
print('Duplicate Count:', employee.duplicated().sum())

In [None]:
# Fill missing values in the two indicator columns with 0.
employee[['filed_complaint', 'recently_promoted']] = (
    employee[['filed_complaint', 'recently_promoted']].fillna(0)
)

#### **1.2.1 Identification & Handling of Redundant Rows**

**Before Handling Duplicate Rows**

**Observation:**

- We can see that **there are redundant records present** in the data.

**Performing Operations**

In [None]:
# Remove fully duplicated rows from `employee` and report how many were dropped.
before_shape = employee.shape
print('Data Shape [Before]:', before_shape)

# Reassign instead of mutating in place so the step is an explicit transformation.
employee = employee.drop_duplicates()

after_shape = employee.shape
print('Data Shape [After]:', after_shape)

drop_nums = before_shape[0] - after_shape[0]
# Scale to a percentage first and round last: rounding the ratio before
# multiplying by 100 can only resolve whole percents and may print floating
# point artefacts. An empty frame reports 0.0 instead of dividing by zero.
if before_shape[0]:
    drop_percent = np.round(drop_nums / before_shape[0] * 100, decimals = 2)
else:
    drop_percent = 0.0

print('Drop Ratio:', drop_percent, '%')

**After Handling Duplicate Rows**

In [None]:
# Re-run the duplicate check to confirm the result of the de-duplication step.
print('Contains Redundant Records?:', employee.duplicated().any())
print('Duplicate Count:', employee.duplicated().sum())

#### Replace Missing Values

In [None]:
# Count remaining nulls in the two indicator columns.
employee[['filed_complaint','recently_promoted']].isnull().sum()

In [None]:
# Preview the first rows of the employee table.
employee.head()

In [None]:
# Collect the names of all numeric columns in `data`.
# `select_dtypes(include='number')` matches every numeric dtype whatever its
# width. Comparing `dtype == int` / `dtype == float` only matches the
# platform-default widths, so on setups where NumPy's default integer is
# 32-bit (e.g. Windows with NumPy < 2) int64 columns would be skipped.
num_feature = data.select_dtypes(include='number').columns.tolist()

print('Total Numerical Features:', len(num_feature))
print('Features:', num_feature)

#### **3.1.1 Null Data Identification & Handling**

In [34]:
# Per-column null summary for `employee`: absolute count and share of rows.
null_frame = pd.DataFrame(
    {'Null Count': employee.isnull().sum().values},
    index=employee.columns.values,
)
percent = null_frame['Null Count'].values / employee.shape[0]
null_frame['Missing %age'] = np.round(percent, decimals=4) * 100
# Show only the columns that actually contain nulls, one column per feature.
null_frame.loc[null_frame['Null Count'] > 0].T

Unnamed: 0,department,filed_complaint,last_evaluation,recently_promoted,satisfaction,tenure
Null Count,707.0,12104.0,1487.0,13853.0,150.0,150.0
Missing %age,5.0,85.54,10.51,97.9,1.06,1.06


**Before Handling Null Data**

**Observation:**

- **department**:
  - Missing Information (707) &rarr; Action Required.
- **filed_complaint**:
  - Missing Information (12104) &rarr; Replace with 0.
- **last_evaluation**:
  - Missing Information (1487) &rarr; Replace with Median value.
- **recently_promoted**:
  - Missing Information (13853) &rarr; Replace with 0.
- **satisfaction**:
  - Missing Information (150) &rarr; Replace with Median.
- **tenure**:
  - Missing Information (150) &rarr; Replace with Median.

**Performing Operations**