# HR Employee Records
### Problem Statement:
    Process employee data for inconsistencies, fill missing ratings, compute average salaries by department, and identify low-performing departments.
    Key Functions: .pivot_table(), .nunique(), .clip(), .qcut(), np.percentile(), .dt.year, .value_counts()

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('hr_employees.csv')
df = df.copy()
df

Unnamed: 0,Emp_ID,Name,Department,Join_Date,Salary,Performance_Rating
0,6001,Alice,Marketing,2022-01-01,40000.0,3.0
1,6002,Bob,Finance,2022-01-08,25000.0,
2,6003,Charlie,,2022-01-15,40000.0,4.0
3,6004,Charlie,Marketing,2022-01-22,,
4,6005,Frank,Admin,2022-01-29,,5.0
5,6006,Charlie,HR,2022-02-05,30000.0,4.0
6,6007,Frank,Finance,2022-02-12,40000.0,5.0
7,6008,David,HR,2022-02-19,40000.0,
8,6009,Frank,Admin,2022-02-26,50000.0,2.0
9,6010,Bob,Admin,2022-03-05,30000.0,3.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Emp_ID              50 non-null     int64  
 1   Name                50 non-null     object 
 2   Department          41 non-null     object 
 3   Join_Date           50 non-null     object 
 4   Salary              42 non-null     float64
 5   Performance_Rating  41 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.5+ KB


In [5]:
df.isnull().sum()

Emp_ID                0
Name                  0
Department            9
Join_Date             0
Salary                8
Performance_Rating    9
dtype: int64

In [6]:
df['Department'] = df['Department'].ffill()

In [7]:
df

Unnamed: 0,Emp_ID,Name,Department,Join_Date,Salary,Performance_Rating
0,6001,Alice,Marketing,2022-01-01,40000.0,3.0
1,6002,Bob,Finance,2022-01-08,25000.0,
2,6003,Charlie,Finance,2022-01-15,40000.0,4.0
3,6004,Charlie,Marketing,2022-01-22,,
4,6005,Frank,Admin,2022-01-29,,5.0
5,6006,Charlie,HR,2022-02-05,30000.0,4.0
6,6007,Frank,Finance,2022-02-12,40000.0,5.0
7,6008,David,HR,2022-02-19,40000.0,
8,6009,Frank,Admin,2022-02-26,50000.0,2.0
9,6010,Bob,Admin,2022-03-05,30000.0,3.0


In [8]:
df.isnull().sum()

Emp_ID                0
Name                  0
Department            0
Join_Date             0
Salary                8
Performance_Rating    9
dtype: int64

In [9]:
df.fillna({'Salary':df['Salary'].mean().round()}, inplace=True)

In [10]:
df

Unnamed: 0,Emp_ID,Name,Department,Join_Date,Salary,Performance_Rating
0,6001,Alice,Marketing,2022-01-01,40000.0,3.0
1,6002,Bob,Finance,2022-01-08,25000.0,
2,6003,Charlie,Finance,2022-01-15,40000.0,4.0
3,6004,Charlie,Marketing,2022-01-22,38095.0,
4,6005,Frank,Admin,2022-01-29,38095.0,5.0
5,6006,Charlie,HR,2022-02-05,30000.0,4.0
6,6007,Frank,Finance,2022-02-12,40000.0,5.0
7,6008,David,HR,2022-02-19,40000.0,
8,6009,Frank,Admin,2022-02-26,50000.0,2.0
9,6010,Bob,Admin,2022-03-05,30000.0,3.0


In [11]:
df['Performance_Rating'].value_counts()

Performance_Rating
3.0    11
5.0    11
4.0    10
1.0     6
2.0     3
Name: count, dtype: int64

In [12]:
df.fillna({'Performance_Rating':5}, inplace = True)

In [13]:
df['Performance_Rating'] = df['Performance_Rating'].astype(int)

In [14]:
df

Unnamed: 0,Emp_ID,Name,Department,Join_Date,Salary,Performance_Rating
0,6001,Alice,Marketing,2022-01-01,40000.0,3
1,6002,Bob,Finance,2022-01-08,25000.0,5
2,6003,Charlie,Finance,2022-01-15,40000.0,4
3,6004,Charlie,Marketing,2022-01-22,38095.0,5
4,6005,Frank,Admin,2022-01-29,38095.0,5
5,6006,Charlie,HR,2022-02-05,30000.0,4
6,6007,Frank,Finance,2022-02-12,40000.0,5
7,6008,David,HR,2022-02-19,40000.0,5
8,6009,Frank,Admin,2022-02-26,50000.0,2
9,6010,Bob,Admin,2022-03-05,30000.0,3


In [15]:
df.isnull().sum()

Emp_ID                0
Name                  0
Department            0
Join_Date             0
Salary                0
Performance_Rating    0
dtype: int64

In [16]:
df.to_csv('cleaned_employee_data.csv', index = False)