# Dataset pre-processing.

## Import packages

In [1]:
import sys
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

## Read dataset

In [2]:
# Load the payroll CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv("Citywide_Payroll_Data__Fiscal_Year_.csv")

# sys.getsizeof returns bytes; scaling by 1e-9 reports the size in gigabytes.
print(
    f"Dataset size in memory: {sys.getsizeof(dataset)*1e-9:.3f}",
    "gb",
)

# Work on a separate copy so `dataset` keeps the raw data for later comparison.
new_data = dataset.copy()
assert dataset.shape == new_data.shape

Dataset size in memory: 2.934 gb


In [3]:
# Preview the first five rows (five is the default for head()).
dataset.head()

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Mid Init,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
1,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,M,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
2,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
3,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,ROTTA,JONATHAN,D,09/16/2013,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
4,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,WILSON II,ROBERT,P,04/30/2018,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0


### Total missing values.

In [4]:
# isna().sum() gives per-column counts; the second sum() totals them over all columns.
print(f'Missing cell values : {new_data.isna().sum().sum()}')

Missing cell values : 4100946


### Missing values column-wise.

In [5]:
# Per-column count of missing cells, shown as a one-column table.
new_data.isna().sum().to_frame(name="Missing values")

Unnamed: 0,Missing values
Fiscal Year,0
Payroll Number,1745440
Agency Name,0
Last Name,6632
First Name,6661
Mid Init,1835833
Agency Start Date,63
Work Location Borough,506229
Title Description,88
Leave Status as of June 30,0


### Columns to be handled.

In [6]:
# Count missing cells per column, then show only the columns that have at least one.
condition = new_data.isna().sum()
condition.loc[condition > 0].to_frame(name="Missing Values")

Unnamed: 0,Missing Values
Payroll Number,1745440
Last Name,6632
First Name,6661
Mid Init,1835833
Agency Start Date,63
Work Location Borough,506229
Title Description,88


### Drop irrelevant column 'Mid Init'.

In [7]:
# Remove the 'Mid Init' column. Rebinding the result (instead of inplace=True)
# together with errors="ignore" keeps this cell re-runnable: a second execution
# no longer raises KeyError because the column is already gone.
new_data = new_data.drop(columns="Mid Init", errors="ignore")

### Every agency has a unique payroll number, so missing payroll numbers can be filled in from the 'Agency Name' column. The result is stored in a new column, 'Payroll Number proc', which is the processed version of the 'Payroll Number' column.

In [8]:
# Trim leading/trailing whitespace from agency names. The vectorised .str
# accessor passes missing values through as NaN, whereas calling x.strip() per
# element raises AttributeError on any NaN / non-string entry.
new_data["Agency Name"] = new_data["Agency Name"].str.strip()
# NOTE(review): filling NaN with NaN looks like a no-op; presumably intended to
# normalise other missing markers (e.g. None) to NaN — confirm before removing.
new_data.fillna(np.nan, inplace=True)

In [9]:
# Count every (payroll number, agency name) pair present in the data, ordered
# from the most frequent pair to the least frequent one.
temp_dict = dict(
    new_data[["Payroll Number", "Agency Name"]].value_counts(sort=True, ascending=False)
)

# Agency name -> payroll number lookup. Pairs are visited most-frequent first
# and setdefault only stores the first number seen per agency, so an agency
# recorded under several payroll numbers maps to its most common one. (A plain
# dict comprehension would let later, rarer pairs overwrite it.)
pa_dict = {}
for payroll_number, agency_name in temp_dict:
    pa_dict.setdefault(agency_name, payroll_number)

def func(x):
    """Return the payroll number stored in ``pa_dict`` for agency name ``x``.

    Falls back to ``np.nan`` when ``x`` is not a key of ``pa_dict``.
    """
    return pa_dict.get(x, np.nan)

# Look up a payroll number for every row from its agency name; agencies that
# are not in pa_dict come back as NaN.
new_data['Payroll Number proc'] = new_data["Agency Name"].map(func)
# Keep only the rows for which the lookup found a payroll number.
new_data = new_data.loc[new_data["Payroll Number proc"].notna()]

### Drop 'Payroll Number' and 'Work Location Borough' columns.
### Drop rows with missing values in 'Last Name', 'First Name', 'Agency Start Date' or 'Title Description'.

In [10]:
# Remove the two columns that are no longer needed, then discard every row that
# still contains a missing value in any remaining column.
# Rebinding the result with errors="ignore" (instead of inplace drops) keeps the
# cell re-runnable: executing it again no longer raises KeyError for columns
# that were already removed.
new_data = (
    new_data
    .drop(columns=["Payroll Number", "Work Location Borough"], errors="ignore")
    .dropna(axis=0)
)

### After removing missing values.

In [11]:
# Recount missing cells per column after cleaning and show them as a table.
condition = new_data.isna().sum()
pd.DataFrame({"Missing Values": condition})

Unnamed: 0,Missing Values
Fiscal Year,0
Agency Name,0
Last Name,0
First Name,0
Agency Start Date,0
Title Description,0
Leave Status as of June 30,0
Base Salary,0
Pay Basis,0
Regular Hours,0


## Original dataset vs Processed one (shape).

In [15]:
# Row and column counts of the raw frame next to those of the cleaned frame.
pd.DataFrame(
    {
        "Rows": [dataset.shape[0], new_data.shape[0]],
        "columns": [dataset.shape[1], new_data.shape[1]],
    },
    index=["Original", "Processed"],
)

Unnamed: 0,Rows,columns
Original,4496767,17
Processed,4429692,15
