# Dataset pre-processing.

## Import packages

In [1]:
import sys
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

## Read dataset

In [2]:
# Load the raw payroll export; every later cell derives from this frame.
dataset = pd.read_csv("Citywide_Payroll_Data__Fiscal_Year_.csv")
# sys.getsizeof reports bytes; scaling by 1e-9 prints the size in gigabytes.
print(f"Dataset size in memory: {sys.getsizeof(dataset)*1e-9:.3f}","gb")
# Work on an explicit, independent copy: `dataset` stays untouched for the
# original-vs-processed comparison, and the in-place edits made further down
# never act on a frame that pandas considers derived from another one.
new_data = dataset.copy()
assert new_data.shape == dataset.shape

Dataset size in memory: 2.934 gb


In [3]:
# Show the last five rows of the raw frame as a quick check of the load.
dataset.tail(5)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Mid Init,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
4496762,2021,742.0,DEPT OF ED PEDAGOGICAL,BURGAN,WYLIE,G,10/01/1986,OTHER,ASSISTANT PRINCIPAL,CEASED,124800.0,per Annum,0.0,0.0,0.0,0.0,-71916.37
4496763,2021,902.0,BRONX DISTRICT ATTORNEY,SIMMONS,NATHANIEL,,07/02/1990,BRONX,SPECIAL ASSISTANT TO THE DISTRICT ATTORNEY,CEASED,110000.0,per Annum,-70.0,-4207.65,0.0,0.0,-75440.0
4496764,2021,742.0,DEPT OF ED PEDAGOGICAL,ROBERTSON,BRENDA,L,10/15/1985,OTHER,ASSISTANT PRINCIPAL,CEASED,126061.0,per Annum,0.0,0.0,0.0,0.0,-87436.13
4496765,2021,742.0,DEPT OF ED PEDAGOGICAL,BECKHAM,DOLORES,,11/04/1977,OTHER,PRINCIPAL,CEASED,159609.0,per Annum,0.0,0.0,0.0,0.0,-97608.11
4496766,2021,740.0,DEPARTMENT OF EDUCATION ADMIN,CORLETT,SEAN,,09/15/2013,MANHATTAN,ADMINISTRATIVE EDUCATION OFFICER,CEASED,143262.0,per Annum,0.0,-117989.06,0.0,0.0,0.0


### Total missing values.

In [4]:
# Count the NaN cells of each column, then add the per-column counts together.
missing_per_column = new_data.isna().sum()
print(f'Missing cell values : {sum(missing_per_column)}')

Missing cell values : 4100946


### Missing values column wise.

In [5]:
# Per-column NaN counts, displayed as a one-column table.
new_data.isna().sum().to_frame(name="Missing values")

Unnamed: 0,Missing values
Fiscal Year,0
Payroll Number,1745440
Agency Name,0
Last Name,6632
First Name,6661
Mid Init,1835833
Agency Start Date,63
Work Location Borough,506229
Title Description,88
Leave Status as of June 30,0


### Columns to be handled.

In [6]:
# Per-column NaN counts; only the columns that actually contain gaps are shown.
condition = new_data.isna().sum()
condition[condition > 0].to_frame(name="Missing Values")

Unnamed: 0,Missing Values
Payroll Number,1745440
Last Name,6632
First Name,6661
Mid Init,1835833
Agency Start Date,63
Work Location Borough,506229
Title Description,88


### Drop irrelevant column 'Mid Init'.

In [7]:
# Remove the 'Mid Init' column (treated as irrelevant for this analysis).
new_data = new_data.drop(columns=['Mid Init'])

### Every agency has a unique payroll number, so missing payroll numbers can be filled in from the 'Agency Name' column. The filled-in values are stored in a new column, 'Payroll Number proc', the processed version of 'Payroll Number'. Rows whose agency never appears with a payroll number are dropped.

In [8]:
# Trim surrounding whitespace from every agency name. The vectorised `.str`
# accessor replaces the per-row lambda: it avoids a Python call per row and
# leaves a missing or non-string entry as NaN instead of raising
# AttributeError.
new_data["Agency Name"] = new_data["Agency Name"].str.strip()
# Kept from the original cell; presumably meant to normalise None to NaN.
# TODO confirm this pass over the frame is still needed.
new_data.fillna(np.nan, inplace=True)

In [9]:
# Count every (payroll number, agency name) pair. value_counts lists the pairs
# from most to least frequent and leaves out rows where either value is NaN.
temp_dict = dict(new_data[["Payroll Number", "Agency Name"]].value_counts())
# Build the agency name -> payroll number lookup. setdefault keeps the first
# (most frequent) number seen for an agency; the earlier dict comprehension
# let later, rarer pairs overwrite it, so an agency recorded under several
# numbers ended up with its least common one.
pa_dict = {}
for payroll_number, agency_name in temp_dict:
    pa_dict.setdefault(agency_name, payroll_number)

def func(x):
    """Return the payroll number stored for agency name ``x``.

    The lookup reads the module-level ``pa_dict`` mapping; an agency name
    without an entry yields ``np.nan``.
    """
    return pa_dict.get(x, np.nan)

# Fill in a payroll number for every row from its agency name.
new_data['Payroll Number proc'] = new_data["Agency Name"].apply(func)
# Keep only the rows whose agency could be resolved to a payroll number.
has_payroll_number = new_data["Payroll Number proc"].notna()
new_data = new_data.loc[has_payroll_number]

### Drop the 'Payroll Number' column.
### Drop rows with a missing value in any remaining column ('Last Name', 'First Name', 'Agency Start Date', 'Work Location Borough' and 'Title Description').  

In [10]:
# The raw 'Payroll Number' column is replaced by 'Payroll Number proc', so
# drop it, then discard every row that still has a missing value in any column.
new_data = new_data.drop(columns="Payroll Number").dropna(axis=0)

### After removing missing values.

In [11]:
# Recount the NaN cells per column to confirm that none are left.
condition = new_data.isna().sum()
condition.to_frame(name="Missing Values")

Unnamed: 0,Missing Values
Fiscal Year,0
Agency Name,0
Last Name,0
First Name,0
Agency Start Date,0
Work Location Borough,0
Title Description,0
Leave Status as of June 30,0
Base Salary,0
Pay Basis,0


## Original dataset vs Processed one.

In [12]:
# Side-by-side (rows, columns) of the raw frame and the cleaned frame.
pd.DataFrame(
    {"Original": dataset.shape, "Processed": new_data.shape},
    index=["Rows", "columns"],
).T

Unnamed: 0,Rows,columns
Original,4496767,17
Processed,3980623,16


In [13]:
# Preview the first two cleaned rows; 'Payroll Number proc' is still the last column here.
new_data.head(2)

Unnamed: 0,Fiscal Year,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Payroll Number proc
0,2020,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,17.0
1,2020,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,17.0


### Moving the processed payroll number to the second column and renaming it 'Payroll Number'.

In [14]:
# Re-insert the filled-in payroll numbers at position 1 under the original
# column name, then remove the temporary 'Payroll Number proc' column.
payroll_filled = new_data["Payroll Number proc"]
new_data.insert(1, "Payroll Number", payroll_filled)
new_data = new_data.drop(columns="Payroll Number proc")

In [15]:
# Confirm that 'Payroll Number' now sits right after 'Fiscal Year'.
new_data.head(2)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
1,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0


### Grouping regular hours. Max is 4160 hours, min is -1260 hours.

In [16]:
# Summary statistics of 'Regular Hours'; the min/max guide the bin edges used below.
new_data["Regular Hours"].describe().to_frame()

Unnamed: 0,Regular Hours
count,3980623.0
mean,662.5017
std,886.1797
min,-1260.0
25%,0.0
50%,0.0
75%,1820.0
max,4160.0


In [17]:
def RG_func(x):
    """Map a 'Regular Hours' value to a 500-hour wide range label.

    Returns '<0' for a negative value, 'lo-hi' when ``lo <= x < hi`` and
    '>4500' above the last edge. A value exactly equal to the last edge is
    placed in the final range ('4000-4500').

    Previously any value at or above the last edge fell through the loop and
    returned ``None``. NaN compares false against every edge and still
    returns ``None``.
    """
    r = [0,500,1000,1500,2000,2500,3000,3500,4000,4500]
    top = r[-1]
    # The out-of-range checks do not depend on the bin, so do them once up front.
    if x < 0:
        return '<0'
    if x > top:
        return f'>{top}'
    # Walk consecutive (lower, upper) edge pairs; the first upper edge above x wins.
    for lower, upper in zip(r, r[1:]):
        if x < upper:
            return f'{lower}-{upper}'
    if x == top:
        # Close the last range on the right so the top edge itself gets a label.
        return f'{r[-2]}-{top}'
    return None

# Label every row with its regular-hours range via the RG_func defined just above.
regular_hours_labels = new_data["Regular Hours"].apply(RG_func)
new_data['Regular Hours category'] = regular_hours_labels
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,1500-2000


### Add column called 'Total Paid', after combining Regular Gross Paid, Total OT Paid and Total Other Pay.

In [18]:
# Total compensation = regular gross pay + overtime pay + other pay,
# added in that order.
pay_columns = ["Regular Gross Paid", "Total OT Paid", "Total Other Pay"]
regular_paid, overtime_paid, other_paid = (new_data[name] for name in pay_columns)
new_data["Total Paid"] = regular_paid + overtime_paid + other_paid
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,1500-2000,84698.21


### Grouping OT hours. Max is 3147, min is -209.

In [19]:
# Summary statistics of 'OT Hours'; the min/max guide the bin edges used below.
new_data["OT Hours"].describe()

count    3.980623e+06
mean     6.116369e+01
std      1.568587e+02
min     -2.090000e+02
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.147000e+03
Name: OT Hours, dtype: float64

In [20]:
def RG_func(x):
    """Map an 'OT Hours' value to a 500-hour wide range label.

    This redefines the ``RG_func`` name used by the earlier cells. Returns
    '<0' for a negative value, 'lo-hi' when ``lo <= x < hi`` and '>3500'
    above the last edge. A value exactly equal to the last edge is placed in
    the final range ('3000-3500').

    Previously any value at or above the last edge fell through the loop and
    returned ``None``. NaN compares false against every edge and still
    returns ``None``.
    """
    r = [0,500,1000,1500,2000,2500,3000,3500]
    top = r[-1]
    # The out-of-range checks do not depend on the bin, so do them once up front.
    if x < 0:
        return '<0'
    if x > top:
        return f'>{top}'
    # Walk consecutive (lower, upper) edge pairs; the first upper edge above x wins.
    for lower, upper in zip(r, r[1:]):
        if x < upper:
            return f'{lower}-{upper}'
    if x == top:
        # Close the last range on the right so the top edge itself gets a label.
        return f'{r[-2]}-{top}'
    return None

# Label every row with its overtime-hours range via the RG_func defined just above.
ot_hours_labels = new_data["OT Hours"].apply(RG_func)
new_data['OT Hours category'] = ot_hours_labels
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500


### Calculate hourly pay. Regular Gross Paid / Regular Hours.

In [21]:
def RG_func(x):
    """Return the hourly pay for one (regular hours, regular gross paid) pair.

    ``x`` is a sequence or pandas row whose first value is the regular hours
    and whose second value is the regular gross paid; any further values are
    ignored. Zero or negative hours yield 0 instead of dividing.

    The two values are taken by position through unpacking rather than
    ``x[0]`` / ``x[1]``: on a row indexed by column names, integer keys only
    work through a positional fallback that pandas has deprecated.
    """
    hours, gross_paid, *_ = x
    if hours <= 0:
        return 0
    return gross_paid / hours

# Hourly pay = regular gross paid / regular hours, computed on whole columns
# instead of calling a Python function once per row, which is very slow on a
# frame with millions of rows.
regular_hours = new_data["Regular Hours"]
hourly_pay = new_data["Regular Gross Paid"] / regular_hours
# Rows with zero or negative hours get 0, the same rule RG_func applies; this
# also overwrites the inf/NaN that dividing by zero hours produces.
new_data["Hourly Pay"] = hourly_pay.mask(regular_hours <= 0, 0)
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478


### Creating Hourly Pay category.

In [22]:
def RG_func(x):
    """Map an 'Hourly Pay' value to a 100-wide range label.

    This redefines the ``RG_func`` name used by the earlier cells. Returns
    '<0' for a negative value, 'lo-hi' when ``lo <= x < hi`` and '>1000'
    above the last edge. A value of exactly 1000 is placed in the final range
    ('900-1000').

    Previously exactly 1000 was neither ``> 1000`` nor below any edge, so it
    returned ``None``. NaN compares false against every edge and still
    returns ``None``.
    """
    r = list(range(0, 1001, 100))
    top = r[-1]
    # The out-of-range checks do not depend on the bin, so do them once up front.
    if x < 0:
        return '<0'
    if x > top:
        return f'>{top}'
    # Walk consecutive (lower, upper) edge pairs; the first upper edge above x wins.
    for lower, upper in zip(r, r[1:]):
        if x < upper:
            return f'{lower}-{upper}'
    if x == top:
        # Close the last range on the right so the top edge itself gets a label.
        return f'{r[-2]}-{top}'
    return None

# Label every row with its hourly-pay range via the RG_func defined just above.
hourly_pay_labels = new_data["Hourly Pay"].apply(RG_func)
new_data['Hourly Pay category'] = hourly_pay_labels
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,...,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay,Hourly Pay category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,1820.0,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100


### Create category for Total Paid.

In [23]:
# Summary statistics of 'Total Paid'; the min/max guide the bin edges used below.
new_data["Total Paid"].describe()

count    3.980623e+06
mean     4.870030e+04
std      4.720703e+04
min     -1.740880e+05
25%      3.622245e+03
50%      3.929264e+04
75%      8.204792e+04
max      6.727313e+05
Name: Total Paid, dtype: float64

In [24]:
def RG_func(x):
    """Map a 'Total Paid' value to a 30000-wide range label.

    This redefines the ``RG_func`` name used by the earlier cells. Returns
    '<0' for a negative value, 'lo-hi' when ``lo <= x < hi`` and '>300000'
    above the last edge. A value of exactly 300000 is placed in the final
    range ('270000-300000').

    Previously exactly 300000 was neither above the maximum nor below any
    edge, so it returned ``None``. NaN compares false against every edge and
    still returns ``None``.
    """
    r = list(range(0, 300001, 30000))
    # The top edge is fixed, so read it once instead of calling max(r) per iteration.
    top = r[-1]
    if x < 0:
        return '<0'
    if x > top:
        return f'>{top}'
    # Walk consecutive (lower, upper) edge pairs; the first upper edge above x wins.
    for lower, upper in zip(r, r[1:]):
        if x < upper:
            return f'{lower}-{upper}'
    if x == top:
        # Close the last range on the right so the top edge itself gets a label.
        return f'{r[-2]}-{top}'
    return None

# Label every row with its total-pay range via the RG_func defined just above.
total_pay_labels = new_data["Total Paid"].apply(RG_func)
new_data['Total Pay category'] = total_pay_labels
new_data.head(1)

Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,...,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay,Hourly Pay category,Total Pay category
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000


### Convert category columns to string.

In [25]:
# Cast every derived category label to str, element by element.
cols = [
    "Regular Hours category",
    "OT Hours category",
    "Hourly Pay category",
    "Total Pay category",
]
for col in cols:
    new_data[col] = new_data[col].apply(str)

### Save processed dataframe in csv format.

In [27]:
# Write the processed frame to CSV; the row index is written as the first, unnamed column.
new_data.to_csv("NYC-proc.csv")

### Saving the processed data in feather format.

In [31]:
# Move the current row labels into an 'index' column so the frame has a fresh
# default index before writing (to_feather is documented to need one; verify
# against the installed pandas version).
new_data = new_data.reset_index()
new_data.to_feather("NYC-proc.feather")

In [33]:
# Preview the first row; the old row labels now appear as the 'index' column.
new_data.head(1)

Unnamed: 0,index,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,...,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Regular Hours category,Total Paid,OT Hours category,Hourly Pay,Hourly Pay category,Total Pay category
0,0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,...,84698.21,0.0,0.0,0.0,1500-2000,84698.21,0-500,46.537478,0-100,60000-90000


### Saving a few rows to display on GitHub. The entire dataset is more than 300 MB.

In [34]:
# Write only the first 100 rows as a small sample file.
sample_rows = new_data.head(100)
sample_rows.to_feather("100-NYC-rows.feather")