# NYC Payroll Data EDA

### Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the payroll CSV export into a DataFrame and preview its first rows.
nycp = pd.read_csv('../input/nyc-payroll-data/k397-673e.csv')
nycp.head()

### Data Cleaning

In [None]:
# Count the distinct values in each column.
nycp.nunique()

In [None]:
# Count the missing (NaN) entries in each column.
nycp.isna().sum()

Fiscal Year, Payroll Number, and Agency Name don't give us much information at all - each one holds the same value in every row, so we'll drop them. Personal information, including name and start date, has been made public under the Freedom of Information Law, but it still raises an ethical / privacy concern for me - I'll be dropping each of those columns, too.

Luckily, the only null values are in the Middle Initial column, which has no real value for this analysis and is going to be dropped anyway.

In [None]:
# Drop the seven columns that are not used in the analysis, then relabel the
# ten remaining columns with short, readable names. The new names are assigned
# by position, so they rely on the column order of the loaded file.
nycp = nycp.drop(
    [
        'fiscal_year', 'payroll_number', 'agency_name',
        'last_name', 'first_name', 'mid_init', 'agency_start_date',
    ],
    axis='columns',
)
nycp.columns = [
    'Borough',
    'Job',
    'Leave Status',
    'Salary',
    'Pay Basis',
    'Reg Hours',
    'Reg Gross Pay',
    'OT Hours',
    'OT Pay',
    'Other Pay',
]
nycp.head()

### Employees by Borough

In [None]:
# Count employees per borough. value_counts() returns the counts sorted in
# descending order, so the first bar is the borough with the most employees.
nycpb = nycp['Borough'].value_counts()

fig, ax = plt.subplots(figsize = [14,6])

# Highlight only the largest borough. The colour list is built from the number
# of bars rather than hard-coded to five entries, so the highlight does not
# depend on the data containing exactly five categories.
bar_colors = ['maroon'] + ['gray'] * (len(nycpb) - 1)

ax.bar(nycpb.index, nycpb.values, color=bar_colors, alpha=.8)
ax.set_title('Employees per Borough', fontsize = 18)
ax.spines[['right', 'top']].set_visible(False)

plt.show()

Manhattan is where the majority of work and services take place: it has more employees than the other four boroughs combined.

### Leave Status

In [None]:
# Tally employees by leave status and draw the counts as a bar chart.
nycpl = nycp['Leave Status'].value_counts()

fig, ax = plt.subplots(figsize=[14, 6])

# One colour per bar, in the (descending-count) order produced above.
status_colors = ['maroon', 'gray', 'gray', 'black']
ax.bar(x=nycpl.index, height=nycpl.values, color=status_colors, alpha=.8)
ax.set_title('Employee Leave Status as of July 31st', fontsize=18)

# Hide the right and top frame lines for a cleaner look.
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

plt.show()

A vast majority of workers are active, and of those who have taken leave, most have already returned from it as of July 31st.

### Pay Statistics

In [None]:
# Keep only the four monetary columns for the pay distribution plots.
pay_columns = ['Salary', 'Reg Gross Pay', 'OT Pay', 'Other Pay']
paydf = nycp[pay_columns]

In [None]:
# Three stacked panels over the same pay columns: a violin plot, a box plot
# with outliers, and a box plot with the outliers hidden.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(14,15))
violin_ax, box_ax, trimmed_ax = axs

# Violin panel: red bodies, thin black outlines and extrema lines.
violin_parts = violin_ax.violinplot(paydf)
violin_ax.set_title('Pay by Type',fontsize=18)

for body in violin_parts['bodies']:
    body.set_facecolor('red')
    body.set_edgecolor('black')

for key in ('cbars','cmins','cmaxes'):
    line_set = violin_parts[key]
    line_set.set_edgecolor('black')
    line_set.set_linewidth(1)

# Box panels: identical styling, differing only in title and outlier display.
# showfliers=None leaves the outlier setting at matplotlib's default.
box_panels = (
    (box_ax, 'Pay by Type, including Outliers', None),
    (trimmed_ax, 'Pay by Type, Without Outliers', False),
)
for panel, panel_title, fliers in box_panels:
    box_parts = panel.boxplot(paydf, showfliers=fliers)
    panel.set_title(panel_title, fontsize=18)
    for outline in box_parts['boxes']:
        outline.set(color='maroon', linewidth=1)

# Shared cosmetics: no horizontal grid, no right/top frame lines, and one
# labelled tick per column (violin and box plots sit at positions 1..n).
tick_positions = list(range(1, len(paydf.columns) + 1))
for ax in axs:
    ax.yaxis.grid(False)
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(paydf.columns)

plt.subplots_adjust(hspace=.4)

plt.show()

Most salary tends to be made up of regular gross pay, with OT and other pay bringing the average salary up above the average gross pay.

### Pay Basis

In [None]:
# Count employees per pay basis. value_counts() returns the counts sorted in
# descending order, so the first bar is the most common pay basis.
nypb = nycp['Pay Basis'].value_counts()

fig, ax = plt.subplots(figsize = [14,6])

# Highlight only the most common pay basis. The colour list is built from the
# number of bars rather than hard-coded to three entries, so the highlight
# does not depend on the data containing exactly three categories.
bar_colors = ['slateblue'] + ['firebrick'] * (len(nypb) - 1)

ax.bar(nypb.index, nypb.values, color=bar_colors, alpha=.8)
ax.set_title('Pay Basis Count', fontsize = 18)
ax.spines[['right', 'top']].set_visible(False)

plt.show()

# Display the raw counts underneath the chart.
nypb

Fewer than 1% of employees are not salaried.

### Hours Worked Statistics

In [None]:
# Select the two hours columns and draw three stacked panels over them: a
# violin plot, a box plot with outliers, and a box plot with outliers hidden.
hwdf = nycp[['Reg Hours' , 'OT Hours']]

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(14,15))
violin_ax, box_ax, trimmed_ax = axs

# Violin panel: slate-blue bodies, thin gray outlines and extrema lines.
violin_parts = violin_ax.violinplot(hwdf)
violin_ax.set_title('Hours Worked by Type',fontsize=18)

for body in violin_parts['bodies']:
    body.set_facecolor('slateblue')
    body.set_edgecolor('gray')

for key in ('cbars','cmins','cmaxes'):
    line_set = violin_parts[key]
    line_set.set_edgecolor('gray')
    line_set.set_linewidth(1)

# Box panels: identical styling, differing only in title and outlier display.
# showfliers=None leaves the outlier setting at matplotlib's default.
box_panels = (
    (box_ax, 'Hours Worked, including Outliers', None),
    (trimmed_ax, 'Hours Worked, Without Outliers', False),
)
for panel, panel_title, fliers in box_panels:
    box_parts = panel.boxplot(hwdf, showfliers=fliers)
    panel.set_title(panel_title, fontsize=18)
    for outline in box_parts['boxes']:
        outline.set(color='slateblue', linewidth=1)

# Shared cosmetics: no horizontal grid, no right/top frame lines, and one
# labelled tick per column (violin and box plots sit at positions 1..n).
tick_positions = list(range(1, len(hwdf.columns) + 1))
for ax in axs:
    ax.yaxis.grid(False)
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(hwdf.columns)

plt.subplots_adjust(hspace=.4)

plt.show()

Most hours worked are between 30 and 40 hours a week, and the average employee puts in a few hours of OT per week. 

There are a few outliers who put in an average of over 30 hours per week OT, and there are some employees who work more "part time" hours.