# Exploratory Data Analysis - Part 1

   - Employee Salaries

In [1]:
import pandas as pd

### Read CSV Files

In [4]:
# Loading data (csv, html, excel, sql)

df_salary = pd.read_csv('salaries.csv') # From Kaggle.com

In [5]:
df_salary.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [6]:
# Check how many entries

df_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148654 non-null  int64  
 1   EmployeeName      148654 non-null  object 
 2   JobTitle          148654 non-null  object 
 3   BasePay           148045 non-null  float64
 4   OvertimePay       148650 non-null  float64
 5   OtherPay          148650 non-null  float64
 6   Benefits          112491 non-null  float64
 7   TotalPay          148654 non-null  float64
 8   TotalPayBenefits  148654 non-null  float64
 9   Year              148654 non-null  int64  
 10  Notes             0 non-null       float64
 11  Agency            148654 non-null  object 
 12  Status            0 non-null       float64
dtypes: float64(8), int64(2), object(3)
memory usage: 14.7+ MB


## What is the average base pay?

In [7]:
df_salary['BasePay'].mean()

66325.44884050643

## What is the highest amount of OvertimePay in the dataset?

In [8]:
df_salary['OvertimePay'].max()

245131.88

## How much does Joseph Driscoll make including benefits?

In [12]:
df_salary[df_salary['EmployeeName'] == 'Joseph Driscoll']['TotalPayBenefits']

36198    331834.79
Name: TotalPayBenefits, dtype: float64

## Who is the highest paid person including benefits?

In [15]:
df_salary[df_salary['TotalPayBenefits'] == df_salary['TotalPayBenefits'].max()]['EmployeeName']

0    NATHANIEL FORD
Name: EmployeeName, dtype: object

In [18]:
# Alternative

df_salary.sort_values(by = 'TotalPayBenefits', ascending = False)[['EmployeeName', 'TotalPayBenefits']].head(1)                                                               

Unnamed: 0,EmployeeName,TotalPayBenefits
0,NATHANIEL FORD,567595.43


## Who is the lowest paid person including benefits?

In [19]:
df_salary[df_salary['TotalPayBenefits'] == df_salary['TotalPayBenefits'].min()]['EmployeeName']

148653    Joe Lopez
Name: EmployeeName, dtype: object

## What was the average BasePay of all employees per year? (2011 - 2014)

In [21]:
df_salary.groupby(by = 'Year').mean()['BasePay']

Year
2011    63595.956517
2012    65436.406857
2013    69630.030216
2014    66564.421924
Name: BasePay, dtype: float64

## What was the average BasePay of all employees per year? (2011, 2013 only)

In [24]:
df_filtered = df_salary.groupby('Year').mean()
df_filtered = df_filtered.query("Year == '2011' | Year == '2013'")
df_filtered['BasePay']

Year
2011    63595.956517
2013    69630.030216
Name: BasePay, dtype: float64

## How many unique job titles are there?

In [28]:
df_salary['JobTitle'].nunique()

2159

## What are the top 3 most common jobs?

In [30]:
df_salary['JobTitle'].value_counts().head(3)

Transit Operator    7036
Special Nurse       4389
Registered Nurse    3736
Name: JobTitle, dtype: int64

## How many Job Titles were represented by only one person in 2013?

In [36]:
(df_salary[df_salary['Year'] == 2013]['JobTitle'].value_counts() == 1).sum()

202

## How many people have the word 'Chief' in their job title?

In [66]:
def finding_job(title):
    
    if 'chief' in title.lower().split():
        
        return True
    
    else:
        
        return False

In [68]:
df_salary['JobTitle'].apply(finding_job).sum()

477

## Is there a correlation between length of the Job Title and Salary?

     - This is an advanced question, a preview of next semester :-)
     - 1 Perfect (+ rel), 0.8 (+ fairly), 0.6 (+ moderate), 0 (none)

In [74]:
df_salary['Title Length'] = df_salary['JobTitle'].apply(len)

In [78]:
df_salary[['TotalPayBenefits','Title Length']].corr()

Unnamed: 0,TotalPayBenefits,Title Length
TotalPayBenefits,1.0,-0.036878
Title Length,-0.036878,1.0
