# Pandas Operations

In [None]:
import numpy as np
import pandas as pd

## Sample Dataset

In [None]:
# Toy purchase records: one list per column, seven entries each.
d = dict(
    Name=['Wanda', 'Eric', 'John', 'Andy', 'Deborah', 'Charles', 'James'],
    Sex=['Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male'],
    City=['Stamford', 'Stamford', 'New York', 'Philadelphia', 'Philadelphia', 'Stamford', 'New York'],
    Frequency=['Weekly', 'Daily', 'Weekly', 'Monthly', 'Daily', 'Weekly', 'Daily'],
    Item=['Burger', 'Chalupa', 'Sushi', 'Ice Cream', 'Chalupa', 'Sushi', 'Donut'],
    Spend=[15, 10, 42, 11, 23, 93, 37],
)

In [None]:
# Build the frame: each dict key becomes a column, each list its values.
df = pd.DataFrame(data=d)
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15
1,Eric,Male,Stamford,Daily,Chalupa,10
2,John,Male,New York,Weekly,Sushi,42
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11
4,Deborah,Female,Philadelphia,Daily,Chalupa,23
5,Charles,Male,Stamford,Weekly,Sushi,93
6,James,Male,New York,Daily,Donut,37


In [None]:
# .head(), .tail(), .sample()

df.sample(n=1) # one randomly chosen row; the pick changes from run to run

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11


In [None]:
# .unique()

df['Item'].unique() # array of the distinct items, in order of first appearance

array(['Burger', 'Chalupa', 'Sushi', 'Ice Cream', 'Donut'], dtype=object)

In [None]:
df['Item'].nunique() # number of unique items (missing values are not counted by default)

5

In [None]:
df.nunique() # number of distinct values in each column

Unnamed: 0,0
Name,7
Sex,2
City,3
Frequency,3
Item,5
Spend,7


In [None]:
# .value_counts()

df['Item'].value_counts() # how many times each item occurs, most frequent first

Unnamed: 0_level_0,count
Item,Unnamed: 1_level_1
Chalupa,2
Sushi,2
Burger,1
Ice Cream,1
Donut,1


In [None]:
# .apply()

def vat(x, rate=0.02):
    """Return the tax due on an amount.

    Parameters
    ----------
    x : number, or any object that supports ``*`` with a float
        Amount spent.
    rate : float, default 0.02
        Tax rate expressed as a fraction of ``x`` (0.02 means 2%).

    Returns
    -------
    The product ``x * rate``.
    """
    return x * rate

In [None]:
# Call vat() on every Spend value and store the results in a new 'Tax' column.
df['Tax'] = df['Spend'].apply(vat)
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2
2,John,Male,New York,Weekly,Sushi,42,0.84
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
6,James,Male,New York,Daily,Donut,37,0.74


In [None]:
# .sum(), .mean(), .std(), .var(), .count()

df['Spend'].count() # number of non-missing Spend values

7

In [None]:
# .columns, .index

df.columns # the column labels

Index(['Name', 'Sex', 'City', 'Frequency', 'Item', 'Spend', 'Tax'], dtype='object')

In [None]:
df.index # the row labels

RangeIndex(start=0, stop=7, step=1)

In [None]:
# .pivot_table (PT)

df # the frame that the pivot-table examples summarise

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2
2,John,Male,New York,Weekly,Sushi,42,0.84
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
6,James,Male,New York,Daily,Donut,37,0.74


In [None]:
# Sample PT - 1: total Spend per Item (each item becomes a column)

df.pivot_table(
    values='Spend',
    columns='Item',
    aggfunc='sum',
)

Item,Burger,Chalupa,Donut,Ice Cream,Sushi
Spend,15,33,37,11,135


In [None]:
# Sample PT - 2: total Spend and total Tax per Item (one row per value column)

df.pivot_table(
    values=['Spend', 'Tax'],
    columns='Item',
    aggfunc='sum',
)

Item,Burger,Chalupa,Donut,Ice Cream,Sushi
Spend,15.0,33.0,37.0,11.0,135.0
Tax,0.3,0.66,0.74,0.22,2.7


In [None]:
# Sample PT - 3: mean Spend per City

df.pivot_table(
    values=['Spend'],
    columns='City',
    aggfunc='mean',
)

City,New York,Philadelphia,Stamford
Spend,39.5,17.0,39.333333


In [None]:
# Sample PT - 4: total Spend and total Tax per Sex

df.pivot_table(
    values=['Spend', 'Tax'],
    columns='Sex',
    aggfunc='sum',
)

Sex,Female,Male
Spend,38.0,193.0
Tax,0.76,3.86


# Data Input (Pandas)

In [None]:
# Reading CSV files (pandas also has readers for JSON, SQL and other formats)

FOODS_CSV_URL = 'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/foods.csv'
df1 = pd.read_csv(FOODS_CSV_URL)
df1

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49
...,...,...,...,...,...,...
995,Donna,Female,New York,Monthly,Sushi,83.53
996,Albert,Male,Philadelphia,Daily,Sushi,72.88
997,Jean,Female,Stamford,Weekly,Donut,5.85
998,Jessica,Female,New York,Daily,Chalupa,43.19


In [None]:
df1.head(n=5) # first five rows

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


In [None]:
df1.tail(n=5) # last five rows

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
995,Donna,Female,New York,Monthly,Sushi,83.53
996,Albert,Male,Philadelphia,Daily,Sushi,72.88
997,Jean,Female,Stamford,Weekly,Donut,5.85
998,Jessica,Female,New York,Daily,Chalupa,43.19
999,Brian,Male,New York,Daily,Chalupa,28.36


In [None]:
df1.loc[:, 'Item':] # label-based: all rows, columns from 'Item' through the last one

Unnamed: 0,Item,Spend
0,Burger,15.66
1,Chalupa,10.56
2,Sushi,42.14
3,Ice Cream,11.01
4,Chalupa,23.49
...,...,...
995,Sushi,83.53
996,Sushi,72.88
997,Donut,5.85
998,Chalupa,43.19


In [None]:
# Position-based: rows 1-4 and columns 1-2 (the end of each slice is excluded).
df1.iloc[1:5, 1:3]

Unnamed: 0,Gender,City
1,Male,Stamford
2,Male,New York
3,Female,Philadelphia
4,Female,Philadelphia


# Exploratory Data Analysis - Part 1

In [None]:
pd.set_option('display.max_columns', None)  # None lifts the column limit, so every column is shown

SALARIES_CSV_URL = 'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/salaries.csv'
df_salary = pd.read_csv(SALARIES_CSV_URL, index_col='Id')  # the Id column becomes the row index
df_salary.head(n=3)

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,


In [None]:
# Check how many entries there are

df_salary.info() # row count, plus each column's non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
Index: 148654 entries, 1 to 148654
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   EmployeeName      148654 non-null  object 
 1   JobTitle          148654 non-null  object 
 2   BasePay           148045 non-null  float64
 3   OvertimePay       148650 non-null  float64
 4   OtherPay          148650 non-null  float64
 5   Benefits          112491 non-null  float64
 6   TotalPay          148654 non-null  float64
 7   TotalPayBenefits  148654 non-null  float64
 8   Year              148654 non-null  int64  
 9   Notes             0 non-null       float64
 10  Agency            148654 non-null  object 
 11  Status            0 non-null       float64
dtypes: float64(8), int64(1), object(3)
memory usage: 14.7+ MB


In [None]:
# What is the average base pay?

df_salary['BasePay'].mean() # missing BasePay values are skipped by default

66325.4488404877

In [None]:
# What is the highest amount of OvertimePay in the dataset?

df_salary['OvertimePay'].max() # largest value in the column

245131.88

In [None]:
# What is the job title of Joseph Driscoll?
# A single .loc call picks the rows (boolean mask) and the column together,
# instead of chaining two [] lookups. The name comparison is case-sensitive.

df_salary.loc[df_salary['EmployeeName'] == 'Joseph Driscoll', 'JobTitle']

Unnamed: 0_level_0,JobTitle
Id,Unnamed: 1_level_1
36199,"Captain, Fire Suppression"


In [None]:
# How much does Joseph Driscoll make including benefits?
# A single .loc call picks the rows (boolean mask) and the column together,
# instead of chaining two [] lookups.

df_salary.loc[df_salary['EmployeeName'] == 'Joseph Driscoll', 'TotalPayBenefits']

Unnamed: 0_level_0,TotalPayBenefits
Id,Unnamed: 1_level_1
36199,331834.79


In [None]:
# Who is the highest paid person including benefits?
# Rows whose TotalPayBenefits equals the column maximum (ties would all be shown),
# selected together with the two columns of interest in one .loc call.

df_salary.loc[
    df_salary['TotalPayBenefits'] == df_salary['TotalPayBenefits'].max(),
    ['EmployeeName', 'TotalPayBenefits'],
]

Unnamed: 0_level_0,EmployeeName,TotalPayBenefits
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,NATHANIEL FORD,567595.43


In [None]:
# Alternative: sort by TotalPayBenefits, largest first, and keep the top row

(
    df_salary[['EmployeeName', 'TotalPayBenefits']]
    .sort_values(by='TotalPayBenefits', ascending=False)
    .head(1)
)

Unnamed: 0_level_0,EmployeeName,TotalPayBenefits
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,NATHANIEL FORD,567595.43


In [None]:
# What was the average BasePay of all employees per year? (2011 - 2014)
# Group the rows by Year, then average only the BasePay column.

df_salary.groupby('Year')[['BasePay']].mean()

Unnamed: 0_level_0,BasePay
Year,Unnamed: 1_level_1
2011,63595.956517
2012,65436.406857
2013,69630.030216
2014,66564.421924


In [None]:
# What was the average BasePay of all employees per year? (2011 and 2013 only)
# The grouped result is indexed by Year, so query() can filter on that index name.

df_salary[['Year', 'BasePay']].groupby('Year').mean().query('Year in [2011, 2013]')

Unnamed: 0_level_0,BasePay
Year,Unnamed: 1_level_1
2011,63595.956517
2013,69630.030216


In [None]:
# How many unique job titles are there?

df_salary['JobTitle'].nunique() # exact-match count: 'ANESTHETIST' and 'Anesthetist' are two titles

2159

In [None]:
# What are the top 3 most common jobs?
# value_counts() sorts from most to least frequent, so the first three entries are the answer.

df_salary['JobTitle'].value_counts().iloc[:3]

Unnamed: 0_level_0,count
JobTitle,Unnamed: 1_level_1
Transit Operator,7036
Special Nurse,4389
Registered Nurse,3736


In [None]:
# How many job titles were represented by only one person in 2013?
# .loc selects the 2013 rows and the JobTitle column in one step; titles whose
# count equals 1 give True, and summing the booleans counts them.

(df_salary.loc[df_salary['Year'] == 2013, 'JobTitle'].value_counts() == 1).sum()

202

In [None]:
# How many people have the word 'Chief' in their job title? (Case does not matter: 'chief', 'Chief', etc.)
# Vectorised string methods replace the per-row Python lambda: lower-case the titles,
# test for the plain substring (regex=False), and treat any non-text entry as no match.

df_salary['JobTitle'].str.lower().str.contains('chief', regex=False, na=False).sum()

627

In [None]:
# How many employees have the last name 'Smith'? (case does not matter)
# Compare the last whitespace-separated word of the name, so surnames that merely
# end in "smith" (e.g. "Goldsmith") are not counted; .sum() turns the mask into a count.

df_salary['EmployeeName'].str.split().str[-1].str.lower().eq('smith').sum()

In [None]:
# Which employees have the last name 'Smith' (case does not matter) and a BasePay of >= 150000?
# The last whitespace-separated word of the name is compared, so surnames that merely
# end in "smith" (e.g. "Goldsmith") are excluded. The matching rows are displayed;
# the number of rows is the answer to "how many".

df_salary[
    df_salary['EmployeeName'].str.split().str[-1].str.lower().eq('smith')
    & (df_salary['BasePay'] >= 150000)
]

Unnamed: 0_level_0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
98,KEVIN SMITH,"BATTALION CHIEF, (FIRE DEPARTMENT)",168692.67,35402.71,22261.95,,226357.33,226357.33,2011,,San Francisco,
111,MARK SMITH,ANESTHETIST,206057.69,3431.01,10921.33,,220410.03,220410.03,2011,,San Francisco,
139,JESSE SMITH,CONFIDENTIAL CHIEF ATTORNEY II (CIVIL & CRIMINAL),213907.72,0.0,1080.96,,214988.68,214988.68,2011,,San Francisco,
1517,JON SMITH,ASSISTANT MEDICAL EXAMINER,153902.71,0.0,7019.05,,160921.76,160921.76,2011,,San Francisco,
36220,Mark Smith,Anesthetist,221084.95,3892.42,9874.61,75463.4,234851.98,310315.38,2012,,San Francisco,
36229,Kevin Smith,"Battlion Chief, Fire Suppressi",176515.52,45860.99,27969.77,53182.45,250346.28,303528.73,2012,,San Francisco,
36282,Jesse Smith,"Cfdntal Chf Atty 2,(Cvl&Crmnl)",214883.95,0.0,3008.0,65439.18,217891.95,283331.13,2012,,San Francisco,
37619,Bridget Smith,Engineer/Architect Principal,163253.6,0.0,60.0,52667.12,163313.6,215980.72,2012,,San Francisco,
38210,David Smith,Lieutenant 3,155320.05,4076.97,4954.45,39673.22,164351.47,204024.69,2012,,San Francisco,
72981,Kevin W Smith,"Battalion Chief, Fire Suppress",186236.47,47959.88,33067.83,57855.78,267264.18,325119.96,2013,,San Francisco,
