# Goal: Use Ask a Manager survey to analyze different trends in the data.



In [43]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import warnings
warnings.filterwarnings("ignore")


In [44]:
#load the data
df = pd.read_csv('Ask A Manager Salary Survey 2023.csv')
print(df.columns)


Index(['Timestamp', 'How old are you?', 'Industry', 'Functional area of job',
       'Job title', 'Job title - additional context', 'Annual salary (gross)',
       'Additional monetary compensation', 'Currency', 'Currency - other',
       'Income - additional context', 'Country', 'State', 'City',
       'Remote or on-site?', 'Years of experience, overall',
       'Years of experience in field', 'Highest level of education completed',
       'Gender', 'Race'],
      dtype='object')


In [15]:
df.head(2)

Unnamed: 0,Timestamp,How old are you?,Industry,Functional area of job,Job title,Job title - additional context,Annual salary (gross),Additional monetary compensation,Currency,Currency - other,Income - additional context,Country,State,City,Remote or on-site?,"Years of experience, overall",Years of experience in field,Highest level of education completed,Gender,Race
0,4/11/2023 11:02:00,35-44,Government & Public Administration,Engineering or Manufacturing,Materials Engineer,,125000,800.0,USD,,,United States,California,Ridgecrest,On-site,11-20 years,11-20 years,College degree,Man,White
1,4/11/2023 11:02:07,25-34,"Galleries, Libraries, Archives & Museums","Galleries, Libraries, Archives & Museums",Assistant Branch Manager,,71000,0.0,USD,,,United States,Virginia,Fairfax County,On-site,8-10 years,5-7 years,Master's degree,Man,White


In [45]:
#convert to lower case to get unique records
df['job_title'] = df['Job title'].str.lower()
df['education'] = df['Highest level of education completed'].str.lower()


#standardize values in job title by removing white space/ renaming senior

df['job_title'] = df['job_title'].str.strip()

title_remapping={'sr':'senior', 'sr.': 'senior'}

df['job_title'] = df['job_title'].apply(lambda x: title_remapping.get(x,x))

# create total compenstation column 
df['total_comp'] = df['Annual salary (gross)'] + df['Additional monetary compensation']


Explore the data

In [46]:
print('How many observations?', df.shape[0])

How many observations? 16888


In [49]:
print('How many unique job titles?', df.job_title.nunique())

How many unique job titles? 8819


In [60]:
print('Top 5 most represented countries in percent:')
(df['Country'].value_counts(normalize=True)*100).to_frame().head(5)


Submissions by Top 5 most represented countries, percent:


Unnamed: 0_level_0,proportion
Country,Unnamed: 1_level_1
United States,82.828044
Canada,6.501658
United Kingdom,5.086452
Australia,1.652061
Germany,0.787541


In [78]:

print('Average salary by country & count of submissions:')
df.groupby('Country').agg({
    'Annual salary (gross)':'mean', 'Timestamp':'count'
    }).sort_values(by='Timestamp', ascending=False).round(0).head(10)


Average salary by country & count of submissions:


Unnamed: 0_level_0,Annual salary (gross),Timestamp
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,101354.0,13988
Canada,90466.0,1098
United Kingdom,54926.0,859
Australia,114679.0,279
Germany,69540.0,133
Ireland,69719.0,73
New Zealand,104053.0,64
Sweden,465980.0,31
Netherlands,70323.0,27
France,45354.0,26


In [81]:
df.groupby('Country')['Timestamp'].count().sort_values(ascending=False)

Country
United States                                                  13988
Canada                                                          1098
United Kingdom                                                   859
Australia                                                        279
Germany                                                          133
                                                               ...  
International- mainly US, Canada, Australia, India, Italy          1
International Company with remote work and travel in the US        1
Indonesia                                                          1
Iceland for a U.S. company                                         1
MENA                                                               1
Name: Timestamp, Length: 119, dtype: int64

In [33]:
df['job_title'].unique()

array(['materials engineer', 'assistant branch manager',
       'director of financial aid', ..., 'dance instructor',
       'wedding photographer', 'powertrain integration engineer'],
      dtype=object)

In [27]:
df['education'].value_counts(dropna=False)

education
college degree                                       7623
master's degree                                      5646
some college                                         1147
phd                                                  1080
professional degree (md, jd, etc.)                    882
                                                     ... 
professional quals                                      1
mba student                                             1
professional degree (mlis) and phd                      1
education specialist (ed.s.                             1
associates degree in social & behavioral sciences       1
Name: count, Length: 178, dtype: int64