In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found,
# so the exact dataset path can be copied into the read step.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read file

In [None]:
# Load the job postings, using the CSV's "Unnamed: 0" column as the row index.
data = pd.read_csv(
    '../input/data-analyst-jobs/DataAnalyst.csv',
    index_col='Unnamed: 0',
)

data.info()

In [None]:
data.head()

# Group Salary by Job Title

Rename similar job titles to the most frequent ones.

In [None]:
# Fold spelling variants into one canonical title so they are counted together.
data['Job Title'] = data['Job Title'].replace(
    to_replace={
        'Sr. Data Analyst': 'Senior Data Analyst',
        'Sr Data Analyst': 'Senior Data Analyst',
        'Sr Analyst, Data': 'Senior Data Analyst',
        'Data Analyst Junior': 'Junior Data Analyst',
    }
)
# Number of distinct titles left after merging the variants.
data['Job Title'].nunique()

Get the top 30 most common data analyst jobs:

In [None]:
data['Job Title'].value_counts()[:30]

Split up the salary estimate into minimum and maximum salary:

In [None]:
# Pull both bounds out of each estimate in a single pass. The pattern assumes strings
# shaped like "$37K-$66K (Glassdoor est.)" (TODO confirm against the raw column).
# Requiring digits on BOTH sides of the dash means a placeholder such as "-1", if
# present, becomes NaN in both columns instead of producing a bogus MaxSalary of 1.
# The raw string (r'...') avoids the invalid "\d" escape-sequence warning.
salary_bounds = data['Salary Estimate'].str.extract(r'(\d+)\D*-\D*(\d+)')
data['MinSalary'] = pd.to_numeric(salary_bounds[0])
data['MaxSalary'] = pd.to_numeric(salary_bounds[1])

data.head()

Create box plots for min and max salary by job title:

In [None]:
data['Job Title'].value_counts()[:30]

In [None]:
def plot_salary(y, salary, amt=30, title=''):
    """Draw horizontal box plots of a salary column for the most common categories.

    Reads the notebook-level ``data`` frame.

    Parameters
    ----------
    y : str
        Categorical column of ``data`` shown on the y-axis.
    salary : str
        Column of ``data`` whose distribution is shown on the x-axis.
    amt : int, default 30
        How many of the most frequent ``y`` values to keep.
    title : str, default ''
        Text passed to ``plt.title``.
    """
    # Keep only rows whose category is among the `amt` most frequent ones.
    top_categories = data[y].value_counts().index[:amt].tolist()
    subset = data.loc[data[y].isin(top_categories)]

    # Boxes are ordered from the highest to the lowest mean salary.
    mean_salary = subset.groupby(y)[salary].mean()
    box_order = mean_salary.sort_values(ascending=False).index.values

    grid = sns.catplot(
        data=subset, x=salary, y=y, kind='box',
        order=box_order,
        height=10, aspect=1,
    )

    plt.title(title)

def least_freq(group, target, min_counts=3, max_counts=11, df=None):
    """Rank moderately rare categories by their mean of ``target``.

    Parameters
    ----------
    group : str
        Column whose values define the categories.
    target : str
        Column that is averaged per category.
    min_counts, max_counts : int
        Exclusive bounds: a category is kept only if it occurs strictly more than
        ``min_counts`` and strictly fewer than ``max_counts`` times.
    df : pandas.DataFrame, optional
        Frame to analyse; defaults to the notebook-level ``data``.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows with columns ``group`` and ``target`` (the mean), sorted by the
        mean in descending order.
    """
    frame = data if df is None else df

    counts = frame[group].value_counts()
    # Each comparison is parenthesised: `&` binds tighter than `>` / `<`, so the
    # unparenthesised form silently drops the upper bound.
    in_range = counts[(counts > min_counts) & (counts < max_counts)].index

    sal = (
        frame[frame[group].isin(in_range)]
        .groupby(group)[target]
        .mean()
        .reset_index()
    )
    return sal.sort_values(by=target, ascending=False).head(10)

In [None]:
plot_salary('Job Title', 'MinSalary', amt=12, title='Minimum Salary by Job Title')

In [None]:
plot_salary('Job Title', 'MaxSalary', amt=12, title='Maximum Salary by Job Title')

For the top 12 most frequent data analyst jobs, Lead Data Analyst has the highest average minimum and maximum salary. The less frequent job titles with the highest average minimum and maximum salaries are shown below.

In [None]:
least_freq('Job Title', 'MinSalary')

In [None]:
least_freq('Job Title', 'MaxSalary')

# Group Salary by Location

In [None]:
data['Location'].nunique()

In [None]:
data['Location'].value_counts()[:30]

In [None]:
# MinSalary is the number parsed from the estimate (assumed "$37K"-style -> 37),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Location', 'MinSalary', title='Minimum salary (in $1k) by location')

In [None]:
# MaxSalary is the number parsed from the estimate (assumed "$66K"-style -> 66),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Location', 'MaxSalary', title='Maximum salary (in $1k) by location')

For the most frequent locations, San Jose, CA has the highest average minimum and maximum salary. Below are the least frequent locations and their minimum and maximum average salaries:

In [None]:
least_freq('Location', 'MinSalary')

In [None]:
least_freq('Location', 'MaxSalary')

The highest average minimum and maximum salaries are found in less frequent locations, most of which are in California.

In [None]:
# MinSalary is the number parsed from the estimate (assumed "$37K"-style -> 37),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Headquarters', 'MinSalary', title='Minimum salary (in $1k) by headquarters')

In [None]:
# MaxSalary is the number parsed from the estimate (assumed "$66K"-style -> 66),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Headquarters', 'MaxSalary', title='Maximum salary (in $1k) by headquarters')

In [None]:
least_freq('Headquarters', 'MinSalary')

In [None]:
least_freq('Headquarters', 'MaxSalary')

# Split city and country/state

Let's try splitting up location and headquarters by their city and state/country:

In [None]:
# Cut each location at its commas: piece 0 is kept as the city and piece 1
# (whitespace-stripped) as the state/country.
split = data['Location'].str.split(pat=',', expand=True)
data = data.assign(
    City=split[0],
    Country_State=split[1].str.strip(),
)

data.head()

In [None]:
# Same comma split for the headquarters column: piece 0 is the city and piece 1
# (whitespace-stripped) the state/country.
split = data['Headquarters'].str.split(pat=',', expand=True)
data = data.assign(
    CityHQ=split[0],
    Country_State_HQ=split[1].str.strip(),
)

data.head()

Arapahoe is a county in Colorado, so let's replace it with the state:

In [None]:
data['Country_State'] = data['Country_State'].replace({'Arapahoe': 'CO'})
data['Country_State'].nunique()

In [None]:
data['Country_State'].value_counts()

In [None]:
data['Country_State_HQ'].nunique()

In [None]:
data['Country_State_HQ'].value_counts()[:30]

In [None]:
# MinSalary is the number parsed from the estimate (assumed "$37K"-style -> 37),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Country_State', 'MinSalary', title='Minimum salary (in $1k) by country/state')

In [None]:
# MaxSalary is the number parsed from the estimate (assumed "$66K"-style -> 66),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Country_State', 'MaxSalary', title='Maximum salary (in $1k) by country/state')

Illinois has the highest average minimum salary, and California has the highest average maximum salary.

**Top 30 average minimum and maximum salaries by headquarters**

In [None]:
# MinSalary is the number parsed from the estimate (assumed "$37K"-style -> 37),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Country_State_HQ', 'MinSalary', title='Minimum salary (in $1k) by headquarters')

In [None]:
# MaxSalary is the number parsed from the estimate (assumed "$66K"-style -> 66),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Country_State_HQ', 'MaxSalary', title='Maximum salary (in $1k) by headquarters')

In [None]:
least_freq('Country_State_HQ', 'MinSalary', min_counts=0, max_counts=7)

In [None]:
least_freq('Country_State_HQ', 'MaxSalary', min_counts=0, max_counts=7)

# Group Salary by Sector

In [None]:
data['Sector'].nunique()

In [None]:
data['Sector'].value_counts()

In [None]:
# MinSalary is the number parsed from the estimate (assumed "$37K"-style -> 37),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Sector', 'MinSalary', title='Minimum salary (in $1k) by sector')

In [None]:
# MaxSalary is the number parsed from the estimate (assumed "$66K"-style -> 66),
# so the axis is in thousands of dollars, not tens of thousands.
plot_salary('Sector', 'MaxSalary', title='Maximum salary (in $1k) by sector')

The Biotech and Pharmaceuticals sector has the highest average minimum and maximum salaries.

# Salary by Job Title and Sector

In [None]:
def by_multiple(group, target, min_counts=11, df=None):
    """Rank combinations of several columns by mean salary, for frequent categories only.

    Parameters
    ----------
    group : list of str
        Columns to group by. Only ``group[0]`` is used for the frequency filter.
    target : list of str
        ``target[0]`` is the column that is averaged. The whole list is used as the
        sort key, so any further entries must also appear in ``group``.
    min_counts : int, default 11
        A ``group[0]`` value is kept only if it occurs strictly more often than this.
    df : pandas.DataFrame, optional
        Frame to analyse; defaults to the notebook-level ``data``.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows holding the ``group`` columns plus the mean of ``target[0]``,
        sorted by ``target`` in descending order.
    """
    frame = data if df is None else df
    primary = group[0]

    # Frequency filter on the first grouping column only.
    counts = frame[primary].value_counts()
    frequent = counts[counts > min_counts].index

    averages = (
        frame[frame[primary].isin(frequent)]
        .groupby(group)[target[0]]
        .mean()
        .reset_index()
    )
    return averages.sort_values(by=target, ascending=False).head(10)

In [None]:
by_multiple(['Job Title', 'Sector'], ['MinSalary'])

In [None]:
by_multiple(['Sector', 'Job Title'], ['MinSalary'])

In [None]:
by_multiple(['Job Title', 'Sector'], ['MaxSalary'])

In [None]:
by_multiple(['Sector', 'Job Title'], ['MaxSalary'])

In [None]:
# Minimum salary per sector for the 5 most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MinSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[:5],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Maximum salary per sector for the 5 most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MaxSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[:5],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

## Salaries of less common job titles (6th to 20th most frequent) by sector

In [None]:
# Minimum salary per sector for the 6th-10th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MinSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[5:10],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Maximum salary per sector for the 6th-10th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MaxSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[5:10],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Minimum salary per sector for the 11th-15th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MinSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[10:15],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Maximum salary per sector for the 11th-15th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MaxSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[10:15],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Minimum salary per sector for the 16th-20th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MinSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[15:20],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

In [None]:
# Maximum salary per sector for the 16th-20th most common job titles.
chart = sns.catplot(
    data=data,
    kind='bar',
    x='Job Title',
    y='MaxSalary',
    hue='Sector',
    order=data['Job Title'].value_counts().index[15:20],
    height=10,
    aspect=2,
)

chart.set_xticklabels(rotation=45)

# Highest salary jobs by location and rating

These queries prioritize the highest rating, then the highest average min/max salary. They also show other important columns such as the company name, job title, sector, and the country/state where the job is located.

In [None]:
# Average minimum salary per company / title / rating / sector / state combination.
min_sal = (
    data
    .groupby(['Company Name', 'Job Title', 'Rating', 'Sector', 'Country_State'])['MinSalary']
    .mean()
    .reset_index()
)
# Rating is the primary sort key; MinSalary orders rows within the same rating.
min_sal.sort_values(by=['Rating', 'MinSalary'], ascending=False).head(10)

In [None]:
# Average maximum salary per company / title / rating / sector / state combination.
# Stored as `max_sal` (not `min_sal`) so the name matches the column it holds.
max_sal = (
    data
    .groupby(['Company Name', 'Job Title', 'Rating', 'Sector', 'Country_State'])['MaxSalary']
    .mean()
    .reset_index()
)
# Rating is the primary sort key; MaxSalary orders rows within the same rating.
max_sal.sort_values(by=['Rating', 'MaxSalary'], ascending=False).head(10)

These queries keep only the **most frequent job titles**, then rank by the average min/max salary, then by rating.

In [None]:
by_multiple(['Job Title', 'Company Name', 'Rating', 'Sector', 'Country_State'], ['MinSalary', 'Rating'])

In [None]:
by_multiple(['Job Title', 'Company Name', 'Rating', 'Sector', 'Country_State'], ['MaxSalary', 'Rating'])

## For Junior Data Analyst

If you're starting out as a Junior Data Analyst, the companies with the highest ratings and salaries are:

In [None]:
# Junior Data Analyst postings only: average minimum salary per company / rating / sector / state.
min_sal = (
    data.loc[data['Job Title'].eq('Junior Data Analyst')]
    .groupby(['Company Name', 'Rating', 'Sector', 'Country_State'])['MinSalary']
    .mean()
    .reset_index()
)
# Rating is the primary sort key; MinSalary orders rows within the same rating.
min_sal.sort_values(by=['Rating', 'MinSalary'], ascending=False).head(20)

In [None]:
# Junior Data Analyst postings only: average maximum salary per company / rating / sector / state.
max_sal = (
    data.loc[data['Job Title'].eq('Junior Data Analyst')]
    .groupby(['Company Name', 'Rating', 'Sector', 'Country_State'])['MaxSalary']
    .mean()
    .reset_index()
)
# Rating is the primary sort key; MaxSalary orders rows within the same rating.
max_sal.sort_values(by=['Rating', 'MaxSalary'], ascending=False).head(20)

Staffigo Technical Services has 14 different locations in the US for a Junior Data Analyst in the Information Technology sector.

### Why is the 'Arts, Entertainment & Recreation' sector a high-paying industry?

PeopleFun is a mobile game development company. Roar Digital and FanDuel are about online fantasy sports with betting.

In [None]:
# Arts, Entertainment & Recreation postings only: average minimum salary per
# company / rating / sector / state.
min_sal = (
    data.loc[data['Sector'].eq('Arts, Entertainment & Recreation')]
    .groupby(['Company Name', 'Rating', 'Sector', 'Country_State'])['MinSalary']
    .mean()
    .reset_index()
)

# Rating is the primary sort key; MinSalary orders rows within the same rating.
min_sal.sort_values(by=['Rating', 'MinSalary'], ascending=False).head(20)

In [None]:
# Arts, Entertainment & Recreation postings only: average maximum salary per
# company / rating / sector / state.
max_sal = (
    data.loc[data['Sector'].eq('Arts, Entertainment & Recreation')]
    .groupby(['Company Name', 'Rating', 'Sector', 'Country_State'])['MaxSalary']
    .mean()
    .reset_index()
)

# Rating is the primary sort key; MaxSalary orders rows within the same rating.
max_sal.sort_values(by=['Rating', 'MaxSalary'], ascending=False).head(20)