## Dependencies

In [None]:
# %pip install -q numpy pandas plotly nbformat matplotlib pywaffle
# !python --version > python_version.txt
# %pip freeze > requirements.txt
%pip install -q -r requirements.txt

## Imports

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from pywaffle import Waffle

In [None]:
# Read the raw dataset from the notebook's working directory and preview it.
df = pd.read_csv(filepath_or_buffer='jobs_in_data_2024.csv')
df.head(n=5)

## Data Quality

### Missing values

In [None]:
# Count the missing (NaN/None) entries in every column and show them as a table.
missing = df.isna().sum(axis=0)
pd.DataFrame(data={'Missing': missing})

### Static Values

In [None]:
# A column is "static" when every row holds the same value, i.e. it has exactly
# one distinct value and therefore carries no information.
# dropna=False counts NaN as a value of its own, so a column holding one value
# plus some gaps is not reported as static.
unique_counts = df.nunique(dropna=False)
static = unique_counts == 1
pd.DataFrame({'Is static?': static})

## Data Standardization - Salaries in USD

In [None]:
# No conversion needed: the dataset already ships a USD-standardized column.
# Show the original amount, its currency and the USD value side by side.
salary_columns = ['salary', 'salary_currency', 'salary_in_usd']
df[salary_columns].head(10)

## Exploratory Data Analysis - EDA

### Numeric features

In [None]:
# Summary statistics with pandas' default column selection
# (numeric columns when the frame has any).
df.describe()

### Categorical features

In [None]:
# Summary statistics restricted to object-dtype columns.
df.describe(include=[object])

### Box Plots

A box plot shows the distribution of a continuous variable (here, salary), either overall or across different categories or groups.

#### Distribution of salary

In [None]:
# One box over every salary in the dataset; only the outlier points are drawn
# individually, with a little horizontal jitter so they do not overlap.
salary_box = go.Box(
    y=df['salary_in_usd'],
    boxpoints='outliers',
    jitter=0.3,
)
fig = go.Figure()
fig.add_trace(salary_box)

# Titles for the figure and both axes.
layout_titles = {
    'title': 'Distribution of Salary',
    'xaxis_title': 'All Employees',
    'yaxis_title': 'Salary (USD)',
}
fig.update_layout(**layout_titles)

#### Outliers in Salary distribution

In [None]:
# Quartiles and interquartile range of the USD salaries.
salaries_usd = df['salary_in_usd']
Q1 = salaries_usd.quantile(0.25)
Q3 = salaries_usd.quantile(0.75)
IQR = Q3 - Q1

# Rows above the upper fence (Q3 + 1.5 * IQR) are the high-salary outliers;
# list them from highest to lowest salary with a fresh 0..n-1 index.
upper_fence = Q3 + 1.5 * IQR
df_top = (
    df[salaries_usd > upper_fence]
    .sort_values('salary_in_usd', ascending=False)
    .reset_index(drop=True)
)

df_top.head()

##### Waffle chart of highest paid employees

In [None]:
# Number of outlier-salary rows per country of residence
# (non-null 'salary' entries counted per group).
df_top_by_country = (
    df_top[['employee_residence', 'salary']]
    .groupby('employee_residence')
    .count()
)

# Legend entries of the form "<country> (<count>)", built from the group index
# and the per-country totals.
legend_labels = [
    f"{country} ({count})"
    for country, count in zip(df_top_by_country.index, df_top_by_country['salary'])
]

with plt.style.context('seaborn-v0_8-bright'):
    fig = plt.figure(
        FigureClass=Waffle,
        # Grid size of the waffle.
        rows=20,
        columns=30,
        # One group of squares per country.
        values=df_top_by_country['salary'],
        legend={
            'labels': legend_labels,
            'loc': 'upper right',
            'ncol': 4,
            'prop': {'size': 16},
        },
        figsize=(16, 9),
    )

#### Distribution of salary across experience level

In [None]:
# Median salary per category, drawn as one bar per category value.
# Other columns the original cell lists as candidates for `category`:
# job_title, job_category, company_size, company_location.
category = 'experience_level'

# Build the human-readable label once, outside the f-string: reusing the same
# quote character inside an f-string replacement field is a SyntaxError on
# Python versions before 3.12.
category_label = category.capitalize().replace('_', ' ')

df_by = df.loc[:, [category, 'salary_in_usd']].groupby(category).median().reset_index()

fig = go.Figure()
fig.add_trace(go.Bar(x=df_by[category], y=df_by['salary_in_usd']))
# Add labels and title
fig.update_layout(
    title=f'Distribution of Salary across {category_label}',
    xaxis_title=category_label,
    yaxis_title='Salary (USD)',
)

## Geospatial Maps

In [None]:
import json
from urllib.request import urlopen

# Remote JSON file with the world-country data used as `geojson` for the maps below.
world_geo = r'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/world_countries.json'

# Download the file and parse it; the connection is closed when the block exits.
with urlopen(world_geo) as geo_response:
    geodata = json.load(geo_response)

### Rename countries in 'employee_residence' to match the names used in 'geodata'

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    """Rename countries in 'employee_residence' to the names used for the map.

    Each value is compared as a whole (ignoring case) against a fixed table of
    known names. Whole-value matching means a name that merely contains a known
    name is left alone, and a value that was already converted is not converted
    a second time (substring replacement would turn "United States of America"
    into "United States of America of America").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'employee_residence' column. The column is rewritten on
        the frame that is passed in; pass a copy to keep the original intact.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'employee_residence' updated.
    """
    # Dataset name -> map name.
    # "Bosnia and Herzegovina" is intentionally absent: it is kept as is.
    # The generated version of this table rewrote both Bolivia and Bosnia to
    # the combined string "Bolivia, Bosnia and Herzegovina, Botswana", which
    # cannot identify a single country.
    # TODO confirm "Bolivia" against the feature names in the geodata file.
    renames = {
        "Bolivia, Plurinational State of": "Bolivia",
        "Iran, Islamic Republic of": "Iran",
        "Korea, Republic of": "South Korea",
        "Moldova, Republic of": "Moldova",
        "Serbia": "Republic of Serbia",
        "Türkiye": "Turkey",
        "United States": "United States of America",
        "Viet Nam": "Vietnam",
    }
    # Case-folded keys keep the matching case-insensitive, as it was before.
    lookup = {old.casefold(): new for old, new in renames.items()}

    def to_map_name(value):
        # Missing or non-text entries pass through unchanged.
        if not isinstance(value, str):
            return value
        return lookup.get(value.casefold(), value)

    df['employee_residence'] = df['employee_residence'].map(to_map_name)
    return df

# Work on a copy so the original frame keeps its untouched country names.
df_geo = clean_data(df=df.copy())

In [None]:
# Median USD salary per country of residence, one row per country, with the
# value column renamed to say what it holds.
df_geo_by_emp_res = (
    df_geo[['employee_residence', 'salary_in_usd']]
    .groupby('employee_residence')
    .median()
    .reset_index()
    .rename(columns={'salary_in_usd': 'median_salary_in_usd'})
)
df_geo_by_emp_res.head()

### 2024 Data Scientist World Median Salary

In [None]:
# Map coloured by each country's median salary. Rows are matched to map
# features by comparing 'employee_residence' with each feature's
# "properties.name" field.
# (Passing scope='asia' was left disabled in the original cell.)
choropleth_options = dict(
    data_frame=df_geo_by_emp_res,
    locations="employee_residence",
    color="median_salary_in_usd",
    geojson=geodata,
    featureidkey="properties.name",
    labels={'median_salary_in_usd':'Median salary (USD)'},
    title='World Median salary for Data Scientists 2024',
)
fig = px.choropleth(**choropleth_options)
fig.update_geos(projection_type="natural earth")