In [1]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import calendar

# Load the 'lukebarousse/data_jobs' dataset and turn its 'train' split into a
# DataFrame, converting the posting-date column to datetimes in the same chain.
dataset = load_dataset('lukebarousse/data_jobs')
df = (
    dataset['train']
    .to_pandas()
    .assign(job_posted_date=lambda frame: pd.to_datetime(frame['job_posted_date']))
)

In [4]:
# Project next year's salary assuming a flat 3% increment for all roles.
# Approach 1: a named function (def + return) that is handed to Series.apply.
def inflation(salary):
    """Return ``salary`` multiplied by 1.03 (a 3% increase)."""
    raise_factor = 1.03
    return salary * raise_factor
# Apply the function element-wise, then preview only the rows that have a salary.
df['projected_salary_next_year'] = df['salary_year_avg'].apply(inflation)
df.loc[
    df['salary_year_avg'].notna(),
    ['salary_year_avg', 'projected_salary_next_year'],
].head()

Unnamed: 0,salary_year_avg,projected_salary_next_year
28,109500.0,112785.0
77,140000.0,144200.0
92,120000.0,123600.0
100,228222.0,235068.66
109,89000.0,91670.0


In [7]:
# Approach 2: the same 3% projection written inline as a lambda.
df['projected_salary_next_year'] = df['salary_year_avg'].apply(
    lambda salary: salary * 1.03
)
df.loc[
    df['salary_year_avg'].notna(),
    ['salary_year_avg', 'projected_salary_next_year'],
]

Unnamed: 0,salary_year_avg,projected_salary_next_year
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [8]:
# Approach 3: a vectorized multiply over the whole column, no apply required.
df['projected_salary_next_year'] = df['salary_year_avg'].mul(1.03)
df.loc[
    df['salary_year_avg'].notna(),
    ['salary_year_avg', 'projected_salary_next_year'],
]

Unnamed: 0,salary_year_avg,projected_salary_next_year
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [10]:
# Example 2: project an inflated salary with a 5% increment for senior roles
# and a 3% increment for everything else.
def inflation_role(row):
    """Return the row's salary raised by 5% for senior roles, otherwise by 3%.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row passed by ``apply(axis=1)``)
        Must provide the keys 'job_title_short' and 'salary_year_avg'.

    Returns
    -------
    ``row['salary_year_avg']`` multiplied by 1.05 when the title contains
    "senior" in any letter case, otherwise multiplied by 1.03.
    """
    title = row['job_title_short']
    # Compare case-insensitively: a lowercase 'senior' substring test never
    # matches titles capitalised as "Senior ...". A non-string title (such as
    # NaN) is treated as non-senior instead of raising a TypeError.
    is_senior = isinstance(title, str) and 'senior' in title.lower()
    return row['salary_year_avg'] * (1.05 if is_senior else 1.03)

# Row-wise apply (axis=1) so the function receives both the title and the salary.
df['projected_salary_next_year'] = df.apply(inflation_role, axis=1)
df.loc[
    df['salary_year_avg'].notna(),
    ['job_title_short', 'salary_year_avg', 'projected_salary_next_year'],
]

Unnamed: 0,job_title_short,salary_year_avg,projected_salary_next_year
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


In [11]:
# The same role-based rule as a lambda: 5% for senior roles, 3% otherwise.
# The title is lower-cased before the substring test so capitalised titles such
# as "Senior Data Engineer" match; str() keeps a missing (NaN) title from raising.
df['projected_salary_next_year'] = df.apply(
    lambda row: row['salary_year_avg']
    * (1.05 if 'senior' in str(row['job_title_short']).lower() else 1.03),
    axis=1,
)
df[pd.notna(df['salary_year_avg'])][['job_title_short', 'salary_year_avg', 'projected_salary_next_year']]

Unnamed: 0,job_title_short,salary_year_avg,projected_salary_next_year
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


In [12]:
# Use ast.literal_eval to convert the string representation of a list into an actual list.
import ast


def parse_job_skills(raw):
    """Turn one 'job_skills' cell into a Python list.

    - A value that is already a list is returned unchanged, so re-running this
      cell after the column has been converted does not fail.
    - A missing value (None/NaN) becomes an empty list.
    - Anything else is handed to ``ast.literal_eval``.
    """
    if isinstance(raw, list):
        return raw
    if pd.isna(raw):
        return []
    return ast.literal_eval(raw)


df['job_skills'] = df['job_skills'].apply(parse_job_skills)
df['job_skills'].head()

0                                                   []
1           [r, python, sql, nosql, power bi, tableau]
2    [python, sql, c#, azure, airflow, dax, docker,...
3    [python, c++, java, matlab, aws, tensorflow, k...
4    [bash, python, oracle, aws, ansible, puppet, j...
Name: job_skills, dtype: object