In [1]:
# Import Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Fetch the dataset, then convert its 'train' split into a pandas DataFrame.
dataset = load_dataset(path='lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Replace the posting-date column with its datetime-parsed version.
df['job_posted_date'] = pd.to_datetime(arg=df['job_posted_date'])

In [2]:
# Preview only the salaries that are actually present (missing values dropped)
df['salary_year_avg'].dropna()

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [None]:
# Independent copy of df restricted to the rows whose salary_year_avg is not missing
df_salary = df.dropna(subset=['salary_year_avg']).copy()

def projected_salary(salary, growth_factor=1.03): # helper handed to .apply(), which calls it once per value
    """Return ``salary`` scaled by ``growth_factor``.

    Parameters
    ----------
    salary : number
        The current yearly salary.
    growth_factor : float, default 1.03
        Multiplier applied to ``salary``. The default reproduces the
        original hard-coded 3% increase; pass e.g. 1.05 for a 5% increase.

    Returns
    -------
    number
        ``salary * growth_factor``.
    """
    return salary * growth_factor

# .apply() calls projected_salary once per salary; the results are stored
# in a new column named salary_year_inflated.
df_salary['salary_year_inflated'] = (
    df_salary['salary_year_avg']
    .apply(projected_salary)
)

# Show the original and the projected salary side by side.
df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# Rather than a named function, this time .apply() is given an anonymous (lambda) function.

df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

# Note: by default Jupyter only renders the last expression of a cell,
# so this intermediate view produces no output.
df_salary[['salary_year_avg', 'salary_year_inflated']]

# The lambda replaces the two-line function definition and keeps everything on one line.
# This is a deliberately simple example of .apply(): for plain arithmetic you don't
# actually need a named function or a lambda at all. The vectorized form below gives
# the same result and is typically faster; .apply() was only used to ease into the idea.

# Multiply df_salary's own column rather than df's: using the unfiltered df would only
# work through index alignment and would compute values for rows that are then discarded.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# The skills column currently stores each list as a string.
# As a first step, parse a single entry into a real Python list.

import ast

ast.literal_eval(df.loc[1, 'job_skills'])

# Watch out for None values: ast.literal_eval expects a string, so handing it
# None (or some other type of data) causes an error.

In [None]:
def clean_list(skill_list):
    """Convert the string form of a skills list into a real Python list.

    Parameters
    ----------
    skill_list : str, list, None or NaN
        A value such as ``"['python', 'sql']"``, an already-parsed list,
        or a missing value.

    Returns
    -------
    list or None
        The parsed list; the input itself if it is already a list;
        ``None`` for missing values.
    """
    # Already parsed (e.g. the cell was run a second time): return as-is.
    # Without this guard pd.notna() returns an array for a list and the
    # `if` below raises "truth value of an array is ambiguous".
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        return ast.literal_eval(skill_list)
    # Missing value (None/NaN): nothing to parse.
    return None

# Run clean_list over every entry of the skills column and store the result back in place.
df['job_skills'] = (
    df['job_skills']
    .apply(clean_list)
)

In [46]:
# Inspect the skills entry of the row labelled 3
df.loc[3, 'job_skills']

['python', 'c++', 'java', 'matlab', 'aws', 'tensorflow', 'keras', 'pytorch']

### Calculate Salary Next Year
- Senior roles get a 5% increase
- Other roles get a 3% increase

In [None]:
# Starting point: the same flat 3% increase for every role, via an anonymous function.
df_salary['salary_year_inflated'] = (
    df_salary['salary_year_avg']
    .apply(lambda pay: 1.03 * pay)
)

df_salary.loc[:, ['salary_year_avg', 'salary_year_inflated']]

In [49]:
def projected_salary(row, senior_factor=1.05, standard_factor=1.03):
    """Return next year's projected salary for one job posting.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'job_title_short'`` and ``'salary_year_avg'``.
    senior_factor : float, default 1.05
        Multiplier used when the title contains ``"Senior"`` (5% increase).
    standard_factor : float, default 1.03
        Multiplier used for every other role (3% increase).

    Returns
    -------
    number
        ``row['salary_year_avg']`` scaled by the applicable factor.
    """
    title = row['job_title_short']
    # A missing title (None/NaN) is not a string, and `"Senior" in <non-string>`
    # raises TypeError; such rows are treated as non-senior instead.
    is_senior = isinstance(title, str) and "Senior" in title
    factor = senior_factor if is_senior else standard_factor
    return factor * row['salary_year_avg']

# axis=1 hands projected_salary one row at a time, so it can read both the
# job title and the salary of that row.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

# Spot-check a random selection of rows. random_state is fixed so the same
# 15 rows are shown every time the cell is re-run.
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']].sample(15, random_state=42)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
698855,Senior Data Engineer,147500.0,154875.0
775094,Data Analyst,90000.0,92700.0
397032,Senior Data Analyst,135000.0,141750.0
288007,Data Scientist,90000.0,92700.0
575632,Senior Data Analyst,85000.0,89250.0
137807,Data Scientist,92433.0,95205.99
220311,Data Scientist,200000.0,206000.0
732706,Senior Data Engineer,147500.0,154875.0
264795,Senior Data Engineer,175000.0,183750.0
28918,Data Scientist,81500.0,83945.0
