In [73]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Loading Data
# Fetch the 'coachprerakmehta/data_jobs' dataset and convert its 'train' split
# into a pandas DataFrame.
dataset = load_dataset('coachprerakmehta/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Parse the posting-date column into pandas datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [74]:
# Peek at the yearly-salary column; every row shown in the preview is NaN.
df['salary_year_avg']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
785736   NaN
785737   NaN
785738   NaN
785739   NaN
785740   NaN
Name: salary_year_avg, Length: 785741, dtype: float64

In [75]:
# Keep only the postings that report a yearly salary. The .copy() yields an
# independent frame, so columns added to df_salary later do not touch df.
df_salary = df.loc[df['salary_year_avg'].notna()].copy()

In [76]:
# Look up the signature and docstring of DataFrame.apply.
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
 

In [77]:
def projected_salary(salary, growth_factor=1.07):
    """Return a salary scaled by a growth multiplier.

    Parameters
    ----------
    salary : number (or any object supporting ``*`` with a float)
        The current salary value.
    growth_factor : float, default 1.07
        Multiplier applied to ``salary``. The default corresponds to a 7%
        increase; pass e.g. ``1.10`` for a 10% increase.

    Returns
    -------
    The result of ``salary * growth_factor``.
    """
    return salary * growth_factor

# Series.apply calls projected_salary once for each value in the column.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

# Show the projected and original salaries side by side.
df_salary[['salary_year_inflated', 'salary_year_avg']]

Unnamed: 0,salary_year_inflated,salary_year_avg
28,117165.00,109500.0
77,149800.00,140000.0
92,128400.00,120000.0
100,244197.54,228222.0
109,95230.00,89000.0
...,...,...
785624,148961.12,139216.0
785641,160500.00,150000.0
785648,237406.25,221875.0
785682,168525.00,157500.0


In [78]:
# Same calculation, written as an inline lambda instead of a named function.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.07)

# Show the projected and original salaries side by side.
df_salary[['salary_year_inflated', 'salary_year_avg']]

Unnamed: 0,salary_year_inflated,salary_year_avg
28,117165.00,109500.0
77,149800.00,140000.0
92,128400.00,120000.0
100,244197.54,228222.0
109,95230.00,89000.0
...,...,...
785624,148961.12,139216.0
785641,160500.00,150000.0
785648,237406.25,221875.0
785682,168525.00,157500.0


In [79]:
# Same calculation again, written as a multiplication on the whole column
# with no apply call; the displayed result matches the apply-based versions.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.07

# Show the projected and original salaries side by side.
df_salary[['salary_year_inflated', 'salary_year_avg']]

Unnamed: 0,salary_year_inflated,salary_year_avg
28,117165.00,109500.0
77,149800.00,140000.0
92,128400.00,120000.0
100,244197.54,228222.0
109,95230.00,89000.0
...,...,...
785624,148961.12,139216.0
785641,160500.00,150000.0
785648,237406.25,221875.0
785682,168525.00,157500.0


In [80]:
# Check how a single job_skills entry is stored; the output shows it is a str.
type(df['job_skills'][1])

str

In [81]:
# Calling list() on that string splits it into individual characters rather
# than into separate skills.
list(df['job_skills'][1])

['[',
 "'",
 'r',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'y',
 't',
 'h',
 'o',
 'n',
 "'",
 ',',
 ' ',
 "'",
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'n',
 'o',
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'o',
 'w',
 'e',
 'r',
 ' ',
 'b',
 'i',
 "'",
 ',',
 ' ',
 "'",
 't',
 'a',
 'b',
 'l',
 'e',
 'a',
 'u',
 "'",
 ']']

In [82]:
import ast

In [83]:
# ast.literal_eval parses the string as a Python literal, yielding a real list.
ast.literal_eval(df['job_skills'][1])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [84]:
def clean_list(skill_list):
    """Convert a stringified Python list of skills into a real list.

    Parameters
    ----------
    skill_list : str, list, or missing value (NaN / None)
        A value from the ``job_skills`` column.

    Returns
    -------
    list or None
        The parsed list for string input, the input itself when it is already
        a list, and ``None`` for missing values.
    """
    # Already parsed (e.g. this cell was run a second time): pass it through.
    # pd.notna() on a list returns an array, whose truth value is ambiguous
    # and would make the ``if`` below raise ValueError.
    if isinstance(skill_list, list):
        return skill_list
    if pd.notna(skill_list):
        # literal_eval parses Python literals only; it does not execute code.
        return ast.literal_eval(skill_list)
    # Missing values map to None.
    return None

# Parse every job_skills entry, overwriting the original column in df.
df['job_skills'] = df['job_skills'].apply(clean_list)

In [85]:
# Confirm the same entry is now stored as a list.
type(df['job_skills'][1])

list

In [102]:
# Parse only the entries that are still strings; values that are already
# lists, or are missing, pass through unchanged.
df['job_skills'] = df['job_skills'].apply(
    lambda entry: ast.literal_eval(entry) if isinstance(entry, str) else entry
)

In [103]:
# Check the entry type again after the lambda-based parsing step.
type(df['job_skills'][1])

list

In [None]:
# List the postings that have no job_skills value.
df[pd.isna(df['job_skills'])]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
21,Data Scientist,Stage - data scientist – génération ia de donn...,"Asnières-sur-Seine, France",via Jobijoba,Full-time,False,France,2023-02-23 13:41:21,False,False,France,,,,Credit Agricole,,
26,Data Engineer,Data Engineer,United States,via LinkedIn,Full-time,False,Georgia,2023-09-15 13:56:18,True,False,United States,,,,Infinite Computer Solutions,,
29,Data Scientist,Data Scientist,"Pune, Maharashtra, India",via Exusia,Full-time,False,India,2023-03-26 13:16:20,False,False,India,,,,Exusia,,
36,Data Analyst,Data Analyst,"Des Moines, IA",via Trabajo.org,Full-time,False,"Illinois, United States",2023-11-06 13:01:22,False,True,United States,,,,Assuredpartners,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785713,Cloud Engineer,"Data Center Chief Engineer, AWS Infrastructure...","Bangkok, Thailand",melalui LinkedIn,Pekerjaan tetap,False,Thailand,2023-03-12 06:37:47,False,False,Thailand,,,,Amazon Web Services (AWS),,
785715,Data Analyst,Amul Careers 2023 - Apply Online - Data Analys...,India,melalui Jobsleworld - Jobs In India - Job Vaca...,Pekerjaan tetap,False,India,2023-03-13 06:16:28,False,False,India,,,,Amul,,
785718,Business Analyst,Senior Performance QA Analyst,India,melalui BeBee India,Pekerjaan tetap,False,India,2023-03-13 06:16:28,False,False,India,,,,Diebold Nixdorf,,
785724,Data Engineer,Junior Consultant Data Engineering,"Koln, Jerman",melalui BeBee Deutschland,Pekerjaan tetap,False,Germany,2023-03-13 06:19:07,False,False,Germany,,,,Target Reply,,


# Calculate projected salary next year

* Senior roles: assume a 10% increase

* Other roles: assume a 7% increase

In [None]:
# Baseline: apply the same 7% increase to every posting, regardless of title.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.07)

# Compare the original and projected salaries.
df_salary[['salary_year_avg', 'salary_year_inflated']]

In [112]:
def projected_salary(row):
    """Project next year's salary for one posting based on its title.

    Note: this row-based version redefines the scalar
    ``projected_salary(salary)`` from an earlier cell.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'job_title_short'`` and ``'salary_year_avg'``.

    Returns
    -------
    ``salary_year_avg`` multiplied by 1.1 when the short title contains
    "Senior", otherwise by 1.07.
    """
    is_senior = "Senior" in row['job_title_short']
    multiplier = 1.1 if is_senior else 1.07
    return multiplier * row['salary_year_avg']

# axis=1 passes each row to projected_salary, so the function can read both
# the job title and the salary.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

# Show the title, original salary, and projected salary together.
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,117165.00
77,Data Engineer,140000.0,149800.00
92,Data Engineer,120000.0,128400.00
100,Data Scientist,228222.0,244197.54
109,Data Analyst,89000.0,95230.00
...,...,...,...
785624,Data Engineer,139216.0,148961.12
785641,Data Engineer,150000.0,160500.00
785648,Data Scientist,221875.0,237406.25
785682,Data Scientist,157500.0,168525.00
