In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt  
# Loading Data
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [2]:


df[pd.notna(df['salary_year_avg'])]['salary_year_avg']

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [3]:
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
 

In [4]:
df_sal = df[pd.notna(df['salary_year_avg'])].copy()


def projected_salary(salary):
    return salary * 1.03

df_sal['salary_year_inflated'] = df_sal['salary_year_avg'].apply(lambda x: x * 1.03)

df_sal[['salary_year_avg','salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [5]:
type(df['job_skills'][2])

str

In [6]:
import ast

ast.literal_eval(df['job_skills'][2])

['python',
 'sql',
 'c#',
 'azure',
 'airflow',
 'dax',
 'docker',
 'kubernetes',
 'jenkins']

In [7]:
import ast
import pandas as pd

def clean_list(skill_list):
    if pd.notna(skill_list):
        # ast.literal_eval safely evaluates a string containing a Python literal
        return ast.literal_eval(skill_list)
    else:
        # Return an empty list or None for missing values
        return []

# Apply the function to the DataFrame column
df['job_skills'] = df['job_skills'].apply(clean_list)


In [8]:
df[pd.isna(df['job_skills'])]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills


In [9]:
df.loc[df['job_skills'].isna(), 'job_skills']

Series([], Name: job_skills, dtype: object)

In [10]:
type(df['job_skills'][2])

list

In [11]:
df['job_skills'] = df['job_skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

ValueError: The truth value of an empty array is ambiguous. Use `array.size > 0` to check that an array is not empty.

In [12]:
df['job_skills'] 


0                                                        []
1                [r, python, sql, nosql, power bi, tableau]
2         [python, sql, c#, azure, airflow, dax, docker,...
3         [python, c++, java, matlab, aws, tensorflow, k...
4         [bash, python, oracle, aws, ansible, puppet, j...
                                ...                        
785736    [bash, python, perl, linux, unix, kubernetes, ...
785737                               [sas, sas, sql, excel]
785738                                  [powerpoint, excel]
785739    [python, go, nosql, sql, mongo, shell, mysql, ...
785740                                          [aws, flow]
Name: job_skills, Length: 785741, dtype: object

# Calculate Projected Salary Next Year

* Senior roles assume 5%
* Other roles assume 3%



In [13]:
def project_salary(row):
    if 'Senior' in row['job_title_short']:
        return row['salary_year_avg'] * 1.05
    else:
        return row['salary_year_avg'] * 1.03


df_sal['salary_year_inflated']= df_sal.apply(projected_salary, axis=1)

df[['job_title_short','salary_year_avg','salary_year_inflated']]

TypeError: can't multiply sequence by non-int of type 'float'

In [16]:
# Convert the 'salary_year_avg' column to a numeric type
# The errors='coerce' argument will turn any non-numeric values into NaN
df_sal['salary_year_avg'] = pd.to_numeric(df_sal['salary_year_avg'], errors='coerce')

# Define your function
def project_salary(row):
    if 'Senior' in row['job_title_short']:
        return row['salary_year_avg'] * 1.05
    else:
        return row['salary_year_avg'] * 1.03

# Apply the function to create the new column
df_sal['salary_year_inflated'] = df_sal.apply(project_salary, axis=1)

# Display the desired columns
df_sal[['job_title_short', 'salary_year_avg', 'salary_year_inflated']].sample(10)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
364593,Business Analyst,80000.0,82400.0
389018,Senior Data Scientist,157500.0,165375.0
207949,Senior Data Engineer,162500.0,170625.0
332955,Data Scientist,157500.0,162225.0
225679,Data Scientist,132368.0,136339.04
739370,Data Engineer,84859.5,87405.285
699112,Data Engineer,121680.0,125330.4
376311,Data Engineer,125000.0,128750.0
374731,Data Engineer,154000.0,158620.0
110474,Data Scientist,154000.0,158620.0


In [18]:
df_sal['salary_year_inflated'] = df_sal.apply(lambda row: row['salary_year_avg'] * 1.05 if 'Senior' in row['job_title_short'] else row['salary_year_avg'] * 1.03, axis=1)

# Display the desired columns
df_sal[['job_title_short', 'salary_year_avg', 'salary_year_inflated']].sample(10)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
194156,Machine Learning Engineer,227500.0,234325.0
587158,Business Analyst,55000.0,56650.0
536668,Data Scientist,85000.0,87550.0
751882,Senior Data Analyst,89100.0,93555.0
255600,Senior Data Scientist,180000.0,189000.0
674703,Data Analyst,54165.0,55789.95
430965,Data Scientist,102500.0,105575.0
774264,Data Scientist,166419.5,171412.085
412715,Data Engineer,112143.5,115507.805
375045,Data Engineer,187500.0,193125.0
