In [2]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the `lukebarousse/data_jobs` dataset with `datasets.load_dataset` and
# convert its 'train' split into a pandas DataFrame for the rest of the notebook.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas() 

# Parse the posting-date column with `pd.to_datetime` so it holds datetimes.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [6]:
# Preview only the postings that actually report a yearly salary.
df['salary_year_avg'].dropna()

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [11]:
# Keep only rows with a reported yearly salary; `.copy()` gives an independent
# frame so the columns added below never touch `df`.
df_salary = df.dropna(subset=['salary_year_avg']).copy()

def projected_salary(salary, raise_factor=1.03):
    """Return ``salary`` scaled by ``raise_factor``.

    Parameters
    ----------
    salary : number (or any object supporting ``*`` with a float)
        The current yearly salary.
    raise_factor : float, default 1.03
        Multiplier applied to ``salary``. The default keeps the original
        behaviour of this helper (a 3% increase), so existing one-argument
        calls such as ``Series.apply(projected_salary)`` are unaffected.

    Returns
    -------
    The product ``salary * raise_factor``.
    """
    return salary * raise_factor

# Approach 1: element-wise `.apply` with a named function.
df_salary['salary_year_avg_projected'] = (
    df_salary['salary_year_avg'].apply(projected_salary)
)
df_salary.loc[:, ['salary_year_avg', 'salary_year_avg_projected']]

Unnamed: 0,salary_year_avg,salary_year_avg_projected
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [12]:
# Approach 2: the same projection written inline as a lambda.
df_salary['salary_year_avg_projected'] = (
    df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)
)
df_salary.loc[:, ['salary_year_avg', 'salary_year_avg_projected']]

Unnamed: 0,salary_year_avg,salary_year_avg_projected
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [14]:
# Approach 3: vectorised arithmetic on the whole column, no `.apply` needed.
salary_projection_cols = ['salary_year_avg', 'salary_year_avg_projected']
df_salary[salary_projection_cols[1]] = 1.03 * df_salary[salary_projection_cols[0]]
df_salary[salary_projection_cols]

Unnamed: 0,salary_year_avg,salary_year_avg_projected
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [None]:
# Keep only the postings that list skills and renumber the rows from 0, so
# that label 0 refers to the first remaining posting. Built as one chained
# expression (no `inplace=True`), which makes the cell safe to re-run.
df_filtered = df.dropna(subset=['job_skills']).reset_index(drop=True)

# Only the last expression of a cell is displayed, so show the value of
# interest: the skills of the first posting. As the output shows, it is the
# *string* representation of a list rather than a real list.
df_filtered.loc[0, 'job_skills']

"['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']"

In [None]:
import ast
# Parse the string representation of the skills list into an actual Python list.
ast.literal_eval(df_filtered.loc[0, 'job_skills'])

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [32]:
def clean_list(skills_str):
    """Convert one raw ``job_skills`` cell into a Python list.

    Parameters
    ----------
    skills_str : str, list, tuple, or missing value
        Normally the string representation of a list, e.g.
        ``"['r', 'python']"``. Missing values (``NaN``/``None``) and blank
        strings are also accepted, as are values that are already a
        list/tuple.

    Returns
    -------
    list
        The parsed skills; ``[]`` for missing or blank input. For
        list/tuple input, a new list with the same items.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    # Already-parsed values must be handled before `pd.isna`: on a list,
    # `pd.isna` returns an array whose truth value is ambiguous (ValueError).
    if isinstance(skills_str, (list, tuple)):
        return list(skills_str)
    if pd.isna(skills_str):
        return []
    # `ast.literal_eval('')` raises SyntaxError; treat blank text as "no skills".
    if isinstance(skills_str, str) and not skills_str.strip():
        return []
    return ast.literal_eval(skills_str)

# Parse every posting's skills string into a list (missing values become []).
# `.apply` already returns a brand-new Series, so no extra copy is needed.
df['job_skills_cleaned'] = df['job_skills'].apply(clean_list)

# Spot-check the first row.
df.loc[0, 'job_skills_cleaned']

[]

In [36]:
# Print the raw and the parsed skills side by side for the first 5 rows.
for i in range(5):
    print(f"Row {i} skills: {df['job_skills'][i]} cleaned skills: {df['job_skills_cleaned'][i]}")

Row 0 skills: nan cleaned skills: []
Row 1 skills: ['r', 'python', 'sql', 'nosql', 'power bi', 'tableau'] cleaned skills: ['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']
Row 2 skills: ['python', 'sql', 'c#', 'azure', 'airflow', 'dax', 'docker', 'kubernetes', 'jenkins'] cleaned skills: ['python', 'sql', 'c#', 'azure', 'airflow', 'dax', 'docker', 'kubernetes', 'jenkins']
Row 3 skills: ['python', 'c++', 'java', 'matlab', 'aws', 'tensorflow', 'keras', 'pytorch'] cleaned skills: ['python', 'c++', 'java', 'matlab', 'aws', 'tensorflow', 'keras', 'pytorch']
Row 4 skills: ['bash', 'python', 'oracle', 'aws', 'ansible', 'puppet', 'jenkins', 'gitlab', 'git'] cleaned skills: ['bash', 'python', 'oracle', 'aws', 'ansible', 'puppet', 'jenkins', 'gitlab', 'git']


In [37]:
# Confirm the cleaned column holds real Python lists rather than strings.
type(df.loc[0, 'job_skills_cleaned'])

list

In [38]:
# Print the built-in help for `DataFrame.apply` to look up its signature
# (notably the `axis` argument, which the row-wise `.apply` call later in this
# notebook passes as `axis=1`).
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Callable | None | Literal['python', 'numba']" = None, engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument. The return type of the applied
    function is inferred based on the first computed result obtained after
    applying the function to a Series object.
    
    Parameters
    ----------
    func : functio

In [42]:
def projected_salary(row):
    """Return the inflated yearly salary for one posting (one DataFrame row).

    NOTE: this reuses the name ``projected_salary`` of the scalar helper
    defined earlier in the notebook and therefore shadows it from here on.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` passed by ``df.apply(axis=1)``)
        Must provide the keys ``'salary_year_avg'`` and ``'job_title_short'``.

    Returns
    -------
    float or None
        ``salary * 1.05`` when the short job title contains ``"Senior"``,
        ``salary * 1.03`` otherwise, and ``None`` when the salary is missing.
    """
    salary = row['salary_year_avg']
    if pd.isna(salary):
        return None

    title = row['job_title_short']
    # A missing title is NaN/None, and `"Senior" in <float>` raises TypeError,
    # so only run the substring test on real strings. Postings without a
    # usable title get the standard rate.
    is_senior = isinstance(title, str) and "Senior" in title
    return salary * (1.05 if is_senior else 1.03)
    
# Row-wise apply (`axis=1`): the function receives each posting as a Series,
# so it can look at the title and the salary together.
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)
df_salary.loc[:, ['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
