# Pandas: Apply Functions

* [pandas.DataFrame.apply](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)
* [ast.literal_eval](https://docs.python.org/3/library/ast.html#ast.literal_eval)

In [47]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Fetch the 'lukebarousse/data_jobs' dataset and turn its 'train' split into
# the pandas DataFrame used by the examples below.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Convert 'job_posted_date' with pd.to_datetime so the column holds datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

## Example

### `apply()` - Calculate the Projected Salary Next Year

Using a fictitious 3% inflation rate.

In [9]:
# Pick the salary column first, then keep only the entries that have a value
salary_year_avg = df['salary_year_avg'][df['salary_year_avg'].notna()]
salary_year_avg


28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [11]:
# Equivalent selection in a single .loc call: boolean row mask + column label
salary_year_avg = df.loc[df['salary_year_avg'].notna(), 'salary_year_avg']

salary_year_avg

28        109500.0
77        140000.0
92        120000.0
100       228222.0
109        89000.0
            ...   
785624    139216.0
785641    150000.0
785648    221875.0
785682    157500.0
785692    157500.0
Name: salary_year_avg, Length: 22003, dtype: float64

In [12]:
# Print the built-in documentation for DataFrame.apply (signature and parameters)
help(df.apply)

Help on method apply in module pandas.core.frame:

apply(func: 'AggFuncType', axis: 'Axis' = 0, raw: 'bool' = False, result_type: "Literal['expand', 'reduce', 'broadcast'] | None" = None, args=(), by_row: "Literal[False, 'compat']" = 'compat', engine: "Literal['python', 'numba']" = 'python', engine_kwargs: 'dict[str, bool] | None' = None, **kwargs) method of pandas.core.frame.DataFrame instance
    Apply a function along an axis of the DataFrame.
    
    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). By default (``result_type=None``), the final return type
    is inferred from the return type of the applied function. Otherwise,
    it depends on the `result_type` argument.
    
    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:
 

In [16]:
# Define function which calculates the projected salary
def projected_salary(salary, growth_factor=1.03):
    """Return the salary projected for next year.

    Parameters
    ----------
    salary : number
        Current yearly salary.
    growth_factor : float, default 1.03
        Multiplier applied to ``salary``. The default corresponds to the 3%
        rate used in this example; pass a different factor to model another
        rate.

    Returns
    -------
    number
        ``salary * growth_factor``.
    """
    return salary * growth_factor

# For the demo, work on an independent copy that holds only rows with a known salary
df_salary = df.loc[df['salary_year_avg'].notna()].copy()

# Run the function on every value of the yearly salary column to estimate next year's salary
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

df_salary[['salary_year_avg', 'salary_year_inflated']]


Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


### `apply()` with lambda

In [22]:
# Same projection, with the helper replaced by an inline anonymous function
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(
    lambda yearly_pay: yearly_pay * 1.03
)
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


### `apply()` with Complex Scenario

In [None]:
# Demo: parse one stored skills string into a real Python list
import ast  # literal_eval turns the text of a list literal into a list object

skills = ast.literal_eval(df.loc[1, 'job_skills'])

print(skills)

type(skills)

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']


list

In [48]:
import ast

# Function to convert the string form of a skills list into a list object
def clean_list(skill_list):
    """Parse a stringified skills list into a Python list.

    Parameters
    ----------
    skill_list : str or any other value
        A string holding a Python list literal, e.g. ``"['r', 'python']"``.

    Returns
    -------
    The parsed list when ``skill_list`` is a string. Any other value (a
    missing value, or a list that was already parsed) is returned unchanged,
    so applying the function to an already-converted column is safe.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the string is not a valid
        Python literal.
    """
    # Only strings need parsing; checking the type (rather than pd.notna)
    # avoids the array result pd.notna produces for list inputs.
    if isinstance(skill_list, str):
        return ast.literal_eval(skill_list)
    return skill_list


# Replace every entry of the skills column with the result of clean_list
df['job_skills'] = df['job_skills'].apply(clean_list)

df.loc[1, 'job_skills']

['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [None]:
# Reset the dataset because the previous example overwrote the job_skills column

# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Load 'lukebarousse/data_jobs' again and rebuild df from its 'train' split,
# replacing the DataFrame modified by the earlier cells.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Convert 'job_posted_date' with pd.to_datetime so the column holds datetime values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [51]:
# Rewritten as a lambda
# Only strings are parsed; missing values and already-parsed lists pass through
# unchanged, so re-running this cell on the converted column does not fail.
import ast
df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) else skill_list
)

## Calculate Projected salary next year

* Senior roles: assume a 5% increase
* All other roles: assume a 3% increase

In [53]:
# Baseline from the earlier example: every role gets the same 3% increase
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(
    lambda yearly_pay: yearly_pay * 1.03
)
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [54]:
# Use more than one column to solve the problem statement

# Projects next year's salary with a factor chosen from job_title_short
def projected_salary(row):
    """Return the row's 'salary_year_avg' scaled by a title-dependent factor.

    Rows whose 'job_title_short' contains "Senior" are multiplied by 1.05;
    every other row is multiplied by 1.03.
    """
    factor = 1.05 if "Senior" in row['job_title_short'] else 1.03
    return row['salary_year_avg'] * factor

# Apply row by row so the function can read every column of each record
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis='columns')

df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


In [55]:
# Same rule as an inline lambda: choose the factor from the title, then scale the salary
df_salary['salary_year_inflated'] = df_salary.apply(
    lambda job: job['salary_year_avg'] * (1.05 if "Senior" in job['job_title_short'] else 1.03),
    axis=1,
)
df_salary[['job_title_short', 'salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


# Problems

In [56]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Loading Data
# Load 'lukebarousse/data_jobs' and build df from its 'train' split so the
# problems below start from an unmodified DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup
# Convert 'job_posted_date' with pd.to_datetime; the problems below call
# datetime methods on its values.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

## Convert Date to String (2.10.1) - Problem

In [70]:
# Format each posting timestamp as a 'YYYY-MM-DD' string
df['job_posted_date_str'] = df['job_posted_date'].apply(
    lambda posted: posted.strftime("%Y-%m-%d")
)

df[['job_posted_date', 'job_posted_date_str']]

Unnamed: 0,job_posted_date,job_posted_date_str
0,2023-06-16 13:44:15,2023-06-16
1,2023-01-14 13:18:07,2023-01-14
2,2023-10-10 13:14:55,2023-10-10
3,2023-07-04 13:01:41,2023-07-04
4,2023-08-07 14:29:36,2023-08-07
...,...,...
785736,2023-03-13 06:16:16,2023-03-13
785737,2023-03-12 06:18:18,2023-03-12
785738,2023-03-12 06:32:36,2023-03-12
785739,2023-03-12 06:32:15,2023-03-12


## Days Since Posted (2.10.2) - Problem

In [77]:
from datetime import datetime

# Capture "now" once so every row is measured against the same instant
current_date = datetime.now()


# Whole days elapsed between each posting timestamp and the captured time
df['days_since_posted'] = df['job_posted_date'].apply(
    lambda posted: (current_date - posted).days
)

df[['job_posted_date', 'days_since_posted']]

Unnamed: 0,job_posted_date,days_since_posted
0,2023-06-16 13:44:15,839
1,2023-01-14 13:18:07,992
2,2023-10-10 13:14:55,723
3,2023-07-04 13:01:41,821
4,2023-08-07 14:29:36,787
...,...,...
785736,2023-03-13 06:16:16,934
785737,2023-03-12 06:18:18,935
785738,2023-03-12 06:32:36,935
785739,2023-03-12 06:32:15,935


## Salary Category (2.10.3) - Problem

In [89]:
# Keep only rows with a known salary; .copy() gives an independent frame to add a column to
df_filtered = df.dropna(subset=['salary_year_avg']).copy()

# Bucket each salary: below 60,000 -> 'Low', below 100,000 -> 'Medium', otherwise 'High'.
# No missing-value check is needed in the lambda because dropna() above already
# removed every row without a salary.
df_filtered['salary_category'] = df_filtered['salary_year_avg'].apply(
    lambda salary_year_avg: 'Low' if salary_year_avg < 60_000
    else 'Medium' if salary_year_avg < 100_000
    else 'High'
)

df_filtered[['salary_year_avg', 'salary_category']]

Unnamed: 0,salary_year_avg,salary_category
28,109500.0,High
77,140000.0,High
92,120000.0,High
100,228222.0,High
109,89000.0,Medium
...,...,...
785624,139216.0,High
785641,150000.0,High
785648,221875.0,High
785682,157500.0,High
