Get data

In [75]:
import requests

api_url = "https://justjoin.it/api/offers"
# Bound the wait so a stalled connection cannot hang the kernel, and raise on
# 4xx/5xx so an error page is never handed to the JSON parsing step.
r = requests.get(api_url, timeout=30)
r.raise_for_status()

In [76]:
import json
import pandas as pd

# Parse the response body into Python objects and load them into a DataFrame.
data = json.loads(r.text)
df = pd.DataFrame.from_dict(data)

# Flag read by the column-dropping cell; reset whenever the data is reloaded.
dropped = False

# Only the last expression of a cell is displayed, so the intermediate
# `type(data)` / `df.head()` expressions were dropped as they showed nothing.
len(df)

16761

Drop unimportant columns

In [77]:
# `errors="ignore"` skips labels that are already gone, so this cell can be
# re-run without relying on the `dropped` flag. Note that it also stays silent
# if the API stops returning one of these columns.
df = df.drop(
    columns=[
        "street",
        "address_text",
        "marker_icon",
        "company_url",
        "latitude",
        "longitude",
        "remote_interview",
        "open_to_hire_ukrainians",
        "company_logo_url",
        "multilocation",
        "way_of_apply",
    ],
    errors="ignore",
)

# Still set so that anything reading the flag keeps working.
dropped = True

Find all employment types so that the salaries can be unpivoted into new columns

In [78]:
def get_types_to_set(dict_list_arg, types_set: set):
    """Add the "type" value of every entry in ``dict_list_arg`` to ``types_set``.

    Each entry is converted with ``dict()`` before the lookup, so an entry
    without a "type" key contributes ``None``. The set is mutated in place
    and nothing is returned.
    """
    types_set.update(dict(entry).get("type") for entry in dict_list_arg)

In [79]:
# Gather every distinct employment-type code that occurs in any offer.
emp_types_set = set()
for offer_emp_types in df["employment_types"]:
    get_types_to_set(offer_emp_types, types_set=emp_types_set)
print(emp_types_set)

{'mandate_contract', 'permanent', 'b2b'}


We have three types of employment:
 - B2B - code 'b2b'
 - CoE - code 'permanent'
 - CoM - code 'mandate_contract'

Unnest type and salaries into new nullable columns (each employment type also gets a matching `*_currency` column):
- b2b_min_salary
- b2b_max_salary
- contract_of_employment_min_salary
- contract_of_employment_max_salary
- contract_of_mandate_min_salary
- contract_of_mandate_max_salary

In [80]:
class EmploymentType:
    """Links an output column prefix to an employment-type code from the offers.

    ``emp_type`` is the prefix used for the generated column names and
    ``emp_code`` is the value compared against an entry's "type" key.
    """

    def __init__(self, emp_type: str, emp_code: str):
        self.emp_type = emp_type
        self.emp_code = emp_code

    def get_salary_dict(self, emp_type_dict: dict) -> dict:
        """Return the min/max/currency columns for one employment-type entry.

        Args:
            emp_type_dict: an entry with optional "type" and "salary" keys, or
                ``None``. "salary" may be anything ``dict()`` accepts and is
                read for its "from", "to" and "currency" keys.

        Returns:
            A dict with the keys ``<emp_type>_min_salary``,
            ``<emp_type>_max_salary`` and ``<emp_type>_currency``. All three
            values are ``None`` when the entry is ``None``, has a different
            type code, has no salary, or has a salary that cannot be converted
            to a dict.
        """
        min_salary = None
        max_salary = None
        currency = None

        try:
            if emp_type_dict is not None and emp_type_dict.get("type") == self.emp_code:
                salary = emp_type_dict.get("salary")
                if salary is not None:
                    salary_dict = dict(salary)
                    min_salary = salary_dict.get("from")
                    max_salary = salary_dict.get("to")
                    currency = salary_dict.get("currency")
        except (TypeError, ValueError):
            # dict() raises TypeError for a non-iterable salary (e.g. a number)
            # and ValueError for a badly shaped sequence; both mean "no usable
            # salary", so the columns stay None instead of aborting the apply.
            pass

        return ({
            f"{self.emp_type}_min_salary": min_salary,
            f"{self.emp_type}_max_salary": max_salary,
            f"{self.emp_type}_currency": currency})

# One instance per employment-type code: `emp_type` is the column-name prefix,
# `emp_code` is the value matched against an entry's "type" key.
b2b_emp_type = EmploymentType(emp_type="b2b", emp_code="b2b")
coe_emp_type = EmploymentType(emp_type="contract_of_employment", emp_code="permanent")
com_emp_type = EmploymentType(emp_type="contract_of_mandate", emp_code="mandate_contract")

def unnest_salaries(emp_types_list: list) -> dict:
    """Flatten one offer's employment-type entries into a dict of salary columns.

    Args:
        emp_types_list: iterable of entries, each convertible with ``dict()``
            and carrying a "type" code plus an optional "salary". A value that
            cannot be iterated (for example ``None`` or NaN) is treated as an
            empty list.

    Returns:
        The merged output of ``get_salary_dict`` for ``b2b_emp_type``,
        ``coe_emp_type`` and ``com_emp_type``, in that order. Types that do not
        occur keep their all-``None`` columns. If a type occurs more than once,
        the last entry wins. ``None`` entries, entries that ``dict()`` cannot
        convert, and entries with an unknown type code are skipped.
    """
    known_types = (b2b_emp_type, coe_emp_type, com_emp_type)
    # Default column group per type, obtained from an entry with no salary.
    columns = [emp.get_salary_dict(dict()) for emp in known_types]

    try:
        entries = iter(emp_types_list)
    except TypeError:
        # A missing cell is not iterable; treat it as "no entries".
        entries = iter(())

    for entry in entries:
        if entry is None:
            continue
        try:
            emp_type_dict = dict(entry)
        except (TypeError, ValueError):
            # Non-iterable or badly shaped entry: ignore it, keep the rest.
            continue

        emp_type = emp_type_dict.get("type")
        for position, emp in enumerate(known_types):
            if emp_type == emp.emp_code:
                columns[position] = emp.get_salary_dict(emp_type_dict)
                break

    final_dict = {}
    for group in columns:
        final_dict.update(group)
    return final_dict

# Turn each row's employment types into one column per key of the returned dict.
df1 = df.apply(
    lambda row: unnest_salaries(row["employment_types"]),
    axis="columns",
    result_type="expand",
)
# Put the new salary columns next to the original offer columns.
final_df = pd.concat([df, df1], axis="columns")

Unnamed: 0,title,city,country_code,workplace_type,company_name,company_size,experience_level,published_at,id,display_offer,...,remote,b2b_min_salary,b2b_max_salary,b2b_currency,contract_of_employment_min_salary,contract_of_employment_max_salary,contract_of_employment_currency,contract_of_mandate_min_salary,contract_of_mandate_max_salary,contract_of_mandate_currency
0,.Net Fullstack Engineer,Gdańsk,PL,partly_remote,XtraMile,10-20,mid,2022-11-24T12:29:00.000Z,xtramile-net-fullstack-engineer-mid-senior-gdansk,True,...,False,15000.0,19000.0,pln,,,,,,
1,Senior .Net Fullstack Engineer,Gdańsk,PL,partly_remote,XtraMile,10-20,senior,2022-11-24T12:29:00.000Z,xtramile-senior-net-fullstack-engineer,True,...,False,20000.0,26000.0,pln,,,,,,
2,Kierownik Projektu Technicznego,Warszawa,PL,remote,LINK4 TU S.A.,800-900,mid,2022-11-24T12:00:51.980Z,link4-tu-s-a-kierownik-projektu-technicznego,True,...,True,,,,,,,,,
3,Senior SAP FI Consultant,Wrocław,PL,remote,Atos,110 000+,senior,2022-11-24T12:00:45.074Z,atos-senior-sap-fi-consultant,True,...,True,,,,,,,,,
4,Senior/Lead Cloud DevOps (AWS or Azure),Warszawa,PL,partly_remote,Endava,10500,senior,2022-11-24T12:00:15.139Z,endava-senior-lead-cloud-devops-aws-or-azure,True,...,False,,,,19000.0,30000.0,pln,,,


In [92]:
# The nested column has been unpacked into flat salary columns, so drop it.
# `errors="ignore"` lets the cell be re-run without a KeyError once it is gone.
final_df = final_df.drop(columns=["employment_types"], errors="ignore")
final_df.head()

Unnamed: 0,title,city,country_code,workplace_type,company_name,company_size,experience_level,published_at,id,display_offer,...,b2b_min_salary,b2b_max_salary,b2b_currency,contract_of_employment_min_salary,contract_of_employment_max_salary,contract_of_employment_currency,contract_of_mandate_min_salary,contract_of_mandate_max_salary,contract_of_mandate_currency,url
0,.Net Fullstack Engineer,Gdańsk,PL,partly_remote,XtraMile,10-20,mid,2022-11-24T12:29:00.000Z,xtramile-net-fullstack-engineer-mid-senior-gdansk,True,...,15000.0,19000.0,pln,,,,,,,https://justjoin.it/offers/xtramile-net-fullst...
1,Senior .Net Fullstack Engineer,Gdańsk,PL,partly_remote,XtraMile,10-20,senior,2022-11-24T12:29:00.000Z,xtramile-senior-net-fullstack-engineer,True,...,20000.0,26000.0,pln,,,,,,,https://justjoin.it/offers/xtramile-senior-net...
2,Kierownik Projektu Technicznego,Warszawa,PL,remote,LINK4 TU S.A.,800-900,mid,2022-11-24T12:00:51.980Z,link4-tu-s-a-kierownik-projektu-technicznego,True,...,,,,,,,,,,https://justjoin.it/offers/link4-tu-s-a-kierow...
3,Senior SAP FI Consultant,Wrocław,PL,remote,Atos,110 000+,senior,2022-11-24T12:00:45.074Z,atos-senior-sap-fi-consultant,True,...,,,,,,,,,,https://justjoin.it/offers/atos-senior-sap-fi-...
4,Senior/Lead Cloud DevOps (AWS or Azure),Warszawa,PL,partly_remote,Endava,10500,senior,2022-11-24T12:00:15.139Z,endava-senior-lead-cloud-devops-aws-or-azure,True,...,,,,19000.0,30000.0,pln,,,,https://justjoin.it/offers/endava-senior-lead-...


In [82]:
def get_salaries(x):
    """Collect the min/max salary per employment type from one offer's entries.

    Args:
        x: iterable of entries, each convertible with ``dict()`` and carrying
            a "type" code ("b2b", "permanent" or "mandate_contract") plus an
            optional "salary". A value that cannot be iterated is treated as
            empty.

    Returns:
        A 6-tuple ``(b2b_min, b2b_max, coe_min, coe_max, com_min, com_max)``.
        A position is ``None`` when that type is absent or has no salary.

    Every entry is processed and the tuple is returned once, after the loop.
    Previously the ``return`` sat inside the loop, so only the first entry was
    read and an empty input produced ``None`` instead of a tuple. Entries with
    a missing or malformed salary are now skipped rather than ending the scan.
    """
    b2b_min_salary = None
    b2b_max_salary = None
    coe_min_salary = None
    coe_max_salary = None
    com_min_salary = None
    com_max_salary = None

    try:
        entries = iter(x)
    except TypeError:
        # A missing cell is not iterable; fall through to the all-None tuple.
        entries = iter(())

    for dic in entries:
        if dic is None:
            continue
        try:
            emp_type_dict = dict(dic)
            salary = emp_type_dict.get("salary")
            if salary is None:
                continue
            salary_dict = dict(salary)
        except (TypeError, ValueError):
            # Entry or salary is not dict-like: ignore it, keep the others.
            continue

        emp_type = emp_type_dict.get("type")
        if emp_type == "b2b":
            b2b_min_salary, b2b_max_salary = get_min_max_salary(salary_dict)
        elif emp_type == "permanent":
            coe_min_salary, coe_max_salary = get_min_max_salary(salary_dict)
        elif emp_type == "mandate_contract":
            com_min_salary, com_max_salary = get_min_max_salary(salary_dict)

    return (b2b_min_salary, b2b_max_salary, coe_min_salary, coe_max_salary, com_min_salary, com_max_salary)

def get_min_max_salary(salary_dict):
    """Return ``(min, max)`` read from the "from" and "to" keys of ``salary_dict``.

    ``(None, None)`` is returned when ``salary_dict`` is ``None``. A missing
    key yields ``None`` in the corresponding position.
    """
    if salary_dict is None:
        return (None, None)
    return (salary_dict.get("from"), salary_dict.get("to"))

In [83]:
# Spread the tuple returned by get_salaries over one column per position.
df[
    [
        "b2b_min_salary",
        "b2b_max_salary",
        "coe_min_salary",
        "coe_max_salary",
        "com_min_salary",
        "com_max_salary",
    ]
] = df.apply(
    lambda row: get_salaries(row["employment_types"]),
    axis="columns",
    result_type="expand",
)

In [100]:
# Ratio of distinct ids to non-null ids; 1.0 means no offer id is repeated.
final_df["id"].nunique() / final_df["id"].count()

1.0

Add URL column

In [84]:
# Build each offer's public URL from its id.
final_df["url"] = final_df["id"].map(lambda x: f"https://justjoin.it/offers/{x}")

In [98]:
# Imported here so the export works on a fresh top-to-bottom run; `date` was
# otherwise only brought into scope by a cell further down.
from datetime import date

# One snapshot file per day, named after today's date.
final_df.to_csv(f"justjoinit_{date.today()}.csv", encoding='UTF-8')

In [96]:
from datetime import date
# Display today's date, the value interpolated into the exported CSV file name.
date.today()

datetime.date(2022, 11, 24)

In [2]:
# NOTE(review): `df_existing_offers` is not assigned in any visible cell and the
# recorded output of this cell is a NameError; define it earlier or remove the cell.
df_existing_offers

NameError: name 'df_existing_offers' is not defined