In [1]:
# Enable IPython's autoreload extension; mode 2 is meant to pick up edits to
# imported modules without having to restart the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import json
import pandas as pd
import pigeonXT as pixt

### Load the job data

In [81]:
# Read the scraped job listings: one JSON object per line (JSON Lines).
# The encoding is pinned to UTF-8 so the text decodes the same way on every
# platform instead of depending on the locale's default encoding.
with Path('../data/job_listings.jsonl').open('r', encoding='utf-8') as f:
    # Iterate over the file lazily and skip blank lines (e.g. a trailing
    # empty line), which would otherwise make `json.loads` raise.
    job_data = [json.loads(job_listing) for job_listing in f if job_listing.strip()]
print(f'{len(job_data):,} jobs loaded.')

51 jobs loaded.


In [82]:
# Build one row per unique listing (keyed on its URL), then split each
# listing's cleaned text on newlines and fan it out to one row per paragraph.
records = list(job_data)
df = (
    pd.DataFrame.from_records(records)
    .drop_duplicates('url')
    .assign(cleaned_text=lambda listings: listings['cleaned_text'].str.split('\n'))
    .explode('cleaned_text')
    .reset_index(drop=True)
)
print(f'Extracted {len(df):,} paragraphs.')

Extracted 1,298 paragraphs.


### Annotate the job data

In [85]:
# Labels offered for every paragraph; the task is multi-label, so several
# of them can be toggled for the same paragraph.
label_options = ['title_or_tasks', 'requirements', 'bad']

# Launch the annotation widget over the `cleaned_text` column of `df`.
annotations = pixt.annotate(
    df,
    task_type='multilabel-classification',
    options=label_options,
    example_column='cleaned_text',
    buttons_in_a_row=3,
)

HTML(value='0 of 1298 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='title_or_tasks'), ToggleButton(value=Fals…

Output()

### Store the job data

In [88]:
# Keep only the rows flagged as changed in the annotation result, drop the
# helper columns, and collapse to one row per listing URL; every remaining
# column becomes a list with one entry per kept paragraph.
annotated_rows = annotations[annotations['changed'] == True]
labels = (
    annotated_rows
    .drop(columns=['text', 'changed'])
    .groupby('url')
    .agg(list)
    .reset_index()
)
labels

Unnamed: 0,url,cleaned_text,title_or_tasks,requirements,bad
0,https://apply.recman.no/job_post.php?id=260717...,[the requested page does not exist. please che...,[False],[False],[False]
1,https://boards.greenhouse.io/agcbiologics/jobs...,[scientist for analytical support in vaccine a...,"[True, False, False, False, False, False, True...","[False, False, False, False, False, False, Fal...","[True, False, False, False, False, False, True..."
2,https://boards.greenhouse.io/agcbiologics/jobs...,[laboratory technicians with experience in bio...,"[True, False, False, False, True, False, False...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."
3,https://boards.greenhouse.io/agcbiologics/jobs...,"[technician, microbial downstream, manufacturi...","[True, False, False, False, True, False, False...","[False, False, False, False, False, False, Tru...","[False, False, False, False, False, False, Fal..."
4,https://boards.greenhouse.io/agcbiologics/jobs...,"[senior msat product steward, in the newly imp...","[True, False, False, False, False, False, True...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."
5,https://boards.greenhouse.io/agcbiologics/jobs...,[laborant praktikant med start 1. september 20...,"[True, False, False, False, True, False, False...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."
6,https://candidate.hr-manager.net/ApplicationIn...,[your current state: deny. your current state:...,"[False, False, True, False, False, True, True,...","[False, False, False, False, False, False, Fal...","[False, False, True, False, False, True, True,..."
7,https://employment.ku.dk/all-vacancies?show=15...,"[postdoc position in molecular metabolism, map...","[True, False, False, False, False, False, Fals...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."
8,https://employment.ku.dk/all-vacancies?show=15...,[postdoc in adipose g protein-coupled receptor...,"[True, False, False, False, False, False, Fals...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."
9,https://employment.ku.dk/all-vacancies?show=15...,[laboratory manager for department of chemistr...,"[True, False, False, False, False, False, Fals...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal..."


In [89]:
# Persist the aggregated annotations as JSON Lines: one JSON object per row
# of `labels`. The encoding is pinned so the file is written identically on
# every platform.
with Path('../data/job_listing_paragraphs.jsonl').open('w', encoding='utf-8') as f:
    for dct in labels.to_dict('records'):
        f.write(json.dumps(dct))
        f.write('\n')
# `labels` was grouped by URL, so its length is the number of job listings,
# not the number of paragraphs; the message reports it as such.
print(f'Stored annotated paragraphs for {len(labels):,} job listings.')

Stored 51 job listing paragraphs.
