In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import json
import pandas as pd
import pigeonXT as pixt

### Load the job data

Load all the job listings:

In [4]:
# Read the scraped job listings from a JSON Lines file (one JSON object per line).
# The encoding is passed explicitly so decoding does not depend on the platform
# default, and blank lines are skipped so a stray empty line (e.g. at the end of
# the file) does not raise a JSONDecodeError.
with Path('../data/job_listings.jsonl').open('r', encoding='utf-8') as f:
    job_listings = [json.loads(line) for line in f if line.strip()]
print(f'{len(job_listings):,} jobs loaded.')

58 jobs loaded.


Filter the job listings, keeping only the unannotated ones, i.e. those whose URL does not yet appear in the annotated paragraphs file:

In [7]:
# Collect the URLs of the job listings that already have annotations on disk.
# A set is used so the membership test in the filter below is constant-time.
labelled_path = Path('../data/job_listing_paragraphs.jsonl')
labelled_urls = set()

# The annotation file is only created once annotations are appended to it, so it
# does not exist on a first run; in that case no listing has been labelled yet.
if labelled_path.exists():
    with labelled_path.open('r', encoding='utf-8') as f:
        # Blank lines are skipped so they do not raise a JSONDecodeError
        labelled_urls = {json.loads(line)['url'] for line in f if line.strip()}

# Keep only the listings whose URL has not been annotated yet
job_listings = [dct for dct in job_listings if dct['url'] not in labelled_urls]
print(f'{len(job_listings):,} jobs not labelled.')

7 jobs not labelled.


In [8]:
# Build one row per job listing (unique by URL), split each listing's cleaned
# text on newlines and unfold the result into one row per paragraph.
records = list(job_listings)
df = (
    pd.DataFrame.from_records(records)
    .drop_duplicates('url')
    .assign(cleaned_text=lambda frame: frame.cleaned_text.str.split('\n'))
    .explode('cleaned_text')
    .reset_index(drop=True)
)
print(f'Extracted {len(df):,} paragraphs.')

Extracted 84 paragraphs.


### Annotate the job data

In [9]:
# Settings for the pigeonXT annotation widget: the text shown for each example
# is taken from the `cleaned_text` column, and the three label buttons are laid
# out in a single row.
annotation_settings = {
    'task_type': 'multilabel-classification',
    'options': ['title_or_tasks', 'requirements', 'bad'],
    'example_column': 'cleaned_text',
    'buttons_in_a_row': 3,
}

# Launch the interactive widget over the paragraph rows
annotations = pixt.annotate(df, **annotation_settings)

HTML(value='0 of 84 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='title_or_tasks'), ToggleButton(value=Fals…

Output()

### Store the job data

In [10]:
# Keep only the rows flagged as changed, drop the `text` and `changed` columns,
# and gather every remaining column into one list per job listing URL.
labels = (
    annotations
    .loc[lambda frame: frame['changed'] == True]
    .drop(columns=['text', 'changed'])
    .groupby('url')
    .agg(lambda column: list(column))
    .reset_index()
)
labels

Unnamed: 0,url,cleaned_text,title_or_tasks,requirements,bad
0,https://apply.recman.no/job_post.php?id=256949...,[the requested page does not exist. please che...,[False],[False],[False]
1,https://thehub.io/jobs/618a75aa216452398acafa78,"[join a fast growing music tech scale-up, wher...",[True],[True],[True]
2,https://thehub.io/jobs/62351d2d69d1e654775e6c0a,[product project playground open for planning ...,[False],[False],[False]
3,https://www.dtu.dk/english/about/job-and-caree...,[phd scholarship in cardiac vector flow ultras...,"[True, False, False, False, True, True, False,...","[False, False, False, False, False, False, Fal...","[True, False, False, False, False, False, Fals..."
4,https://www.dtu.dk/om-dtu/job-og-karriere/ledi...,[phd scholarship in diamond sensing and optica...,"[True, False, False, False, True, False, False...","[False, False, False, False, False, True, Fals...","[True, False, False, False, True, False, False..."
5,https://www.dtu.dk/om-dtu/job-og-karriere/ledi...,[phd scholarship in eco-friendly corrosion inh...,"[True, False, False, False, False, True, True,...","[False, False, False, False, False, False, Fal...","[True, False, False, False, False, False, Fals..."
6,https://www.dtu.dk/om-dtu/job-og-karriere/ledi...,"[tirsdag 15 mar 22, frist 29. marts 2022, we a...","[False, False, True, False, False, False, Fals...","[False, False, False, True, False, False, True...","[False, False, False, True, False, False, Fals..."


In [11]:
# Append the annotated job listings to the JSON Lines file, one record per line.
output_path = Path('../data/job_listing_paragraphs.jsonl')

# Look up which URLs are already stored, so that re-running this cell does not
# append the same records a second time. The file may not exist yet on a first run.
stored_urls = set()
if output_path.exists():
    with output_path.open('r', encoding='utf-8') as f:
        stored_urls = {json.loads(line)['url'] for line in f if line.strip()}

new_records = [dct for dct in labels.to_dict('records') if dct['url'] not in stored_urls]

with output_path.open('a', encoding='utf-8') as f:
    for dct in new_records:
        f.write(json.dumps(dct))
        f.write('\n')

# Report the number of records actually written, which is zero on a re-run
print(f'Stored {len(new_records):,} job listing paragraphs.')

Stored 7 job listing paragraphs.
