In [23]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2,
# so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [24]:
from pathlib import Path
import json
import pandas as pd
import pigeonXT as pixt

### Load the job data

In [25]:
# Read the JSON Lines file: one JSON object (a job listing) per line.
# The file handle is iterated lazily instead of calling readlines(), blank
# lines are skipped so that a stray empty line cannot crash json.loads, and
# the encoding is pinned to UTF-8 rather than the platform default.
with Path('../data/job_listings.jsonl').open('r', encoding='utf-8') as f:
    job_data = [json.loads(job_listing) for job_listing in f if job_listing.strip()]
print(f'{len(job_data):,} jobs loaded.')

46 jobs loaded.


In [26]:
# Select the listings that carry no 'label' key yet and build a frame from
# them, keeping only the first row seen for each URL.
records = list(filter(lambda listing: 'label' not in listing, job_data))
unlabelled_frame = pd.DataFrame.from_records(records)
job_data_no_labels = unlabelled_frame.drop_duplicates(subset='url')
print(f'There are {len(job_data_no_labels):,} jobs that need to be labelled.')

There are 46 jobs that need to be labelled.


In [28]:
# Print the URL of every listing whose cleaned text begins with 'the hub'.
# The generator is lazy, so each URL is printed as soon as it is found.
hub_urls = (listing['url'] for listing in job_data
            if listing['cleaned_text'].startswith('the hub'))
for hub_url in hub_urls:
    print(hub_url)

https://thehub.io/jobs/60e31b30160e022987f13a1b
https://thehub.io/jobs/621eabac2ec0c0f66713858c
https://thehub.io/jobs/61800ea65a845545af5fb17f
https://thehub.io/jobs/622156b62e9f0cf67f5f212f
https://thehub.io/jobs/61fbf49adf8da32676446d4f
https://thehub.io/jobs/61dc28220e604c235b0cf60d
https://thehub.io/jobs/619f7b847868de40a00444dc
https://thehub.io/jobs/62277d32542690823c3d74fd
https://thehub.io/jobs/6218beaa714ad5ad96744b15
https://thehub.io/jobs/621f356c2ec0c0f6673a665a
https://thehub.io/jobs/62066cf97d3d85764a3fbb2d
https://thehub.io/jobs/620a097ff2508b00c9d20b7a
https://thehub.io/jobs/622a0720542690823c28c363
https://thehub.io/jobs/61deaefe92ab65380d0e8098
https://thehub.io/jobs/6151878a473d4e03d5374f64
https://thehub.io/jobs/61681beaef2718314864cfd8
https://thehub.io/jobs/622be283fbc3526f42568ad9
https://thehub.io/jobs/61d401adf6427d22275e6990


### Annotate the job data

In [27]:
def _show_listing(text):
    """Print a listing's text with every newline doubled."""
    print(text.replace('\n', '\n\n'))


# Start the pigeonXT annotation widget over the unlabelled listings; the
# 'cleaned_text' column is what gets shown for each example.
annotations = pixt.annotate(
    job_data_no_labels,
    options=['Relevant', 'Irrelevant'],
    example_column='cleaned_text',
    display_fn=_show_listing,
)

HTML(value='0 of 46 Examples annotated, Current Position: 0 ')

HBox(children=(Button(description='Relevant', style=ButtonStyle()), Button(description='Irrelevant', style=But…

Output()

### Store the job data

In [30]:
# Build a Series of the labels assigned in this session, indexed by URL.
# Only rows flagged as changed are kept. query() already returns a new frame,
# so no defensive copy is needed, and selecting the 'label' column directly
# avoids dropping a hard-coded column list that would raise KeyError if one of
# those columns were absent.
labels = (annotations.query('changed == True')
                     .set_index('url')['label'])

In [11]:
# Write all listings back to the JSON Lines file, attaching the new labels.
# The output goes to a sibling temporary file first and is then moved over the
# original. Opening the original with 'w' would truncate it immediately, so
# any error part-way through the loop would destroy the dataset.
listings_path = Path('../data/job_listings.jsonl')
tmp_path = listings_path.with_name(listings_path.name + '.tmp')
with tmp_path.open('w', encoding='utf-8') as f:
    for dct in job_data:
        # `in` on a pandas Series tests its index; `labels` is expected to be
        # indexed by URL (see the cell that builds it).
        if 'label' not in dct and dct['url'] in labels:
            dct['label'] = labels[dct['url']]
        f.write(json.dumps(dct))
        f.write('\n')
# Only reached when every listing was written successfully.
tmp_path.replace(listings_path)
print(f'Stored {len(job_data):,} job listings.')

Stored 658 job listings.
