In [1]:
# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits made to imported
# project code are picked up without restarting the kernel.
%autoreload 2

In [2]:
from pathlib import Path
import json
import pandas as pd
import pigeonXT as pixt

### Load the job data

In [3]:
# Read the JSON Lines file: one JSON-encoded job listing per line.
# The file is streamed line by line instead of being materialised with
# readlines(), and whitespace-only lines (e.g. a trailing blank line) are
# skipped because json.loads would raise on them.
# JSON Lines is UTF-8 by specification, so the encoding is stated explicitly
# rather than depending on the platform default.
with Path('../data/job_listings.jsonl').open('r', encoding='utf-8') as f:
    job_data = [json.loads(job_listing) for job_listing in f if job_listing.strip()]
print(f'{len(job_data):,} jobs loaded.')

51 jobs loaded.


In [4]:
# Keep only the listings that do not carry a 'label' key yet.
records = [listing for listing in job_data if 'label' not in listing]

# Tabulate them and keep a single row per distinct url.
job_data_no_labels = pd.DataFrame.from_records(records)
job_data_no_labels = job_data_no_labels.drop_duplicates(subset='url')

print(f'There are {len(job_data_no_labels):,} jobs that need to be labelled.')

There are 51 jobs that need to be labelled.


### Annotate the job data

In [5]:
# Start the labelling widget over the unlabelled listings. The text shown for
# each example comes from the `cleaned_text` column and is printed with every
# newline doubled; the two available choices are 'Relevant' and 'Irrelevant'.
annotations = pixt.annotate(
    job_data_no_labels,
    example_column='cleaned_text',
    options=['Relevant', 'Irrelevant'],
    display_fn=lambda text: print(text.replace('\n', '\n\n')),
)

HTML(value='0 of 51 Examples annotated, Current Position: 0 ')

HBox(children=(Button(description='Relevant', style=ButtonStyle()), Button(description='Irrelevant', style=But…

Output()

### Store the job data

In [6]:
# Reduce the annotation results to a url-indexed series of labels, keeping
# only the rows whose `changed` flag is set.
labels = annotations.copy()
labels = labels.query('changed == True')
labels = labels.drop(columns=['text', 'cleaned_text', 'changed'])
labels = labels.set_index('url')
labels = labels['label']

In [7]:
# Persist the listings (with any new labels) back to the JSON Lines file,
# one JSON object per line.
listings_path = Path('../data/job_listings.jsonl')

# Attach a label only to listings that are still unlabelled and whose url
# appears in `labels` (the `in` test on a pandas Series checks its index).
for dct in job_data:
    if 'label' not in dct and dct['url'] in labels:
        dct['label'] = labels[dct['url']]

# Serialise everything *before* touching the file: opening it with 'w'
# truncates it immediately, so a json.dumps failure part-way through the
# loop would otherwise destroy the listings already stored on disk.
serialised = ''.join(json.dumps(dct) + '\n' for dct in job_data)

# Write to a sibling temporary file and then swap it into place, so an
# interrupted write never leaves a half-written job_listings.jsonl behind.
tmp_path = listings_path.with_name(listings_path.name + '.tmp')
with tmp_path.open('w', encoding='utf-8') as f:
    f.write(serialised)
tmp_path.replace(listings_path)
print(f'Stored {len(job_data):,} job listings.')

Stored 51 job listings.
