In [24]:
import json
import jsonlines

In [25]:
# TSV file that is parsed with load_anserini_predictions further down.
predicted_path = '/home/ryparmar/pyserini/ctk/retrieval-nfc/predicted.dev.tsv'
# JSON Lines file holding the original dev records.
org_dev = '/mnt/data/factcheck/CTK/par5/ctk-data/dev.jsonl'
# JSON Lines file the dev records are written to once predictions are attached.
output_path = '/mnt/data/factcheck/CTK/par5/predictions/dev_anserini_k500.jsonl'

In [26]:
def load_jsonl(input_path: str) -> list:
    """Read a list of objects from a JSON Lines file.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising.

    Args:
        input_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        The decoded objects, in the order they appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() with no argument removes surrounding whitespace only.
            # The earlier rstrip('\n|\r') took its argument as a set of
            # characters, so it also removed a literal '|' from line ends.
            record = line.strip()
            if not record:
                continue
            data.append(json.loads(record))
    return data

In [27]:
def save_jsonl(data: list, output_path: str, append=False):
    """Serialize each item of ``data`` as one JSON line in ``output_path``.

    The file is opened as UTF-8. With ``append=True`` the lines are added to
    the end of an existing file; otherwise the file is overwritten. Non-ASCII
    characters are written as-is (``ensure_ascii=False``). A short summary is
    printed once all records have been written.
    """
    if append:
        mode = 'a+'
    else:
        mode = 'w'
    with open(output_path, mode, encoding='utf-8') as out_file:
        for record in data:
            out_file.write(json.dumps(record, ensure_ascii=False) + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

In [28]:
def load_anserini_predictions(path: str):
    """Load predictions from a three-column, tab-separated file.

    Every non-empty line must contain exactly three tab-separated fields.
    The first field is used as the key, the second field is collected into
    that key's list, and the third field is ignored. Empty lines (such as a
    trailing newline at the end of the file) are skipped.

    Args:
        path: Path to the UTF-8 encoded TSV file.

    Returns:
        dict mapping the first-column value (str) to the list of
        second-column values (str), in the order they appear in the file.

    Raises:
        ValueError: If a non-empty line does not have exactly three fields.
    """
    predictions = {}
    # Explicit UTF-8 keeps this reader consistent with load_jsonl/save_jsonl
    # and independent of the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as fr:
        # Iterate lazily instead of materializing the file with readlines().
        for line in fr:
            line = line.rstrip('\r\n')
            if not line:
                continue
            idx, pred_id, _ = line.split('\t')
            predictions.setdefault(idx, []).append(pred_id)
    return predictions

In [29]:
# Read the original dev records from the JSON Lines file at `org_dev`.
dev = load_jsonl(org_dev)

In [30]:
dev

[{'id': 24,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Společnost Bühler Motor Hradec Králové vlastní výrobní halu za 90 miliónů korun.',
  'evidence': [[[-1, 15, '20020322E03049_1', -1]],
   [[-1, 260, '20020322E03049_1', -1]],
   [[-1, 957, '20020322E03049_1', -1]]],
  'orig_par_id': '20020322E03049_1'},
 {'id': 70,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Andrzej Žulawski zemřel na rakovinu.',
  'evidence': [[[-1, 28, 'T201602170468301_1', -1]]],
  'orig_par_id': 'T201602170468301_2'},
 {'id': 27,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Společnost Bühler Motor otevřela výrobní halu za 90 miliónů korun v Hradci Králové.',
  'evidence': [[[-1, 37, '20020322E03049_1', -1]],
   [[-1, 243, '20020322E03049_1', -1]]],
  'orig_par_id': '20020322E03049_1'},
 {'id': 32,
  'verifiable': 'VERIFIABLE',
  'label': 'REFUTES',
  'claim': 'Christof Furtwängler nepracuje pro firmu Bühler Motor.',
  'evidence': [[[-1, 40, '20020322

In [31]:
# Build a dict from first-column id (a string) to the list of predicted ids
# read from the TSV at `predicted_path`, in file order.
predicted = load_anserini_predictions(predicted_path)

In [32]:
predicted['24']

['20020322E03049_1',
 '20010820E01855_10',
 '20001025F03296_8',
 '20001025E02353_1',
 '20020322E03049_0',
 '20001025E02353_0',
 'T200607270350401_1',
 '20001010F02102_1',
 '20001025E02353_4',
 'T200803190462701_9',
 '20011010F03425_1',
 '20010608F02683_3',
 'T201301070209701_0',
 'T202001200859401_1',
 '20011111E00670_1',
 'T201311110568501_4',
 '20020628F02139_1',
 '20020630E01442_2',
 '20001025E02353_3',
 'T201301070298501_11',
 '20020716E01943_1',
 '20031126F01589_5',
 'T201301070209701_1',
 '20021014E01540_8',
 '20030227F01628_1',
 '20010322F01062_3',
 '20021014E01320_1',
 '20001203E01132_10',
 '20021020E01784_11',
 '20020318E02551_1',
 '20001025F03029_2',
 '20001203F01143_1',
 '20010108F02652_2',
 '20010514F05336_3',
 '20021020E00984_1',
 '20051206E01460_1',
 'T201504160405901_6',
 '20000208E01878_1',
 '20021223F00327_1',
 '20010925F03545_1',
 '20020812E02704_4',
 'T201504160332201_1',
 '20000727E01583_1',
 '20000804E01138_0',
 '20020227F02823_1',
 '20011127E02197_1',
 '20020602E0

In [33]:
# Write one JSON object per line to `output_path`: each dev record with its
# 'orig_par_id' field replaced by the list of ids predicted for that record.
# The encoding is given explicitly because ensure_ascii=False emits raw
# non-ASCII characters; relying on the platform default could fail or produce
# a file that load_jsonl (which reads UTF-8) cannot decode.
with open(output_path, 'w', encoding='utf-8') as fw:
    for claim in dev:
        # Keys of `predicted` are strings, while claim['id'] is an int in the
        # loaded records, hence str(). NOTE: this assignment overwrites the
        # field on the records held in `dev` (in-place mutation).
        claim['orig_par_id'] = predicted[str(claim['id'])]
        fw.write(json.dumps(claim, ensure_ascii=False) + '\n')

# Playing with NFC / NFD normalization of unicode

In [36]:
import unicodedata as ud

In [38]:
# Load the records stored in the dev_anserini_k500.jsonl predictions file.
d = load_jsonl('/mnt/data/factcheck/fever/data_titles-cs/predictions/dev_anserini_k500.jsonl')

In [42]:
# First entry of 'predicted_pages' in the first record of `d`; the bare name
# on the last line displays it.
ans1 = d[0]['predicted_pages'][0]
ans1

'Katedra sociologie Fakulty sociálních studií Masarykovy univerzity'

In [43]:
# Load the records stored in the dev_drqa_k500.jsonl predictions file.
dd = load_jsonl('/mnt/data/factcheck/fever/data_titles-cs/predictions/dev_drqa_k500.jsonl')

In [44]:
# First entry of 'predicted_pages' in the first record of `dd`; the bare name
# on the last line displays it.
# NOTE(review): presumably inspected to compare its Unicode normalization
# form against `ans1` — confirm.
drqa1 = dd[0]['predicted_pages'][0]
drqa1

'Katedra sociologie Fakulty sociálních studií Masarykovy univerzity'