In [4]:
from tqdm import tqdm
from datetime import datetime
import sys, os
import logging

import torch 
import transformers

import faiss
import jsonlines

In [5]:
if '/home/ryparmar/experimental-martin/pretraining/src/' not in sys.path:
    sys.path.append('/home/ryparmar/experimental-martin/pretraining/src')

import io_util, eval

In [6]:
def load_claims(split: str, path):
    """
    split = ["train", "dev", "test"]
    :return: (claims, evidence, labels)
    """
    if split in ['train', 'dev', 'test']:
        claims_data = io_util.load_jsonl(path)
        db_cols = ['id', 'verifiable', 'label', 'claim', 'evidence', 'claim_en']

        data = {}
        for c in claims_data:
            for col in db_cols:
                if col in data:
                    data[col].append(c.get(col, float('nan')))
                else:
                    data[col] = [c.get(col, float('nan'))]

        assert len(data['claim']) == len(data['evidence']) == len(data['label'])
        print(f"Loaded {len(data['claim'])} claims from {split} split.")
    return data['claim'], data['evidence'], data['label']

In [4]:
### FEVER
# dev_path = '/mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl'
# dev_queris_path = "/mnt/data/factcheck/fever/data-cs/fever/dev_queries.tsv"
# collection_path = "/mnt/data/factcheck/fever/data-cs/fever/ "
# ranking_path = "/home/ryparmar/trained_models/colbert-fever2/fever/retrieve.py/2021-03-20_11.36.39/ranking.tsv"

### CTK
dev_path = '/mnt/data/factcheck/CTK/par5/'
dev_jsonl = os.path.join(dev_path, 'ctk-data', 'dev.jsonl')
dev_queries_path = os.path.join(dev_path, 'ctk-data', 'dev_queries.tsv')
test_path = '/mnt/data/factcheck/CTK/par5/'
test_jsonl = os.path.join(dev_path, 'ctk-data', 'test.jsonl')
test_queries_path = os.path.join(dev_path, 'ctk-data', 'test_queries.tsv')
collection_path = os.path.join(dev_path, 'interim', 'collection_filtered.tsv')
idx2title_path = os.path.join(dev_path, 'interim', 'old-id2new-id.tsv')

ranking_path = "/home/ryparmar/trained_models/colbert/ctk-fever/rerank.py/test/ranking.tsv"

In [48]:
claims, evidence, labels = load_claims('dev', dev_jsonl)

Loaded 161 claims from dev split.


In [12]:
!head -n 1 {dev_jsonl}

{"id": 24, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Společnost Bühler Motor Hradec Králové vlastní výrobní halu za 90 miliónů korun.", "evidence": [[[-1, 15, "20020322E03049_1", -1]], [[-1, 260, "20020322E03049_1", -1]], [[-1, 957, "20020322E03049_1", -1]]], "orig_par_id": "20020322E03049_1"}


In [13]:
!head -n 2 {dev_queries_path}

0	Společnost Bühler Motor Hradec Králové vlastní výrobní halu za 90 miliónů korun.
1	Andrzej Žulawski zemřel na rakovinu.


In [14]:
!head -n 5 {collection_path}

0	Dokazování v kauze korupce na finančním úřadu se blíží ke konci
1	Zlín 15. srpna ( ČTK ) - Krajský soud ve Zlíně dokončuje dokazování v korupční kauze kolem finančního úřadu v Kroměříži . Podle obžaloby braly tři úřednice úplatky za ovlivňování daňového řízení . V případu je stíhána i zastupitelka Kroměříže za KSČM Kamila Dudová , která podle obžaloby s korupcí pomáhala , a podnikatel Milan Macourek , v jehož prospěch byla daň krácena . Úřednice i Dudová vinu odmítají , Macourek se přiznal .
2	Dudová podle obžaloby pomáhala s daňovým řízením obchodníkovi s trvanlivými potravinami Macourkovi . Jeho společnosti DUHAN byla podle státní zástupkyně neoprávněně vyplacena daň z přidané hodnoty 22,5 milionu korun a o vyplacení dalších 5,8 milionu se pokusil . Podle obžaloby se na tom podílely pracovnice finančního úřadu Lenka Burianová , Jarmila Červinková a Alena Olšinová , které ale tvrdí , že samy odhalily Macourkův podvod .
3	Dnes u soudu vypovídali svědkové obhajoby , mimo jiné znalc

In [15]:
!head -n 5 {idx2title_path}

T201608150566901_0	0
T201608150566901_1	1
T201608150566901_2	2
T201608150566901_3	3
T201608150566901_4	4


In [16]:
# !ls /mnt/data/factcheck/fever/data-cs/fever/ 

In [17]:
def read_tsv_col(path):
    with open(path, 'r') as fr:
        ids, data = [], []
        for l in tqdm(fr.readlines()):
            tmp = l.split('\t')
            if len(tmp) > 1:
                ids.append(tmp[0].strip())
                data.append(tmp[1].strip())
            else:
                print(f"Problem -- only one value in {tmp}?")
    return ids, data

def read_tsv_ranking(path):
    with open(path, 'r') as fr:
        data = []
        for l in tqdm(fr.readlines()):
            tmp = l.split('\t')
            query, doc = int(tmp[0].strip()), int(tmp[1].strip())
            if len(data) == query:
                data.append([doc])
            else:
                data[query].append(doc)
    return data

In [18]:
_, dev_queries = read_tsv_col(dev_queries_path)
collection_ids, collection = read_tsv_col(collection_path)

100%|██████████| 161/161 [00:00<00:00, 86808.45it/s]
100%|██████████| 13619573/13619573 [00:28<00:00, 481261.94it/s]


In [19]:
dev_queries[:5]

['Společnost Bühler Motor Hradec Králové vlastní výrobní halu za 90 miliónů korun.',
 'Andrzej Žulawski zemřel na rakovinu.',
 'Společnost Bühler Motor otevřela výrobní halu za 90 miliónů korun v Hradci Králové.',
 'Christof Furtwängler nepracuje pro firmu Bühler Motor.',
 'Christof Furtwängler je jednatelem firmy.']

In [20]:
collection_ids[:5]

['0', '1', '2', '3', '4']

In [21]:
predicted = read_tsv_ranking(ranking_path)

100%|██████████| 161000/161000 [00:00<00:00, 643558.10it/s]


In [25]:
predicted[1][:5]

[3904887, 566765, 11280881, 3904886, 566764]

In [23]:
len(collection_ids)

13619573

In [26]:
with open(idx2title_path) as fr:
    idx2title = [l.split('\t')[0].strip() for l in fr.readlines()]

In [27]:
idx2title[10584065]

'T201401160579001_0'

In [28]:
len(idx2title)

13619573

In [81]:
idx = 1
print(f"query: {dev_queries[idx]}")
for i, ii in enumerate(predicted[idx][:10]):
    print(i, '\n', collection[ii])

query: Andrzej Žulawski zemřel na rakovinu.
0 
 PRAHA 24. června ( ČTK ) - Předsedou České strany národně sociální ( ČSNS ) bude nadále bývalý ředitel ekonomické kontrarozvědky Jan Šula , který stojí v jejím čele již od října 1998\ . Opětovně jej dnes večer do této funkce zvolilo 88 ze 112 delegátů dvoudenního 30. celostátního sjezdu ČSNS v Praze .
1 
 Pětačtyřicetiletý Šula , který pochází z rodiny pronásledované komunistickým režimem , je ženatý a má dvě dcery . V roce 1991 stál u zrodu policejní Služby na ochranu ekonomických zájmů a dva roky zastával funkci ředitele ekonomické kontrarozvědky . Toto místo opustil oficiálně z osobních důvodů . Podle informací některých sdělovacích prostředků však od policie odešel kvůli problematickému policejnímu zásahu v pražské restauraci u Holubů , údajně pro neshody s tehdejším ministrem vnitra Janem Rumlem . Před vstupem do ČSNS byl také členem KDU-ČSL a ODA .
2 
 Podle bývalého polského prezidenta a šéfa hnutí Solidarita Lecha Walesy byl Havel

In [82]:
### CTK
predicted_ev = [[idx2title[int(i)] for i in q] for q in predicted]
# FEVER
# predicted_ev = [[collection_ids[i] for i in q] for q in predicted]

In [83]:
predicted_ev[0][:10]

['T201401160579001_0',
 'T201402120559701_1',
 '20020820E01165_4',
 '20050527F00773_5',
 '20020815E04359_5',
 'T200912100852402_3',
 'T201205240484301_0',
 'T201410290396001_2',
 '20031117E03556_6',
 'T201203300776501_2']

In [84]:
assert len(predicted_ev) == len(dev_queries)

In [88]:
claim[0]

NameError: name 'claim' is not defined

In [90]:
claims[1]

'Andrzej Žulawski zemřel na rakovinu.'

In [97]:
evidence[1][0][0][2]

'T201602170468301_1'

In [104]:
predicted_ev[1]

['20000624E01269_1',
 '20000624E01269_4',
 'T201112180542602_6',
 'T201509240466901_3',
 'T200803270471802_0',
 'T201101310427201_5',
 '20040318F02077_5',
 'T201112290339901_1',
 '20050527F00745_13',
 '20030530F01057_3',
 'T200911180269101_1',
 'T200910120361901_4',
 'T201512140445401_1',
 'T200910120361901_8',
 '20031205F00712_4',
 '20050919F02693_3',
 '20031205F00712_10',
 '20050919F02693_7',
 'T201012060739301_10',
 'T201309170633201_0',
 'T201801020180301_3',
 '20010130F06972_4',
 '20000906E00774_4',
 'T201406240851202_7',
 'T202011240194701_5',
 'T201406240851202_10',
 '20010620F00351_1',
 'T200612010634101_5',
 '20030321F01181_6',
 '20050415F00318_5',
 'T201007080314101_5',
 '20000430E00684_3',
 '20010130F06532_0',
 '20010130F06532_1',
 '20031205F00630_3',
 'T201311210274001_4',
 'T201005250989801_1',
 'T201005250989801_3',
 'T201810130511001_3',
 '20030507E02156_4',
 '20000702E01236_2',
 '20000702E01236_3',
 '20000702E01236_1',
 '20031227E01019_4',
 '20050408F01546_6',
 'T201307

In [116]:
len(evidence) == len(claims)

True

In [120]:
evidence[idx][0][0][2] in predicted_ev[idx]

True

In [119]:
s = 0
for idx, _ in enumerate(claims):
    s += int(evidence[idx][0][0][2] in predicted_ev[idx])

print(s)

144


In [118]:
s = 0
for idx, _ in enumerate(claims):
    s += int(evidence[idx][0][0][2].split('_')[0] in [i.split('_')[0].strip() for i in predicted_ev[idx]])

print(s)

148


# Evaluate

In [85]:
macro_precision, macro_precision_hits = 0, 0
macro_recall, macro_recall_hits = 0, 0
macro_mrr, macro_mrr_hits = 0, 0
k = 500

for i, top_k_idxs in tqdm(enumerate(predicted_ev), desc='Calculating evaluation metrics'):
    predicted_evidence = top_k_idxs[:k]
    
    try:
        macro_prec = eval.evidence_macro_precision(evidence[i], labels[i], predicted_evidence)
        macro_precision += macro_prec[0]
        macro_precision_hits += macro_prec[1]

        macro_rec = eval.evidence_macro_recall(evidence[i], labels[i], predicted_evidence)
        macro_recall += macro_rec[0]
        macro_recall_hits += macro_rec[1]

        macro_rr = eval.evidence_macro_mrr(evidence[i], labels[i], predicted_evidence)
        macro_mrr += macro_rr[0]
        macro_mrr_hits += macro_rr[1]
    except:
        print("ERROR on ", i)

Calculating evaluation metrics: 161it [00:00, 1055.26it/s]


In [86]:
pr = (macro_precision / macro_precision_hits) if macro_precision_hits > 0 else 1.0
rec = (macro_recall / macro_recall_hits) if macro_recall_hits > 0 else 0.0
mrr = (macro_mrr / macro_mrr_hits) if macro_mrr_hits > 0 else 0.0
f1 = 2.0 * pr * rec / (pr + rec + 1e-6)
print(f"F1: {f1*100}\nRecall@{k}: {rec*100}\nPrecision@{k}: {pr*100}\nMRR@{k}: {mrr*100}")

F1: 0.0
Recall@500: 0.0
Precision@500: 0.0
MRR@500: 0.0


# Prepare for doc-retr Honza evaluation

## Check that train, dev and collection (fever.db) uses NFC / NFD consistently

## data-cs

In [39]:
!head -n 1 /mnt/data/factcheck/fever/data-cs/predictions/colbert_k500.jsonl

{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americky\u0301 hudebni\u0301k narozeny\u0301 v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913.", "predicted_pages": ["Sammy Cahn", "Danny Kaye", "Steve Khan", "Sammy Davis mlads\u030ci\u0301", "Al Cohn", "Tony Curtis", "Al Kooper", "Shane West", "Jerome Kern", "Jimmy Van Heusen", "Woody Allen", "Sammy Nestico", "Sammy Price", "Danny Elfman", "Billy Wilder", "Robert Schwartzman", "Sammy Voma\u0301c\u030cka", "Al Foster", "Gene Wilder", "Ian Underwood", "Joe Jonas", "Tony Kaye", "Ozzie Cadena", "Steve Grossman (saxofonista)", "Jason Schwartzman", "Paul Hampton", "Weird Al Yankovic", "Frank Sinatra", "Danny Fields", "Billie Joe Armstrong", "Stan Getz", "Frank Sinatra Jr.", "Eddie Kaye", "Joe Bidewell", "Eddie Gladden", "Scott Caan", "Ronnie James Dio", "Don S. Davis", "Duane Eddy", "Stephen Sondheim", "Elia Ka

In [40]:
!head -n 1 /mnt/data/factcheck/fever/data-cs/fever/fever.tsv

0	Vietnamská kuchyně je důležitou částí vietnamské kultury . Jídlo představuje pro Vietnamce možnost strávit čas společně s rodinou a je hlavním prvkem oslav a svátků . Hlavní ingredience využívané ve vietnamské kuchyni v sobě odrážejí polohu země a místní klima . Rýže , která se pěstuje na vodních polích po celé zemi , tvoří součást každodenních jídel , a navíc se z ní vyrábějí různé druhy nudlí nebo koláčů . Kromě množství buddhistických vegetariánských pokrmů jsou vietnamská jídla kombinací rozmanitých druhů zeleniny , bylin a masa . Ingredience se připravují na mnoho způsobů , jako je vaření , dušení nebo smažení . Cílem kuchařů je zachování co nejčerstvější a přírodní chuti jídel . Vietnamská kuchyně je často považována za jednu z nejzdravějších na světě . Vietnamská kuchyně se řídí filozofií pěti elementů ( `` ngũ hành '' ) v pěti základních chutích : ostrá ( železo

In [44]:
!head -n 1 /mnt/data/factcheck/fever/data-cs/fever-data/train.tsv

Nelson Mandela zavedl bezplatnou zdravotní péči.	Nelson Rolihlahla Mandela ( 18. července 1918 – 5. prosince 2013 ) byl jihoafrický bojovník proti apartheidu , politik a prezident Jihoafrické republiky v letech 1994 až 1999 . Byl prvním černošským prezidentem zvoleným v prvních svobodných volbách v historii Jihoafrické republiky . Jeho vláda se soustředila na likvidaci dědictví apartheidu , zaměřovala se na institucionalizovaný rasismus , chudobu a sociální nerovnost , a zprostředkovala mezirasové usmíření v zemi . Politicky se hlásil k africkému nacionalismu a demokratickému socialismu . V letech 1991 až 1997 byl prezidentem Afrického národního kongresu . V mezinárodní politice byl generálním tajemníkem Hnutí nezúčastněných zemí v letech 1998 až 1999 . Narodil se jako příslušník královské rodiny národa Xhosů . Vystudoval právo na univerzitách v Alice a Johannesburgu . Po studiích se přidal k Africkému národnímu kongresu 

In [46]:
!head -n 1 /mnt/data/factcheck/fever/data-cs/fever/dev_queries.tsv

0	Sociologie je studium vývoje politiky.


## data_titles-cs

In [51]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/predictions/colbert_k500.jsonl

{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americky\u0301 hudebni\u0301k narozeny\u0301 v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913.", "predicted_pages": ["Sammy Cahn", "Steve Khan", "Danny Kaye", "Sammy Davis mlads\u030ci\u0301", "Billy Wilder", "Shane West", "Tony Curtis", "Gene Wilder", "Jimmy Van Heusen", "Al Cohn", "Jason Schwartzman", "Ozzie Cadena", "Robert Schwartzman", "Jerome Kern", "Sammy Voma\u0301c\u030cka", "Ian Underwood", "Al Foster", "Sammy Price", "Woody Allen", "Al Kooper", "Sammy", "Danny Elfman", "Joe Jonas", "Scott Caan", "Stephen Sondheim", "Sammy Nestico", "Don S. Davis", "Steve Grossman (saxofonista)", "Tony Kaye", "Elia Kazan", "Eddie Kaye", "Frank Sinatra", "Harry Dean Stanton", "Josh Schwartz (hudebni\u0301k)", "Weird Al Yankovic", "Clive Davis", "Billy Bob Thornton", "Duane Eddy", "Ian Somerhalder", "Joe Bidewell", 

In [55]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/fever-data/dev.jsonl

{"id": 206088, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Sociologie je studium vývoje politiky.", "evidence": [], "claim_en": "Sociology is the study of politics development."}


In [54]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/fever-data/train.jsonl

{"id": 188153, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Útěk z planety Země je vědecký dokumentární film.", "evidence": [], "claim_en": "Escape from Planet Earth is a science documentary film."}


In [53]:
!head -n 2 /mnt/data/factcheck/fever/data_titles-cs/fever/dev_queries.tsv

0	Sociologie je studium vývoje politiky.
1	Sammy Cahn byl americký hudebník narozený v roce 1913.


In [28]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/predictions/dev_two_tower_mbert_finetuned_best_ict_NFC_NFC_k500.jsonl

{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americky\u0301 hudebni\u0301k narozeny\u0301 v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913.", "predicted_pages": ["Sammy Cahn", "Sammy Davis mlads\u030ci\u0301", "Al Cohn", "Jimmy Van Heusen", "Sammy Price", "Steve Khan", "Ed Shaughnessy", "Phil Cohran", "Zikmund Schul", "Sammy", "Artur Schnabel", "Harry Edison", "Sandy Siegelstein", "Vic Schoen", "Gunther Schuller", "Benny Carter", "James Cagney", "David Schildkraut", "Brad Mehldau", "Sun Ra", "Warren Covington", "Jimmy McHugh", "Jodie Christian", "Songwriters Hall of Fame", "Love and Marriage", "Miles Davis", "Eddie Heywood", "Hans Schimmerling", "Fletcher Henderson", "Phil Bodner", "Charlie Christian", "Kenny Clarke", "Eddie Davis", "Billy Cobham", "Sammy Nestico", "Milt Jackson", "Ronnie Deauville", "Frantis\u030cek Man\u030cas", "Claude Thornhill", 

In [30]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/predictions/dev_two_tower_mbert_finetuned_best_ict_1.3_NFC_NFC_k500.jsonl

{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americký hudebník narozený v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913.", "predicted_pages": ["Sammy Cahn", "Sammy Davis mladší", "Sammy Price", "Al Cohn", "Jimmy Van Heusen", "Sammy Nestico", "Sammy", "Benny Carter", "Steve Khan", "Miles Davis", "Harry Edison", "Ed Shaughnessy", "Phil Cohran", "Andy Simpkins", "Walter Davis, Jr.", "Fletcher Henderson", "Jimmy Cobb", "Louis Armstrong", "Charlie Christian", "Sidney Bechet", "Ronnie Deauville", "Juma Santos", "Les Spann", "Sun Ra", "Charlie Barnet", "Vic Schoen", "Ira Sullivan", "Love and Marriage", "Joe Evans", "Al Jolson", "Nat King Cole", "Art Davis", "Eddie Davis", "Zikmund Schul", "Kaleidoscope (americká hudební skupina)", "Warren Covington", "David Schildkraut", "Jodie Christian", "Al Casey (jazzový kytarista)", "Lee Konitz", "Red Garland",

In [29]:
!head -n 1 /mnt/data/factcheck/fever/data_titles-cs/predictions/dev_two_tower_finetuned_mbert_10epochs_lr_1e-6_ict_1.4_NFC_NFC_k500.jsonl

{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americký hudebník narozený v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913.", "predicted_pages": ["Sammy Cahn", "Al Cohn", "Sammy Davis mladší", "Rolf Kühn", "Benny Carter", "Steve Khan", "Warren Covington", "Jimmy Van Heusen", "Arthur Rubinstein", "Bobby Few", "Sammy Price", "Woody Herman", "Sammy Nestico", "Phil Cohran", "Harry Carney", "Vic Schoen", "Joachim Kühn", "Jimmy McHugh", "Walter Kohn", "Dave McKenna", "Les McCann", "Don Kirshner", "Willie Ruff", "Fred Katz", "Sabu Martinez", "Alvin Curran", "Herbert von Karajan", "Eddie Heywood", "Sahib Shihab", "Harry Edison", "Gildo Mahones", "Benny Golson", "Billy Mundi", "Artur Schnabel", "John Coltrane", "André Previn", "Kiane Zawadi", "Paul Desmond", "George Coleman", "Connie Kay", "Rahn Burton", "Sam Jaffe", "Plas Johnson", "Harold Mabern", "Elm

In [22]:
# path = '/mnt/data/factcheck/fever/data_titles-cs/predictions/dev_two_tower_finetuned_mbert_10epochs_lr_1e-6_ict_1.4_NFC_NFC_k500.jsonl'
# with jsonlines.open(path, 'r') as fr:
#         for l in fr:
#             print(l)
#             break

In [106]:
path = '/mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl'
with jsonlines.open(path, 'r') as fr:
        for l in fr:
            print(l)
            break

{'id': 206088, 'verifiable': 'NOT VERIFIABLE', 'label': 'NOT ENOUGH INFO', 'claim': 'Sociologie je studium vývoje politiky.', 'evidence': [], 'claim_en': 'Sociology is the study of politics development.'}


In [23]:
dev_queries[0]

'Sociologie je studium vývoje politiky.'

# Create the prediction file

In [7]:
if '/home/spaceape/drchajan/src/' not in sys.path:
    sys.path.append('/home/spaceape/drchajan/src/')
    
if '/home/spaceape/drchajan/src/utils/' not in sys.path:
    sys.path.append('/home/spaceape/drchajan/src/utils/')

In [8]:
import json
# from fever-baselines
class Reader:
    def __init__(self):
        self.enc = "utf-8"

    def read(self,file):
        with open(file,"r", encoding = self.enc) as f:
            return self.process(f)

class JSONLineReader(Reader):                                                                                                                                                  
    def process(self,fp):                                                                                                                                                      
        data = []                                                                                                                                                              
        for line in fp.readlines():                                                                                                                                            
            data.append(json.loads(line.strip()))                                                                                                                              
        return data   

In [9]:
def read_tsv_col(path):
    with open(path, 'r') as fr:
        ids, data = [], []
        for l in tqdm(fr.readlines()):
            tmp = l.split('\t')
            if len(tmp) > 1:
                ids.append(tmp[0].strip())
                data.append(tmp[1].strip())
            else:
                print(f"Problem -- only one value in {tmp}?")
    return ids, data

def read_tsv_ranking(path):
    with open(path, 'r') as fr:
        data = []
        for l in tqdm(fr.readlines()):
            tmp = l.split('\t')
            query, doc = int(tmp[0].strip()), int(tmp[1].strip())
            if len(data) == query:
                data.append([doc])
            else:
                data[query].append(doc)
    return data

In [10]:
def updated_evaluation(pre_jsonl, post_jsonl, queries, ranking, k=500):
    # TODO this is too SLOW! Work with batches!
    jlr = JSONLineReader()
    with open(pre_jsonl, "r") as f, open(post_jsonl, "w+") as fout:
        for l in tqdm(jlr.process(f)):
            if l["verifiable"] == "VERIFIABLE":
                claim = l["claim"]
                claim_id = queries.index(claim)
                l["predicted_pages"] = ranking[claim_id][:k]
                fout.write(json.dumps(l, ensure_ascii=False) + "\n")

In [13]:
### CTK
root = '/mnt/data/factcheck/CTK/dataset/v2.1/nfc'
pre_dev_jsonl_path = os.path.join(root, 'dev.jsonl')
pre_test_jsonl_path = os.path.join(root, 'test.jsonl')

# dev_queries_path = os.path.join(root, 'dev_queries.tsv')
# test_queries_path = os.path.join(root, 'test_queries.tsv')
dev_queries_path = '/home/ryparmar/trained_models/colbert/data/dev_queries.tsv'
test_queries_path ='/home/ryparmar/trained_models/colbert/data/test_queries.tsv'

root_col = '/mnt/data/factcheck/CTK/par5'
collection_path = os.path.join(root_col, 'interim', 'collection_filtered.tsv')
idx2title_path = os.path.join(root_col, 'interim', 'old-id2new-id.tsv')

dev_ranking_path = "/home/ryparmar/trained_models/colbert/ctk-fever-v2.1/rerank.py/dev/ranking.tsv"
test_ranking_path = "/home/ryparmar/trained_models/colbert/ctk-fever-v2.1/rerank.py/test/ranking.tsv"

post_dev_jsonl_path = os.path.join(root, 'predictions', 'dev_colbert_ctk+fever_k500.jsonl')
post_test_jsonl_path = os.path.join(root, 'predictions', 'test_colbert_ctk+fever_k500.jsonl')

In [14]:
dev_queries = read_tsv_col(dev_queries_path)[1]

100%|██████████| 301/301 [00:00<00:00, 246146.52it/s]

Problem -- only one value in ['.\n']?





In [15]:
test_queries = read_tsv_col(test_queries_path)[1]

100%|██████████| 600/600 [00:00<00:00, 355198.65it/s]


In [16]:
for ranking, pre_jsonl, post_jsonl, queries in zip([dev_ranking_path, test_ranking_path], 
                                                   [pre_dev_jsonl_path, pre_test_jsonl_path],
                                                   [post_dev_jsonl_path, post_test_jsonl_path],
                                                   [dev_queries, test_queries]):
    ranking = read_tsv_ranking(ranking)

    with open(idx2title_path) as fr:
        idx2title = [l.split('\t')[0].strip() for l in fr.readlines()]

    predicted = [[idx2title[int(i)] for i in q] for q in ranking]
    updated_evaluation(pre_jsonl, post_jsonl, queries, predicted)

100%|██████████| 300000/300000 [00:00<00:00, 642687.67it/s]
100%|██████████| 300/300 [00:00<00:00, 9270.07it/s]
100%|██████████| 600000/600000 [00:00<00:00, 783016.82it/s]
100%|██████████| 600/600 [00:00<00:00, 8486.66it/s]


In [60]:
### FEVER
root = '/mnt/data/factcheck/fever/data-cs/'
pre_dev_jsonl_path = os.path.join(root, 'fever-data', 'dev.jsonl')
# pre_test_jsonl_path = os.path.join(root, 'fever-data', 'test.jsonl')

dev_queries_path = os.path.join(root, 'fever', 'dev_queries.tsv')
# test_queries_path = os.path.join(root, 'fever', 'test_queries.tsv')

collection_path = os.path.join(root, 'fever', 'fever.tsv')
titles_path = os.path.join(root, 'fever', 'title_as_id_fever.tsv')

dev_ranking_path = "/home/ryparmar/trained_models/colbert/ctk-fever/retrieve.py/fever-dev/ranking.tsv"

post_dev_jsonl_path = os.path.join('/mnt/data/factcheck/fever/data_titles-cs/', 'predictions', 'dev_colbert_ctk+fever_k500.jsonl')

In [61]:
dev_queries = read_tsv_col(dev_queries_path)[1]

100%|██████████| 9999/9999 [00:00<00:00, 635706.75it/s]


In [62]:
for ranking, pre_jsonl, post_jsonl, queries in zip([dev_ranking_path], 
                                                   [pre_dev_jsonl_path],
                                                   [post_dev_jsonl_path],
                                                   [dev_queries]):
    ranking = read_tsv_ranking(ranking)

    with open(titles_path) as fr:
        titles = [l.split('\t')[0].strip() for l in fr.readlines()]

    predicted = [[titles[int(i)] for i in q] for q in ranking]
    updated_evaluation(pre_jsonl, post_jsonl, queries, predicted)

100%|██████████| 9999000/9999000 [00:11<00:00, 834212.64it/s]
100%|██████████| 9999/9999 [00:01<00:00, 6451.67it/s]
