In [1]:
#!python -m pip install pyserini --user
#!python -m pip install jsonlines --user

In [19]:
import sqlite3
import pandas as pd
import jsonlines
import json

# Run on CTK data

In [1]:
DATAFOLDER='/mnt/data/factcheck/CTK/par5'
!ls {DATAFOLDER}

ctk-data  emb  index  info.txt	interim  interim~  predictions	raw


In [2]:
!wc -l {DATAFOLDER}/ctk-data/dev.jsonl
!wc -l /home/ryparmar/pyserini/fever/paper_dev.jsonl

161 /mnt/data/factcheck/CTK/par5/ctk-data/dev.jsonl
9999 /home/ryparmar/pyserini/fever/paper_dev.jsonl


In [3]:
!ls {DATAFOLDER}/interim

collection.tsv		 ctk.jsonl	ctk_filtered.db
collection_filtered.tsv  ctk.jsonl.all	logs
ctk.db			 ctk.jsonl.err	old-id2new-id.tsv


In [4]:
!wc -l {DATAFOLDER}/interim/ctk.jsonl

40659844 /mnt/data/factcheck/CTK/par5/interim/ctk.jsonl


## Convert .db file into jsonl file

In [20]:
def load_db(path: str, limit=None):
    """
    Load documents and their ids from the `documents` table of a SQLite db.

    FEVER db returns wiki abstracts (single paragraphs) and document names (=id)
    CTK db returns paragraphs and paragraph ids

    Args:
        path: Path to the SQLite database file.
        limit: Optional maximum number of rows to read. Any falsy value
            (None, 0) reads the whole table.

    Returns:
        A tuple `(texts, ids)`: the `text` column and the `id` column as two
        parallel lists of native Python objects.
    """
    # Create the connection
    connection = sqlite3.connect(path)
    try:
        # Create the dataframe from a query
        if limit:
            # Bind the limit as a query parameter instead of formatting it
            # into the SQL text.
            data = pd.read_sql_query("SELECT * FROM documents LIMIT ?",
                                     connection, params=(int(limit),))
        else:
            data = pd.read_sql_query("SELECT * FROM documents", connection)
    finally:
        # Release the database handle even when the query fails.
        connection.close()
    # tolist() converts NumPy scalars (e.g. int64 ids) to native Python types,
    # so the returned values can be passed straight to json.dumps.
    return data["text"].tolist(), data["id"].tolist()

def save_jsonl(data: list, ids: list, output_path: str, append=False):
    """Dump paired ids and texts to a JSON lines file.

    Every output line is one JSON object of the form
    {"id": <ids[k]>, "text": <data[k]>}. With `append` set, records are added
    to the end of `output_path`; otherwise the file is overwritten.
    `data` and `ids` must have the same length.
    """
    assert len(data) == len(ids)
    file_mode = 'w'
    if append:
        file_mode = 'a+'
    with open(output_path, file_mode, encoding='utf-8') as sink:
        for doc_id, text in zip(ids, data):
            # ensure_ascii=False keeps non-ASCII characters readable in the file
            sink.write(json.dumps({'id': doc_id, 'text': text}, ensure_ascii=False) + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

In [13]:
pars, par_ids = load_db('/mnt/data/factcheck/CTK/par5/interim/ctk_filtered.db')

In [21]:
save_jsonl(pars, par_ids, '/mnt/data/factcheck/CTK/par5/interim/jsonl/ctk_filtered.jsonl')

Wrote 13619573 records to /mnt/data/factcheck/CTK/par5/interim/jsonl/ctk_filtered.jsonl


### Prepare the wiki dump in a format suitable for Anserini

In [None]:
# Paths must be string literals: this is a Python cell, so an unquoted path is a SyntaxError.
ROOT = '/home/ryparmar/pyserini/ctk/data'
COLLECTION = '/mnt/data/factcheck/CTK/par5/interim'

In [6]:
!python /home/ryparmar/pyserini/src/convert_collection_to_jsonl.py \
    --collection_folder /mnt/data/factcheck/CTK/par5/interim/jsonl \
    --output_folder /home/ryparmar/pyserini/cs/data \
    --max_docs_per_file 10000000 \
    --granularity 'paragraph'

Converting collection...
Converted 100000 docs in 1 files
Converted 200000 docs in 1 files
Converted 300000 docs in 1 files
Converted 400000 docs in 1 files
Done!


In [22]:
!ls -l --block-size=M /home/ryparmar/pyserini/data

total 8101M
-rw-r--r-- 1 ryparmar k13136 2981M Sep  8 13:27 docs00-id=int.json
-rw-r--r-- 1 ryparmar k13136 3078M Sep  8 19:53 docs00.json


In [10]:
!wc -l /mnt/data/factcheck/fever/data-cs/*jsonl
!wc -l /home/ryparmar/pyserini/cs/data/*json

453553 /mnt/data/factcheck/fever/data-cs/cswiki.jsonl
453553 /home/ryparmar/pyserini/cs/data/docs00.json


### Try to compute index

In [11]:
!python -m pyserini.index \
    -collection JsonCollection \
    -generator DefaultLuceneDocumentGenerator \
    -threads 1 \
    -input /home/ryparmar/pyserini/cs/data \
    -index /home/ryparmar/pyserini/cs/indexes-full \
    -storePositions -storeDocvectors -storeRaw

2020-09-17 17:06:54,453 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to INFO
2020-09-17 17:06:54,455 INFO  [main] index.IndexCollection (IndexCollection.java:639) - Starting indexer...
2020-09-17 17:06:54,456 INFO  [main] index.IndexCollection (IndexCollection.java:641) - DocumentCollection path: /home/ryparmar/pyserini/cs/data
2020-09-17 17:06:54,456 INFO  [main] index.IndexCollection (IndexCollection.java:642) - CollectionClass: JsonCollection
2020-09-17 17:06:54,457 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Generator: DefaultLuceneDocumentGenerator
2020-09-17 17:06:54,457 INFO  [main] index.IndexCollection (IndexCollection.java:644) - Threads: 1
2020-09-17 17:06:54,457 INFO  [main] index.IndexCollection (IndexCollection.java:645) - Stemmer: porter
2020-09-17 17:06:54,458 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Keep stopwords? false
2020-09-17 17:06:54,458 INFO  [main] index.IndexCollection (IndexColle

In [12]:
!ls /home/ryparmar/pyserini/indexes-full

ls: cannot access /home/ryparmar/pyserini/indexes-full: No such file or directory


### Test the computed index

In [133]:
from pyserini import analysis, index
from pyserini.search import SimpleSearcher

# Directory of the index that SimpleSearcher opens below.
# NOTE(review): the indexing cell earlier in this notebook writes to
# /home/ryparmar/pyserini/cs/indexes-full -- confirm this path is the intended index.
INDEXER = '/home/ryparmar/pyserini/indexes-full'
# index_reader = index.IndexReader(INDEXER)

# Open the index for searching
searcher = SimpleSearcher(INDEXER)

# Single example query, run with the searcher's default settings
query = 'Sláintecare should be the name of healthcare system in Ireland'
hits = searcher.search(query)

# Print one line per hit: 1-based rank, document id, score
for i in range(len(hits)):
    print(f'{i+1:2} {hits[i].docid:15} {hits[i].score:.5f}')

 1 0               16.15490
 2 3655984         10.95880
 3 1736638         10.29230
 4 1949761         10.22800
 5 5495217         9.64750
 6 1580268         9.35750
 7 1390315         9.22520
 8 3014088         9.14190
 9 1354987         9.14100
10 2651102         9.12890


### Perform Retrieval on the Dev

In [16]:
# Generate queries and qrels files for the dev split
!python /home/ryparmar/pyserini/src/generate_queries_and_qrels.py \
    --dataset_file /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl \
    --output_queries_file /home/ryparmar/pyserini/cs/queries.par.dev.tsv \
    --output_qrels_file /home/ryparmar/pyserini/cs/qrels.par.dev.tsv \
    --granularity paragraph

Generating qrels...
Generating queries...
Done!


In [17]:
# claim_id, claim
!head -n 3 /home/ryparmar/pyserini/cs/queries.par.dev.tsv

206088	Sociologie je studium vývoje politiky.
207746	Sammy Cahn byl americký hudebník narozený v roce 1913.
146151	Ryby emigrovaly na Island.


In [18]:
# Columns: claim_id, constant 0, doc_id (where the evidence is present), relevance label (set directly to the number 2 in the code)
!head -n 3 /home/ryparmar/pyserini/cs/qrels.par.dev.tsv

207746	0	Sammy Cahn	2
107802	0	David Bowie	2
126321	0	George A. Romero	2


In [20]:
!head -n 3 /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl

{"id": 206088, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Sociologie je studium vývoje politiky.", "evidence": [], "claim_en": "Sociology is the study of politics development."}
{"id": 207746, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Sammy Cahn byl americký hudebník narozený v roce 1913.", "evidence": [[[245753, 246454, "Sammy Cahn", 0, "Sammy Cahn"]]], "claim_en": "Sammy Cahn was an American musician born in 1913."}
{"id": 146151, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Ryby emigrovaly na Island.", "evidence": [], "claim_en": "Fish emigrated to Iceland."}


In [21]:
# Retrieval run
# k1=0.6 b=0.5 BM25 parameters found by fine-tuning
!python /home/ryparmar/pyserini/src/retrieve.py \
--hits 1000 --threads 1 \
--index /home/ryparmar/pyserini/cs/indexes-full \
--queries /home/ryparmar/pyserini/cs/queries.par.dev.tsv \
--output /home/ryparmar/pyserini/cs/run.fever-par.dev.tsv \
--k1 0.6 --b 0.5

Initializing BM25, setting k1=0.6 and b=0.5
Retrieving query 0 (0.488 s/query)
Retrieving query 100 (0.079 s/query)
Retrieving query 200 (0.069 s/query)
Retrieving query 300 (0.066 s/query)
Retrieving query 400 (0.064 s/query)
Retrieving query 500 (0.063 s/query)
Retrieving query 600 (0.062 s/query)
Retrieving query 700 (0.061 s/query)
Retrieving query 800 (0.060 s/query)
Retrieving query 900 (0.060 s/query)
Retrieving query 1000 (0.059 s/query)
Retrieving query 1100 (0.059 s/query)
Retrieving query 1200 (0.059 s/query)
Retrieving query 1300 (0.059 s/query)
Retrieving query 1400 (0.059 s/query)
Retrieving query 1500 (0.059 s/query)
Retrieving query 1600 (0.058 s/query)
Retrieving query 1700 (0.058 s/query)
Retrieving query 1800 (0.058 s/query)
Retrieving query 1900 (0.058 s/query)
Retrieving query 2000 (0.058 s/query)
Retrieving query 2100 (0.058 s/query)
Retrieving query 2200 (0.058 s/query)
Retrieving query 2300 (0.058 s/query)
Retrieving query 2400 (0.057 s/query)
Retrieving query 2

In [22]:
# claim_id, predicted_document_id, rank -- note that there are up to 1000 hits per claim (the --hits argument)
!head -n 1005 /home/ryparmar/pyserini/cs/run.fever-par.dev.tsv | tail -n 10

206088	Vývojová dysfázie	996
206088	Aarskogův–Scottův syndrom	997
206088	George Francis Hamilton	998
206088	Jiga'el Jadin	999
206088	Vladko Maček	1000
207746	Sammy Cahn	1
207746	Jimmy Van Heusen	2
207746	Steve Khan	3
207746	Free Money	4
207746	Sammy Davis mladší	5


In [23]:
!wc -l /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl

9999 /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl


In [24]:
!wc -l /home/ryparmar/pyserini/cs/run.fever-par.dev.tsv

9955176 /home/ryparmar/pyserini/cs/run.fever-par.dev.tsv


### Evaluation

In [26]:
import jsonlines
# Convert .tsv (anserini) prediction into .jsonl (drqa) predictions
def convert_to_drqa_format(inputfile, outputfile, truthfile):
    """
    Convert an Anserini-style .tsv run into DrQA-style .jsonl predictions.

    Args:
        inputfile: Tab-separated run file with three fields per line:
            claim id, predicted document id, rank.
        outputfile: Destination .jsonl file; one record is written for every
            record of `truthfile`.
        truthfile: .jsonl file whose records provide `id`, `label` and
            `evidence`; these three fields are copied to the output.

    Each output record receives a `predicted_pages` list holding the
    predicted document ids in the order they appear in the run file. Records
    without any run line carry no `predicted_pages` key. Run lines whose
    claim id is not present in `truthfile` are skipped.
    """
    out = {}
    with jsonlines.open(truthfile) as ft:
        for record in ft.iter():
            # Keyed by str(id) because ids read from the .tsv are strings
            out[str(record['id'])] = {'id': record['id'],
                                      'label': record['label'],
                                      'evidence': record['evidence']}

    # Explicit UTF-8: document ids are page titles and may be non-ASCII.
    # Iterate lazily; run files can hold millions of lines.
    with open(inputfile, encoding='utf-8') as fr:
        for line in fr:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            claim_id, pred_docid, _ = line.split('\t')
            claim = out.get(claim_id)
            if claim is None:
                # Claim is not part of the truth file (e.g. truth is a subset)
                continue
            claim.setdefault('predicted_pages', []).append(pred_docid)

    with jsonlines.open(outputfile, 'w') as fw:
        for claim in out.values():
            fw.write(claim)
                
# Convert .jsonl (drqa) predictions into .tsv (anserini) prediction 
def convert_to_anserini_format(inputfile, outputfile):
    """
    Convert DrQA-style .jsonl predictions into an Anserini-style .tsv run.

    Args:
        inputfile: .jsonl file whose records have an `id` and, optionally, a
            `predicted_pages` list of document ids ordered best-first.
        outputfile: Destination .tsv file. One line is written per predicted
            page: claim id, document id and 1-based rank, tab-separated.

    Records without a `predicted_pages` key contribute no lines.
    """
    # Stream records straight to the output instead of buffering every line;
    # explicit UTF-8 because document ids are page titles and may be non-ASCII.
    with jsonlines.open(inputfile) as ft, \
            open(outputfile, 'w', encoding='utf-8') as fw:
        for record in ft.iter():
            pages = record.get('predicted_pages', [])
            for rank, docid in enumerate(pages, start=1):
                fw.write("{}\t{}\t{}\n".format(record['id'], docid, rank))

In [27]:
ls /home/ryparmar/pyserini/en-latest

[0m[38;5;27mdata[0m/                                    run.fever-par-finetuned-paper.dev.tsv
dev_drqa_k500.tsv                        run.fever-par-finetuned.dev.jsonl
[38;5;27mindexes-full[0m/                            run.fever-par-finetuned.dev.tsv
paper_qrels.par.dev.tsv                  run.fever-par.dev.tsv
paper_queries.par.dev.tsv                subset_dev_drqa_k500.jsonl
qrels.par.dev.tsv                        subset_dev_drqa_k500.tsv
queries.par.dev.tsv                      subset_run.fever-par-finetuned.dev.tsv
run.fever-par-finetuned-paper.dev.jsonl


In [53]:
# convert_to_drqa_format('/home/ryparmar/pyserini/en-latest/run.fever-par.dev.tsv',
#                        '/home/ryparmar/pyserini/en-latest/run.fever-par.dev.jsonl',
#                        '/mnt/data/factcheck/fever/data-en-latest/fever-data/dev.jsonl')
convert_to_drqa_format('/home/ryparmar/pyserini/en-latest/run.fever-par-finetuned-paper.dev.tsv',
                       '/home/ryparmar/pyserini/en-latest/run.fever-par-finetuned-paper.dev.jsonl',
                       '/home/ryparmar/pyserini/fever/paper_dev.jsonl')

In [28]:
# convert_to_anserini_format('/mnt/data/factcheck/fever/data-en-latest/predictions/dev_drqa_k500.jsonl',
#                            '/home/ryparmar/pyserini/en-latest/dev_drqa_k500.tsv')
convert_to_anserini_format('/mnt/data/factcheck/fever/data-cs/predictions/dev_drqa_k500.jsonl',
                           '/home/ryparmar/pyserini/cs/dev_drqa_k500.tsv')

In [29]:
# Evaluate anserini on CS FEVER dev data
!python /home/ryparmar/pyserini/src/evaluate_doc_retrieval.py \
--truth_file /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl \
--run_file /home/ryparmar/pyserini/cs/run.fever-par.dev.tsv

k	Fully Supported	Oracle Accuracy
1	0.2799	0.5200
5	0.5089	0.6726
10	0.5921	0.7281
25	0.6860	0.7907
50	0.7433	0.8289
100	0.7903	0.8602
500	0.8630	0.9087


In [30]:
# Evaluate drqa on CS FEVER dev data
!python /home/ryparmar/pyserini/src/evaluate_doc_retrieval.py \
--truth_file /mnt/data/factcheck/fever/data-cs/fever-data/dev.jsonl \
--run_file /home/ryparmar/pyserini/cs/dev_drqa_k500.tsv

k	Fully Supported	Oracle Accuracy
1	0.2415	0.4943
5	0.4673	0.6449
10	0.5497	0.6998
25	0.6629	0.7753
50	0.7342	0.8228
100	0.7852	0.8568
500	0.8608	0.9072


In [28]:
!head -n 9999000 /home/ryparmar/pyserini/en-latest/run.fever-par-finetuned.dev.tsv > /home/ryparmar/pyserini/en-latest/subset_run.fever-par-finetuned.dev.tsv

# Evaluate using our evaluation script

In [60]:
!python /home/ryparmar/drchajan/src/evaluate_document_retrieval.py \
--actual_jsonl /mnt/data/factcheck/fever/data-en-latest/fever-data/dev.jsonl \
--pred_jsonl /mnt/data/factcheck/fever/data-en-latest/predictions/dev_drqa_k500.jsonl \
--max_evidence 500

[INFO] 2020-09-10 12:39:27,952 - LogHelper - Log Helper set up
[INFO] 2020-09-10 12:39:31,872 - EVALDR - Scores(precision=0.002055298647822243, recall=0.8874887488748875, f1=0.004101099727663442)


In [59]:
!python /home/ryparmar/drchajan/src/evaluate_document_retrieval.py \
--actual_jsonl /mnt/data/factcheck/fever/data-en-latest/fever-data/dev.jsonl \
--pred_jsonl /home/ryparmar/pyserini/en-latest/run.fever-par-0.6-0.5.dev.jsonl \
--max_evidence 500

[INFO] 2020-09-10 12:39:18,968 - LogHelper - Log Helper set up
[INFO] 2020-09-10 12:39:25,998 - EVALDR - Scores(precision=0.0016157452193816906, recall=0.7005700570057005, f1=0.0032240547070647844)


In [167]:
# Evaluate with trec_eval (TREC)
# Convert the runs and qrels to trec files
!python /home/ryparmar/pyserini/src/convert_msmarco_to_trec_run.py \
--input /home/ryparmar/pyserini/run.fever-par-0.6-0.5.dev.tsv \
--output /home/ryparmar/pyserini/run.fever-par-0.6-0.5.dev.trec

!python /home/ryparmar/pyserini/src/convert_msmarco_to_trec_qrels.py \
--input /home/ryparmar/pyserini/qrels.par.dev.tsv \
--output /home/ryparmar/pyserini/qrels.par.dev.trec

Done!
Done!


In [169]:
# Run evaluation itself
!./home/ryparmar/pyserini/src/trec_eval.9.0.4/trec_eval -c -m all_trec \
/home/ryparmar/pyserini/qrels.par.dev.trec /home/ryparmar/pyserini/run.fever-par-0.6-0.5.dev.trec

/usr/bin/sh: ./home/ryparmar/pyserini/src/trec_eval.9.0.4/trec_eval: No such file or directory
