In [2]:
import os, sys
from typing import Dict, List, Tuple, Optional
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

## Set Project directory

In [3]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from config import SciFactT5Config
import definitions
sys.path.append(os.path.dirname(definitions.PROJECT_VARS.ROOT_DIR))
print(definitions.PROJECT_VARS.ROOT_DIR)

/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo
/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo


In [5]:
from T5ParEvo.src.data.data import Label, make_label

## Load dataset for Scifact

In [4]:
from verisci.covid import AbstractRetriever, RationaleSelector, LabelPredictor
from verisci.evaluate.lib.data import GoldDataset

In [5]:
def get_claim_label_from_jsonl(dataset_jsonl):
    """_summary_

    Args:
        dataset_jsonl (_type_): _description_

    Returns:
        _type_: _description_
    """
    claim_label_list_train = []
    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():

            ev_doc = cur_claim.release.corpus.get_document(doc_id)

            claim_label = evidence.label.name

            tmp_dic = {"claim" : claim_txt, "label" : claim_label}

            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [6]:
def get_claim_label_evidence_from_jsonl(dataset_jsonl, source):
    """_summary_

    Args:
        dataset_jsonl (_type_): _description_
        source (_type_): _description_

    Returns:
        _type_: _description_
    """
    claim_label_list_train = []
    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim
        for doc_id, evidence in cur_claim.evidence.items():
            ev_doc = claim_train.release.corpus.get_document(doc_id)
            claim_label = evidence.label.name
            list_rationales = []
            for i, sents in enumerate(evidence.rationales):
                list_rationales = [sent for i, sent in enumerate(ev_doc.sentences) if i in sents]
            tmp_dic = {"claim" : claim_txt, "label" : claim_label, "list_rationales" :list_rationales, "source" :source}
            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [7]:

cfg= SciFactT5Config()
ds_train = GoldDataset(cfg.target_dataset.loc_target_dataset_corpus,
                    cfg.target_dataset.loc_target_dataset_test)
claim_train = ds_train.get_claim(39)

dic_train = get_claim_label_evidence_from_jsonl(ds_train, source = "train")

In [8]:
ds_train[7].evidence

{33409100: EvidenceAbstract(id=33409100, label=<Label.SUPPORTS: 2>, rationales=[[8], [12]])}

In [9]:
doc_ids = list(ds_train.claims[4].evidence.keys())
for doc_id in doc_ids:
    print(ds_train.claims[4].id)
    print(ds_train.claims[4].claim)
    print(doc_id)
    print(ds_train.claims[4].evidence[doc_id].label.name)
    print(ds_train.claims[4].evidence[doc_id].rationales)

9
32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.
44265107
SUPPORTS
[[15]]


In [4]:
from T5ParEvo.src.data.data import Label, make_label

# Initialize the dictionary that will store unique claim texts and corresponding claim objects
unique_claims_dict = {}

# Iterate through each claim
for claim in ds_train.claims:
    # Get all the labels for the claim
    labels = [evidence.label for evidence in claim.evidence.values()]

    # Filter out claims with no labels or only NEI labels
    if not labels or all(label == Label.NEI for label in labels):
        continue

    # Count the number of 'SUPPORTS' and 'REFUTES' labels
    supports_count = labels.count(Label.SUPPORTS)
    refutes_count = labels.count(Label.REFUTES)

    # Calculate the support-to-refute or refute-to-support ratio
    ratio = supports_count / refutes_count if refutes_count else supports_count

    # If the claim text is not in our dictionary, or if the new claim has a better ratio, update the dictionary
    if claim.claim not in unique_claims_dict or ratio > unique_claims_dict[claim.claim][1]:
        unique_claims_dict[claim.claim] = (claim, ratio)

# Get the list of unique claims from the dictionary
unique_claims = [claim_info[0] for claim_info in unique_claims_dict.values()]

NameError: name 'ds_train' is not defined

In [10]:
unique_claims

[Example 2: 1 in 5 million in UK have abnormal PrP positivity.,
 Example 9: 32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.,
 Example 12: 40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,
 Example 22: 76-85% of people with severe mental disorder receive no treatment in low and middle income countries.,
 Example 28: A T helper 2 cell (Th2) environment impedes disease development in patients with systemic lupus erythematosus (SLE).,
 Example 30: A breast cancer patient's capacity to metabolize tamoxifen influences treatment outcome.,
 Example 32: A country's Vaccine Alliance (GAVI) eligibility is not indictivate of accelerated adoption of the Hub vaccine.,
 Example 34: A deficiency of folate increases blood levels of homocysteine.,
 Example 39: A diminished ovarian reserve does not solely indicate infertility in an a priori non-infertile population.,
 Example 40: A

In [74]:
labels_set

{<Label.REFUTES: 0>, <Label.SUPPORTS: 2>}

In [8]:
from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Optional

In [9]:
from T5ParEvo.src.data import data as dataobj

def get_datalist_from_dict_ds(data_dict : Dict):
    """_summary_

    Args:
        data_dict (Dict): _description_

    Returns:
        _type_: _description_
    """
    counter_gold_claim = 0
    data_list = []
    for cur_claim in data_dict:
        claim = dataobj.Claim(claim_text = cur_claim['claim'])
        rationales = []
        for cur_ratnl in cur_claim['list_rationales']:
            rationale = dataobj.Rationale(rationale_text = cur_ratnl)
            label_obj = dataobj.Label.get_enum_rep_label(res_str = cur_claim['label'])
            rationales.append({'rationale': rationale, 'label': label_obj})
        claim_org = dataobj.ClaimRationale(id_claim = str(counter_gold_claim), claim = claim, rationales = rationales)
        data_list.append(claim_org)
        counter_gold_claim += 1    
    return data_list

In [12]:
data_list_train = get_datalist_from_dict_ds(dic_train)

In [14]:
data_list_train[0] 

ClaimRationale(id_claim='0', claim=Claim(claim_text='1 in 5 million in UK have abnormal PrP positivity.'), rationales=[{'rationale': Rationale(rationale_text='RESULTS Of the 32,441 appendix samples 16 were positive for abnormal PrP, indicating an overall prevalence of 493 per million population (95% confidence interval 282 to 801 per million).'), 'label': <Label_enm.REFUTE: 0>}])

# Setting up experiment

# Scrap

In [1]:
import os
import sys
from tqdm import tqdm
import argparse
from pathlib import Path
from dataclasses import dataclass
from typing import Any, Dict, List
import json
#
module_path = os.path.abspath(os.path.join('...'))
if module_path not in sys.path:
    sys.path.append(module_path)

from config import SciFactT5Config
import definitions
sys.path.append(os.path.dirname(definitions.PROJECT_VARS.ROOT_DIR))
print(definitions.PROJECT_VARS.ROOT_DIR)
#

from T5ParEvo.src.data.data import Claim, ClaimPredictions,GoldDataset
from multivers import util
from multivers.data_r import ClaimDataLoaderGenerator, get_dataloader, DataLoaderGenerator
from multivers.model_r import MultiVerSModel

from T5ParEvo.target_system.multivers.multivers_interface import PredictionParams, ModelPredictor

/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo
/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo
/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo


  "The `@auto_move_data` decorator is deprecated in v1.3 and will be removed in v1.5."


In [2]:
# This dataset to be used only for training
cfg= SciFactT5Config()
ds_train = GoldDataset(cfg.target_dataset.loc_target_dataset_corpus,
                    cfg.target_dataset.loc_target_dataset_train)
claim_train = ds_train.get_claim(39)



In [3]:

params = PredictionParams(
    checkpoint_path="/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/checkpoints/scifact.ckpt",
    output_file= None,#"prediction/pred_opt_scifact.jsonl",
    batch_size=3,
    device=0,
    num_workers=4,
    no_nei=False,
    force_rationale=False,
    debug=False,
)
corpus_file = "/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/data/scifact/corpus.jsonl"
    

gold_claims = []
claims_path = '/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/data/scifact/claims_test_retrived.jsonl'
with open(claims_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        claim = Claim(id = data['id'], claim = data['claim'], cited_docs = data['doc_ids'], evidence = {},release = None)
        gold_claims.append(claim)

dataloader_generator = DataLoaderGenerator(params, gold_claims, corpus_file)
dataloader = dataloader_generator.get_dataloader_by_claims()
predictor = ModelPredictor(params, dataloader)
prediction_formatted = predictor.run()

Some weights of the model checkpoint at allenai/longformer-large-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)
 21%|██        | 314/1500 [01:06<03:19,  5.94it/s]

In [None]:
# prediction = prediction_formatted[0]  # assuming there's only one prediction
# claim_predictions = ClaimPredictions.from_formatted_prediction(prediction, claim)
# claim_predictions.pretty_print_simple()
# print(claim_predictions)

In [None]:
# prediction_formatted
claim_org_predictions: List[ClaimPredictions] = []
# format all the predictions
for cur_prediction in prediction_formatted:
    cur_claim = Claim.get_claim_by_id(gold_claims, cur_prediction['id'])
    claim_predictions = ClaimPredictions.from_formatted_prediction(cur_prediction, gold_claim = cur_claim)
    claim_org_predictions.append(claim_predictions)


In [None]:
claim_org_predictions[0].pretty_print_simple()

Claim ID: 7
Gold Claim: Example 7: 10-20% of people with severe mental disorder receive no treatment in low and middle income countries.

Predictions:

Abstract ID: 6490571
Label: SUPPORTS
Rationale Sentences:
- 7


In [None]:
for cur_claim_pred in claim_org_predictions:
    if len(cur_claim_pred.predictions.keys()) > 1:
        print(cur_claim_pred.gold.id)

8
16
23
38
65


In [19]:
prediction_formatted


[Predictions for 7: 10-20% of people with severe mental disorder receive no treatment in low and middle income countries.,
 Predictions for 8: 25% of patients with melanoma and an objective response to PD-1 blockade will experience a progression in their melanoma.,
 Predictions for 16: 50% of patients exposed to radiation have activated markers of myofibroblasts.,
 Predictions for 23: 8% of burn patients are admitted for hospitalization and further treatment after appearing at hospital emergency wards or outpatient clinics.,
 Predictions for 29: A breast cancer patient's capacity to metabolize tamoxifen has no effect on treatment outcome.,
 Predictions for 31: A country's Vaccine Alliance (GAVI) eligibility is associated with accelerated adoption of the Hub vaccine.,
 Predictions for 33: A deficiency of folate decreases blood levels of homocysteine.,
 Predictions for 38: A deficiency of vitamin B6 increases blood levels of homocysteine.,
 Predictions for 59: APOE4 expression in iPSC-de

In [14]:
def get_unique_claims(claims: List[Claim]):
    unique_claims = set()
    unique_claim_objects = []
    for claim in claims:
        if claim.claim not in unique_claims:
            unique_claims.add(claim.claim)
            unique_claim_objects.append(claim)
    return unique_claim_objects

In [12]:
len(claims)

300

In [15]:
claims_unique = get_unique_claims(claims)
len(claims_unique)

297

In [34]:
print(claim_predictions.claim_id)
print(claim_predictions.predictions[6490571].abstract_id)
print(claim_predictions.predictions[6490571].label)
print(claim_predictions.predictions[6490571].rationale)

7
6490571
Label.SUPPORTS
[7]
