In [1]:
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from dataclasses import dataclass
from tqdm import tqdm

# Add module path to sys.path
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.insert(0,module_path)

# Import third-party libraries
from config import SciFactT5Config
from multivers.data_r import ClaimDataLoaderGenerator, DataLoaderGenerator, get_dataloader
from multivers.model_r import MultiVerSModel
from multivers import util

# Other necessary imports
import definitions

# Add necessary paths to sys.path
sys.path.append(os.path.dirname(definitions.PROJECT_VARS.ROOT_DIR))

# Import project-specific modules
from T5ParEvo.src.data.data import Claim, ClaimPredictions, GoldDataset, Label
from T5ParEvo.src.linguistic.ner_abbr import Abbreviation, NEREntity
from T5ParEvo.target_system.multivers.multivers_interface import ModelPredictorMultivers, PredictionParams,ModelPredictorMultiversList

# Print module_path and definitions.PROJECT_VARS.ROOT_DIR
print('module_path:', module_path)
print('definitions.PROJECT_VARS.ROOT_DIR:', definitions.PROJECT_VARS.ROOT_DIR)


root dir :  /home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo


  "The `@auto_move_data` decorator is deprecated in v1.3 and will be removed in v1.5."


/home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo
module_path: /home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo
definitions.PROJECT_VARS.ROOT_DIR: /home/qudratealahyratu/research/nlp/fact_checking/my_work/T5ParEvo


## Load Data

In [2]:
# This dataset to be used only for training
cfg= SciFactT5Config()
ds_train = GoldDataset(cfg.target_dataset.loc_target_dataset_corpus,
                    cfg.target_dataset.loc_target_dataset_train)
claim_train = ds_train.get_claim(39)

In [3]:

params = PredictionParams(
    checkpoint_path= "/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/checkpoints/scifact.ckpt",
    output_file= None,#"prediction/pred_opt_scifact.jsonl",
    batch_size=5,
    device=0,
    num_workers=4,
    no_nei=False,
    force_rationale=False,
    debug=False,
    corpus_file = cfg.target_dataset.loc_target_dataset_corpus
)
corpus_file = cfg.target_dataset.loc_target_dataset_corpus#cfg.target_dataset.loc_target_dataset_test#"/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/data/scifact/corpus.jsonl"
    

gold_claims = []
claims_path = cfg.target_dataset.loc_target_dataset_test#'/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/data/scifact/claims_test_retrived.jsonl'
with open(claims_path, 'r') as f:
    for line in f:
        data = json.loads(line)
        claim = Claim(id = data['id'], claim = data['claim'], cited_docs = data['doc_ids'], evidence = {},release = None)
        gold_claims.append(claim)

#get unique claims
unique_gold_claims = Claim.get_unique_claims(gold_claims)
# Predict for unique claims
# dataloader_generator = DataLoaderGenerator(params, unique_gold_claims, corpus_file)
# dataloader = dataloader_generator.get_dataloader_by_claims()
# predictor = ModelPredictor(params, dataloader)
# prediction_formatted = predictor.run()

## Predict Original Claims

In [4]:
#get unique claims
unique_gold_claims = Claim.get_unique_claims(gold_claims)
# Predict for unique claims
corpus_file="/home/qudratealahyratu/research/nlp/fact_checking/my_work/multivers/data/scifact/corpus.jsonl"
dataloader_generator = DataLoaderGenerator(params, unique_gold_claims, corpus_file)
dataloader = dataloader_generator.get_dataloader_by_claims()
# prediction_model = ModelPredictorMultivers(params, dataloader,corpus_file)
prediction_model = ModelPredictorMultivers(params, unique_gold_claims[0])
original_claim_predictions_raw = prediction_model.predict()

Some weights of the model checkpoint at allenai/longformer-large-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)
100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


AttributeError: 'dict' object has no attribute 'claim_id'

In [None]:
# prediction_formatted
org_claim_predictions: List[ClaimPredictions] = []
# format all the predictions
for cur_prediction in tqdm(original_claim_predictions_raw, desc="Formatting predictions"):
    cur_claim = Claim.get_claim_by_id(gold_claims, cur_prediction['id'])
    claim_predictions = ClaimPredictions.from_formatted_prediction(cur_prediction, gold_claim = cur_claim)
    org_claim_predictions.append(claim_predictions)

Formatting predictions: 100%|██████████| 297/297 [00:00<00:00, 71964.66it/s]


In [6]:
org_claim_predictions[0].predictions

{6490571: PredictedAbstract(abstract_id=6490571, label=<Label.SUPPORTS: 2>, rationale=[7])}

## Paraphrase and attack original claims

In [7]:
from T5ParEvo.src.paraphrase.paraphrase_claim import ParaphrasedAttack
from T5ParEvo.src.paraphrase.paraphraser import T5Paraphraser, ModelConfig
from transformers import T5ForConditionalGeneration, PreTrainedTokenizer, PreTrainedModel, T5Tokenizer, T5ForConditionalGeneration
import torch


paraphrase_config_params = {
        'max_length':512,
        'do_sample':True,
        'top_k':50,
        'top_p': 0.99,
        'repetition_penalty':3.5,
        'early_stopping':True,
        'num_return_sequences':10
}
paraphrase_config = ModelConfig(**paraphrase_config_params)
    # Load T5 model and tokenizer
model_t5 = T5ForConditionalGeneration.from_pretrained('/home/qudratealahyratu/research/nlp/fact_checking/my_work/SciMedAttack/results/t5_paws_masked_claim_abstract_paws_3_epoch_2/models/model_3_epochs/')
tokenizer_t5 = T5Tokenizer.from_pretrained('Vamsi/T5_Paraphrase_Paws')

# Ensure model is in correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_t5 = model_t5.to(device)

paraphrase_model = T5Paraphraser(model_t5, tokenizer_t5, paraphrase_config)
paraphrase_attack = ParaphrasedAttack(paraphrase_model, prediction_model)

In [12]:
result = paraphrase_attack.attack(0 , cur_original_claim, cur_claim_predictions)

NameError: name 'cur_original_claim' is not defined

In [8]:
paraphrased_attack_results = []
for cur_claim in tqdm(org_claim_predictions, desc="Attacking and Paraphrasing claims"):
    result = paraphrase_attack.attack(0 , cur_claim.gold, cur_claim.predictions)
    paraphrased_attack_results.append(result)

Attacking and Paraphrasing claims:   0%|          | 0/297 [00:00<?, ?it/s]


TypeError: __init__() missing 1 required positional argument: 'corpus_file'

In [5]:
import pickle

with open('../data/meta/merged_abbreviations.pkl', 'rb') as f:
    merged_abbreviations = pickle.load(f)

with open('../data/meta/merged_entities.pkl', 'rb') as f:
    merged_entities = pickle.load(f)


In [7]:
merged_entities

{7: [NEREntity(claim_id=7, ner_text='people', ner_label='ORGANISM', ner_model='ner_bionlp13cg_md', start_char=10, end_char=16)],
 8: [NEREntity(claim_id=8, ner_text='patients', ner_label='ORGANISM', ner_model='ner_bionlp13cg_md', start_char=7, end_char=15),
  NEREntity(claim_id=8, ner_text='melanoma', ner_label='DISEASE', ner_model='ner_bc5cdr_md', start_char=21, end_char=29),
  NEREntity(claim_id=8, ner_text='PD-1', ner_label='GENE_OR_GENE_PRODUCT', ner_model='ner_bionlp13cg_md', start_char=59, end_char=63),
  NEREntity(claim_id=8, ner_text='melanoma', ner_label='DISEASE', ner_model='ner_bc5cdr_md', start_char=112, end_char=120)],
 16: [NEREntity(claim_id=16, ner_text='patients', ner_label='ORGANISM', ner_model='ner_bionlp13cg_md', start_char=7, end_char=15),
  NEREntity(claim_id=16, ner_text='myofibroblasts', ner_label='CELL', ner_model='ner_bionlp13cg_md', start_char=63, end_char=77)],
 23: [NEREntity(claim_id=23, ner_text='patients', ner_label='ORGANISM', ner_model='ner_bionlp13cg_

In [4]:
import torch
# Constants and configurations
CHECKPOINT_PATH = '/home/qudratealahyratu/research/nlp/fact_checking/my_work/SciMedAttack/results/t5_paws_masked_claim_abstract_paws_3_epoch_2/models/model_3_epochs/'
PARAPHRASE_CONFIG_PARAMS = {
    'max_length': 512,
    'do_sample': True,
    'top_k': 50,
    'top_p': 0.99,
    'repetition_penalty': 3.5,
    'early_stopping': True,
    'num_return_sequences': 10
}
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Utility function to load T5 model
def load_t5_model(checkpoint_path):
    model_t5 = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
    model_t5 = model_t5.to(DEVICE)
    return model_t5

In [6]:
# Loading unique claims and predicting
unique_gold_claims = Claim.get_unique_claims(gold_claims)
dataloader_generator = ClaimDataLoaderGenerator(params, unique_gold_claims[0], corpus_file)
dataloader = dataloader_generator.get_dataloader_by_single_claim()
# prediction_model = ModelPredictorMultivers(params, dataloader, corpus_file)
prediction_model = ModelPredictorMultivers(params, unique_gold_claims[0])
original_claim_predictions_raw = prediction_model.predict()

Some weights of the model checkpoint at allenai/longformer-large-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  stream(template_mgs % msg_args)
100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


In [9]:
type(original_claim_predictions_raw)

T5ParEvo.src.data.data.ClaimPredictions

In [7]:
# Formatting all the predictions
org_claim_predictions = []
cur_prediction =original_claim_predictions_raw
cur_claim = Claim.get_claim_by_id(gold_claims, cur_prediction.claim_id)
claim_predictions = ClaimPredictions.from_formatted_prediction(cur_prediction, gold_claim=cur_claim)
org_claim_predictions.append(claim_predictions)

TypeError: 'ClaimPredictions' object is not subscriptable

In [None]:
# Formatting all the predictions
org_claim_predictions = []
for cur_prediction in tqdm(original_claim_predictions_raw, desc="Formatting predictions"):
    cur_claim = Claim.get_claim_by_id(gold_claims, cur_prediction['id'])
    claim_predictions = ClaimPredictions.from_formatted_prediction(cur_prediction, gold_claim=cur_claim)
    org_claim_predictions.append(claim_predictions)

NameError: name 'original_claim_predictions_raw' is not defined

In [11]:
from transformers import T5ForConditionalGeneration, PreTrainedTokenizer, PreTrainedModel, T5Tokenizer, T5ForConditionalGeneration

In [12]:
# Load T5 model and tokenizer
model_t5 = load_t5_model(CHECKPOINT_PATH)
tokenizer_t5 = T5Tokenizer.from_pretrained('Vamsi/T5_Paraphrase_Paws')


In [15]:
from T5ParEvo.src.paraphrase.paraphrase_claim import ParaphrasedAttack
from T5ParEvo.src.paraphrase.paraphraser import T5Paraphraser, ModelConfig

# Initialize paraphrase model and paraphrase attack
paraphrase_config = ModelConfig(**PARAPHRASE_CONFIG_PARAMS)
paraphrase_model = T5Paraphraser(model_t5, tokenizer_t5, paraphrase_config)
paraphrase_attack = ParaphrasedAttack(paraphrase_model, prediction_model)



In [None]:
# Attack and paraphrase claims
paraphrased_attack_results = []
for cur_claim in tqdm(org_claim_predictions, desc="Attacking and Paraphrasing claims"):
    result = paraphrase_attack.attack(0, cur_claim.gold, cur_claim.predictions)
    paraphrased_attack_results.append(result)

In [17]:
result = paraphrase_attack.attack(0, original_claim_predictions_raw.gold, original_claim_predictions_raw.predictions)

TypeError: predict() takes 1 positional argument but 2 were given