In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
import dataset_creators.config as config
import pandas as pd
import os
import uuid
import tqdm

In [5]:

# Load the test split of the PubMed summarization dataset via the given cache directory.
dataset = load_dataset(
    "ccdv/pubmed-summarization",
    split="test",
    cache_dir='/scratch/ramprasad.sa/huggingface_datasets',
)

No config specified, defaulting to: pubmed-summarization/section
Found cached dataset pubmed-summarization (/scratch/ramprasad.sa/huggingface_datasets/ccdv___pubmed-summarization/section/1.0.0/f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b)


In [10]:

def check_prompt_token_limits(article, instructions, tokenizer, token_limit):
    """Count the instruction prompts for ``article`` that stay under ``token_limit``.

    For every instruction a prompt of the form ``'Article: <article>\\n<instruction>'``
    is built and encoded with ``tokenizer.encode``; a prompt counts when its
    encoded length is strictly smaller than ``token_limit``.

    Args:
        article: Article text inserted into each prompt.
        instructions: Mapping whose values are the instruction strings
            (the keys are not used).
        tokenizer: Object exposing ``encode(text)`` that returns a sized sequence.
        token_limit: Exclusive upper bound on the encoded prompt length.

    Returns:
        int: Number of prompts whose encoded length is below ``token_limit``.
    """
    encoded_lengths = (
        len(tokenizer.encode(f'Article: {article}\n{instruction}'))
        for instruction in instructions.values()
    )
    return sum(1 for n_tokens in encoded_lengths if n_tokens < token_limit)

def get_shortlisted_data(articles, reference_summaries, ids = None, token_limit = 4096, dataset = 'xsum'):
    """Keep only the articles whose prompts fit the token limit for both models.

    Each article is combined with every instruction registered in
    ``config.instructions`` under ``'<dataset>_flant5'`` and ``'<dataset>_gpt3'``.
    The Flan-T5 prompts are measured with the ``google/flan-t5-xl`` tokenizer and
    the GPT prompts with the ``gpt-3.5-turbo`` tiktoken encoding. An article is
    shortlisted only when every one of those prompts is below ``token_limit``.

    Args:
        articles: Sequence of article texts.
        reference_summaries: Sequence indexed in parallel with ``articles``.
        ids: Optional sequence of ids indexed in parallel with ``articles``.
            When missing or empty, a fresh ``uuid4`` string is generated for
            each shortlisted article.
        token_limit: Exclusive upper bound on the encoded prompt length.
        dataset: Prefix used to look up the instruction sets in
            ``config.instructions``.

    Returns:
        tuple: ``(shortlisted_articles, shortlisted_reference_summaries,
        shortlisted_ids)`` as three parallel lists.
    """
    # Imported locally under a distinct name: the notebook's import cell runs
    # `import tqdm` after `from tqdm import tqdm`, which leaves the global name
    # `tqdm` bound to the (non-callable) module on a fresh kernel.
    from tqdm import tqdm as progress_bar

    flan_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
    flan_instructions = config.instructions[f'{dataset}_flant5']

    gpt_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    gpt_instructions = config.instructions[f'{dataset}_gpt3']

    # An article qualifies only if *every* prompt fits; derive that count from
    # the instruction sets instead of hard-coding it.
    required_fits = len(flan_instructions) + len(gpt_instructions)

    shortlisted_articles = []
    shortlisted_reference_summaries = []
    shortlisted_ids = []

    for idx, article in enumerate(progress_bar(articles)):
        fitting_prompts = (
            check_prompt_token_limits(article, flan_instructions, flan_tokenizer, token_limit)
            + check_prompt_token_limits(article, gpt_instructions, gpt_tokenizer, token_limit)
        )
        if fitting_prompts != required_fits:
            continue

        shortlisted_articles.append(article)
        shortlisted_reference_summaries.append(reference_summaries[idx])
        # Reuse the caller-supplied id when available, otherwise mint a new one.
        shortlisted_ids.append(ids[idx] if ids else str(uuid.uuid4()))

    return shortlisted_articles, shortlisted_reference_summaries, shortlisted_ids


def make_sample_pubmed(data_path, token_limit = 4096, cache_dir = '/scratch/ramprasad.sa/huggingface_datasets'):
    """Build the shortlisted PubMed test sample and save it as a CSV.

    Loads the test split of ``ccdv/pubmed-summarization``, keeps the articles
    accepted by ``get_shortlisted_data`` for the ``'pubmed'`` instruction sets,
    and writes the result to ``<data_path>/test_sample.csv``.

    Args:
        data_path: Output directory; created if it does not exist.
        token_limit: Token limit forwarded to ``get_shortlisted_data``.
        cache_dir: Directory passed to ``load_dataset`` as its cache location.

    Returns:
        pandas.DataFrame: Columns ``article``, ``reference_summary``, ``id``
        and ``origin`` (always ``'pubmed'``), one row per shortlisted article.
    """
    pubmed_test = load_dataset("ccdv/pubmed-summarization", split="test", cache_dir = cache_dir)
    articles = pubmed_test['article']
    reference_summaries = pubmed_test['abstract']

    # Forward token_limit so the argument actually controls the filtering.
    shortlisted_articles, shortlisted_reference_summaries, shortlisted_ids = get_shortlisted_data(
        articles,
        reference_summaries,
        token_limit = token_limit,
        dataset = 'pubmed',
    )

    shortlisted_data = {
        'article': shortlisted_articles,
        'reference_summary': shortlisted_reference_summaries,
        'id': shortlisted_ids,
        'origin': ['pubmed'] * len(shortlisted_ids),
    }

    # exist_ok avoids the separate existence check (and the race it implies).
    os.makedirs(data_path, exist_ok=True)

    df = pd.DataFrame(shortlisted_data)
    # NOTE: the index is written as well (pandas default), so the CSV starts
    # with an unnamed column; read it back with index_col=0 to avoid an extra
    # 'Unnamed: 0' column.
    df.to_csv(os.path.join(data_path, 'test_sample.csv'))
    return df



In [11]:
# Build the shortlisted PubMed sample. make_sample_pubmed writes it to
# <data_path>/test_sample.csv and returns it as a DataFrame.
shortlisted_data = make_sample_pubmed('/home/ramprasad.sa/factual_annotation_llm_summaries/datasets/pubmed')
# pd.DataFrame(shortlisted_data)

No config specified, defaulting to: pubmed-summarization/section
Found cached dataset pubmed-summarization (/scratch/ramprasad.sa/huggingface_datasets/ccdv___pubmed-summarization/section/1.0.0/f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b)
  0%|                                                                                                                                                                                                                                                  | 0/6658 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4625 > 512). Running this sequence through the model will result in indexing errors
  1%|█▊                                                                                                                                                                                                                                       | 51/6658 [00:01<04:05, 26.97it/s]

KeyboardInte

In [8]:
# Display the shortlisted sample assigned from make_sample_pubmed.
shortlisted_data

Unnamed: 0,article,reference_summary,id,origin
0,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...,f05ee546-a3c4-40fb-b604-9497cb165de8,pubmed
1,the family is the cornerstone of human social ...,background : since the family is a social syst...,7bca5d77-7541-4ff9-a46f-d446b72dd9cd,pubmed
2,development of human societies and industriali...,background and objective : anxiety and depre...,0fc68a72-a6d3-4ec9-8f43-61124446a064,pubmed
3,male macroprolactinomas ( mprl ) are usually r...,background : suppurative meningitis ( sm ) or ...,86ed3313-6fde-46d4-b555-0afd6821f378,pubmed
4,the femoral head often leads to healing compli...,fracture of the femoral neck continues to be a...,861ad9de-1ed4-4dc2-94f3-b4ec29b8e99e,pubmed
...,...,...,...,...
3499,odorant binding proteins ( obps ) were identif...,odorant - binding proteins ( obps ) were disco...,543ed531-f820-48f4-ba82-bb7cd18b0f56,pubmed
3500,minimally invasive surgery was first described...,purposethis study aimed to comparatively evalu...,29d98812-1c9e-456b-9e7c-1836a5d7c478,pubmed
3501,a 60-year - old male patient was referred to a...,warthin 's tumor is the second most common typ...,e7456860-d6cc-4811-955b-69eea48b860c,pubmed
3502,they almost always require open reduction and ...,introduction : subtrochanteric fractures have ...,bc18fe1d-dd83-4523-a52c-b9bf465b8d78,pubmed


In [6]:
# Number of abstracts in the loaded test split.
len(dataset['abstract'])

6658

In [9]:
# Peek at the first abstract of the loaded split.
dataset['abstract'][0]

"research on the implications of anxiety in parkinson 's disease ( pd ) has been neglected despite its prevalence in nearly 50% of patients and its negative impact on quality of life . \n previous reports have noted that neuropsychiatric symptoms impair cognitive performance in pd patients ; however , to date , no study has directly compared pd patients with and without anxiety to examine the impact of anxiety on cognitive impairments in pd . \n this study compared cognitive performance across 50 pd participants with and without anxiety ( 17 pda+ ; 33 pda ) , who underwent neurological and neuropsychological assessment . \n group performance was compared across the following cognitive domains : simple attention / visuomotor processing speed , executive function ( e.g. , set - shifting ) , working memory , language , and memory / new verbal learning . \n results showed that pda+ performed significantly worse on the digit span forward and backward test and part b of the trail making task

In [12]:
# df = pd.read_csv('/home/ramprasad.sa/factual_annotation_llm_summaries/datasets/pubmed/test_sample.csv')
# df

Unnamed: 0.1,Unnamed: 0,article,reference_summary,id,origin
0,0,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia is a group of a...,f86b1a9c-e296-4a7f-ba07-1920e1f2df63,pubmed
1,1,the family is the cornerstone of human social ...,background : since the family is a social syst...,4840cac6-eb99-4c49-b260-df9bd7029794,pubmed
2,2,development of human societies and industriali...,background and objective : anxiety and depre...,fe2f2929-1cef-4d11-a0b8-6ed2f6fc4e30,pubmed
3,3,male macroprolactinomas ( mprl ) are usually r...,background : suppurative meningitis ( sm ) or ...,f9c34a2c-2139-4f59-ad83-4cbb5db65927,pubmed
4,4,the femoral head often leads to healing compli...,fracture of the femoral neck continues to be a...,b8889d29-54d1-482c-adea-e523306391bc,pubmed
...,...,...,...,...,...
3499,3499,odorant binding proteins ( obps ) were identif...,odorant - binding proteins ( obps ) were disco...,0f88f002-8c4a-421c-8fb4-9e40bb27d9b1,pubmed
3500,3500,minimally invasive surgery was first described...,purposethis study aimed to comparatively evalu...,ebec1384-1956-4bd8-8235-81d9f56c5c6d,pubmed
3501,3501,a 60-year - old male patient was referred to a...,warthin 's tumor is the second most common typ...,b2414771-8bc2-446c-849d-311e0b3c2c21,pubmed
3502,3502,they almost always require open reduction and ...,introduction : subtrochanteric fractures have ...,7800e5c6-a945-4511-a5b2-20efc08b49c3,pubmed
