In [20]:
from datasets import load_dataset

# Load the "Blaise-g/scitldr" dataset from the Hugging Face hub; no split is
# requested here, and downloaded/prepared files are cached under the scratch directory.
dataset = load_dataset("Blaise-g/scitldr", cache_dir = '/scratch/ramprasad.sa/huggingface_datasets')

Downloading metadata:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading and preparing dataset scitldr/FullText (download: 46.14 MiB, generated: 95.46 MiB, post-processed: Unknown size, total: 141.60 MiB) to /scratch/ramprasad.sa/huggingface_datasets/Blaise-g___parquet/Blaise-g--scitldr-0e01d2d9de8032a7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/29.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.67M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1992 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/619 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/618 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /scratch/ramprasad.sa/huggingface_datasets/Blaise-g___parquet/Blaise-g--scitldr-0e01d2d9de8032a7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [29]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
import dataset_creators.config as config
import pandas as pd
import os
import uuid
import re

In [40]:
def check_prompt_token_limits(article, instructions, tokenizer, token_limit):
    """Count the instruction prompts for ``article`` that stay under ``token_limit``.

    Every value of ``instructions`` is placed on a new line after
    ``Article: <article>`` to form a prompt, which is then encoded with
    ``tokenizer.encode``.

    Args:
        article: Article text interpolated into each prompt.
        instructions: Mapping whose values are the instruction texts.
        tokenizer: Object exposing ``encode(text)`` returning a sized sequence.
        token_limit: Exclusive upper bound on the encoded prompt length.

    Returns:
        Number of prompts whose encoded length is strictly below ``token_limit``.
    """
    # The mapping keys are irrelevant for counting, so only the values are visited.
    prompt_lengths = (
        len(tokenizer.encode(f'Article: {article}\n{instruction_text}'))
        for instruction_text in instructions.values()
    )
    return sum(1 for length in prompt_lengths if length < token_limit)

def get_shortlisted_data(articles, reference_summaries, ids = None, token_limit = 4096, dataset = 'xsum'):
    """Keep only the articles whose prompts fit the token budget of both models.

    For each article, every instruction in
    ``config.instructions[f'{dataset}_flant5']`` and
    ``config.instructions[f'{dataset}_gpt3']`` is turned into a prompt and
    measured with the matching tokenizer (``google/flan-t5-xl`` and
    ``gpt-3.5-turbo`` respectively). An article is shortlisted only when all
    of those prompts are shorter than ``token_limit`` tokens.

    Args:
        articles: Iterable of article texts.
        reference_summaries: Reference summaries, index-aligned with ``articles``.
        ids: Optional identifiers, index-aligned with ``articles``. When
            omitted or empty, a random UUID4 string is generated for every
            shortlisted article instead.
        token_limit: Exclusive upper bound on the token length of each prompt.
        dataset: Prefix used to select the instruction sets in ``config.instructions``.

    Returns:
        A tuple ``(shortlisted_articles, shortlisted_reference_summaries,
        shortlisted_ids)`` of three parallel lists.
    """
    flan_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
    flan_instructions = config.instructions[f'{dataset}_flant5']

    gpt_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
    gpt_instructions = config.instructions[f'{dataset}_gpt3']

    # An article qualifies only if every prompt of both models fits. The
    # threshold is derived from the instruction sets themselves rather than
    # hard-coded to 4, so it stays correct if the config gains or loses prompts.
    required_fits = len(flan_instructions) + len(gpt_instructions)

    shortlisted_articles = []
    shortlisted_reference_summaries = []
    shortlisted_ids = []

    for idx, article in enumerate(articles):
        flan_fits = check_prompt_token_limits(article, flan_instructions, flan_tokenizer, token_limit)
        gpt_fits = check_prompt_token_limits(article, gpt_instructions, gpt_tokenizer, token_limit)

        if flan_fits + gpt_fits != required_fits:
            continue

        shortlisted_articles.append(article)
        shortlisted_reference_summaries.append(reference_summaries[idx])
        # Fall back to a freshly generated identifier when the caller gave none.
        shortlisted_ids.append(ids[idx] if ids else str(uuid.uuid4()))

    return shortlisted_articles, shortlisted_reference_summaries, shortlisted_ids

def preprocess_html_tags(article):
    """Return ``article`` with every ``<...>`` span removed.

    Matching is non-greedy, so each span ends at the nearest ``>``. The
    pattern is compiled without DOTALL, so a span that contains a newline
    is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', article)

def make_sample_scitldr(data_path, token_limit = 4096):
    """Build the SciTLDR test sample and save it as ``test_sample.csv``.

    Loads the ``test`` split of ``allenai/scitldr``, joins each ``source``
    entry into one newline-separated article, keeps the articles accepted by
    ``get_shortlisted_data`` and writes them to ``{data_path}/test_sample.csv``.

    Args:
        data_path: Output directory; created if it does not exist.
        token_limit: Token budget forwarded to ``get_shortlisted_data``.

    Returns:
        The ``pandas.DataFrame`` that was written, with columns ``article``,
        ``reference_summary``, ``id`` and ``origin``.
    """
    dataset = load_dataset("allenai/scitldr", split = 'test', cache_dir = '/scratch/ramprasad.sa/huggingface_datasets')
    # Each 'source' entry is a sequence of strings; join it into a single article text.
    articles = ['\n'.join(each) for each in dataset['source']]
    reference_summaries = dataset['target']
    ids = dataset['paper_id']

    # token_limit is forwarded explicitly; it used to be accepted but ignored,
    # so callers could not actually change the budget.
    # NOTE(review): the 'chemsum' instruction set is used for SciTLDR — confirm this is intended.
    shortlisted_articles, shortlisted_reference_summaries, shortlisted_ids = get_shortlisted_data(
        articles, reference_summaries, ids, token_limit = token_limit, dataset = 'chemsum'
    )

    shortlisted_data = {
        'article': shortlisted_articles,
        'reference_summary': shortlisted_reference_summaries,
        'id': shortlisted_ids,
        'origin': ['scitldr'] * len(shortlisted_ids),
    }

    # exist_ok avoids the separate existence check (and the race it implies).
    os.makedirs(data_path, exist_ok = True)

    df = pd.DataFrame(shortlisted_data)
    df.to_csv(f'{data_path}/test_sample.csv')
    return df

In [41]:
# Build the SciTLDR test sample, write it to <path>/test_sample.csv and display the returned DataFrame.
# NOTE(review): the recorded log below mentions the cached "Blaise-g" parquet dataset, while
# make_sample_scitldr loads "allenai/scitldr" — the output may predate the current definition; re-run to confirm.
make_sample_scitldr('/home/ramprasad.sa/factual_annotation_llm_summaries/datasets/scitldr')

Found cached dataset parquet (/scratch/ramprasad.sa/huggingface_datasets/Blaise-g___parquet/Blaise-g--scitldr-0e01d2d9de8032a7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Token indices sequence length is longer than the specified maximum sequence length for this model (8739 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,article,reference_summary,id,origin
0,The variational autoencoder (VAE) is a popular...,"To address posterior collapse in VAEs, we prop...",rylDfnCqF7,scitldr
1,Generative models have been successfully appli...,The paper uses Variational Auto-Encoding and n...,HJgOl3AqY7,scitldr
2,The quality of the features used in visual rec...,A simple fast method for extracting visual fea...,SyGT_6yCZ,scitldr
3,Humans are capable of attributing latent menta...,We proposed a novel probabilisitic recursive r...,rkl6As0cF7,scitldr
4,Deep learning has found numerous applications ...,We introduce a transparent middleware for neur...,rkf5hnNDj7,scitldr
5,Humans possess an ability to abstractly reason...,"We introduce Recurrent Relational Networks, a ...",SkJKHMW0Z,scitldr
6,Modern Convolutional Neural Networks (CNNs) ar...,"In this paper, we develop fast retraining-free...",rkz1YD0vjm,scitldr
7,Backprop is the primary learning algorithm use...,We ignore non-linearities and do not compute g...,ByfPDyrYim,scitldr
8,Deep network compression seeks to reduce the n...,We seek to understand learned representations ...,HJWpQCa7z,scitldr
9,"This paper introduces HybridNet, a hybrid neur...",It is a hybrid neural architecture to speed-up...,rJoXrxZAZ,scitldr


In [43]:
from datasets import load_dataset

# Load the test split of "allenai/scitldr"; no config name is given, so the
# library falls back to its default configuration.
dataset = load_dataset("allenai/scitldr", split = 'test', cache_dir = '/scratch/ramprasad.sa/huggingface_datasets')

No config specified, defaulting to: scitldr/Abstract
Found cached dataset scitldr (/scratch/ramprasad.sa/huggingface_datasets/allenai___scitldr/Abstract/0.0.0/79e0fa75961392034484808cfcc8f37deb15ceda153b798c92d9f621d1042fef)


In [48]:
# Print the first example's 'source' field, one element per line.
print('\n'.join(dataset['source'][0]))

Incremental class learning involves sequentially learning classes in bursts of examples from the same class.
This violates the assumptions that underlie  methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting.
Arguably, the best method for incremental class learning is iCaRL, but it requires storing  training examples for each class, making it challenging to scale.
Here, we propose FearNet for incremental class learning.
FearNet is a generative model that does not store previous examples, making it memory efficient.
FearNet uses a brain-inspired dual-memory system in which new memories are consolidated from a network for recent memories inspired by the mammalian hippocampal complex to a network for long-term storage inspired by medial prefrontal cortex.
Memory consolidation is inspired by mechanisms that occur during sleep.
FearNet also uses a module inspired by the basolateral amygdala for determining which memory system to use f

In [37]:
# Show the 'target' field of the first example (the field make_sample_scitldr uses as the reference summary).
dataset['target'][0]

'FearNet is a memory efficient neural-network, inspired by memory formation in the mammalian brain, that is capable of incremental class learning without catastrophic forgetting.'