In [8]:
import torch
torch.cuda.empty_cache()

In [9]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
from datasets import load_dataset
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [10]:
db=load_dataset("scientific_papers","arxiv")
db

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})

In [11]:
db['train']=db['train'].shuffle(seed=42).select(range(10000))
db['validation']=db['validation'].shuffle(seed=42).select(range(3000))
db['test']=db['test'].shuffle(seed=42).select(range(1000))

In [12]:
model_id='google/flan-t5-small'
tokenizer=AutoTokenizer.from_pretrained(model_id)
model=AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [13]:
import transformers
data_collator = transformers.data.data_collator.default_data_collator

In [14]:
from torch.utils.data import DataLoader

def preprocess_data(data):
    model_inputs=tokenizer(["summarize"+article for article in data['article']],max_length=256,padding='max_length',truncation=True)
    labels=tokenizer(data['abstract'],max_length=128,padding='max_length',truncation=True)
    model_inputs['labels']=labels['input_ids']
    return model_inputs

db_train=db['train'].map(preprocess_data,batched=True,remove_columns=['article','abstract','section_names'])
db_validation=db['validation'].map(preprocess_data,batched=True,remove_columns=['article','abstract','section_names'])

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
db_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [15]:
data_collator = transformers.data.data_collator.default_data_collator
train_loader = DataLoader(db_train, batch_size=16, shuffle=False, drop_last=False, collate_fn=data_collator)
eval_loader = DataLoader(db_validation,batch_size=16, shuffle=False, drop_last=False, collate_fn=data_collator)

In [16]:
#Convert to composer model
from torchmetrics.classification import MulticlassAccuracy
from composer.models.huggingface import HuggingFaceModel
from composer.metrics import LanguageCrossEntropy

metrics = [LanguageCrossEntropy()]
# Package as a trainer-friendly Composer model
composer_model = HuggingFaceModel(model, tokenizer=tokenizer, metrics=metrics,use_logits=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
len(train_loader)

625

In [18]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

optimizer = AdamW(
    params=composer_model.parameters(),
    lr=3e-5, betas=(0.9, 0.98),
    eps=1e-6, weight_decay=3e-6
)
linear_lr_decay = LinearLR(
    optimizer, start_factor=1.0,
    end_factor=0, total_iters=150
)

In [19]:
import torch
from composer import Trainer

# Create Trainer Object
trainer = Trainer(
    model=composer_model, # This is the model from the HuggingFaceModel wrapper class.
    train_dataloader=train_loader,
    eval_dataloader=eval_loader,
    max_duration="1ep",
    optimizers=optimizer,
    schedulers=[linear_lr_decay],
    device='gpu' if torch.cuda.is_available() else 'cpu',
    train_subset_num_batches=150,
    precision='fp32',
    seed=17,
    
)
# Start training
trainer.fit()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
******************************
Config:
composer_commit_hash: None
composer_version: 0.17.2
node_name: unknown because NODENAME environment variable not set
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 17

******************************


train          Epoch   0:    0%|| 0/150 [00:00<?, ?ba/s]                                                      …

eval           Epoch   0:    0%|| 0/188 [00:00<?, ?ba/s]                                                      …



In [20]:
trainer.state.eval_metrics

{'eval': {'LanguageCrossEntropy': LanguageCrossEntropy(
    (loss_fn): CrossEntropyLoss()
  )}}

In [21]:
eval_batch = next(iter(eval_loader))

# Move batch to gpu
eval_batch = {k: v.cuda() if torch.cuda.is_available() else v for k, v in eval_batch.items()}
with torch.no_grad():
    predictions = composer_model(eval_batch)["logits"].argmax(dim=1)

# Visualize only 5 samples
predictions = predictions[:5]


In [22]:
torch.save(trainer.state.model.state_dict(), 'mosaic_summarize.pt')

In [58]:
loaded_model = torch.load('mosaic_summarize.pt')
newmodel.load_state_dict(loaded_model['model_state_dict'])

NameError: name 'newmodel' is not defined

In [62]:
db['test'][10]['article']

'markov chain monte carlo ( mcmc ) has become a standard tool in bayesian analysis .\nthe greatest benefit of mcmc is its generality  it is guaranteed to be consistent with virtually no assumptions on the underlying model . however , the practical applicability of mcmc generally depends on the dimension of the unknown variables , the number of data , and the computational resources available .\nfurthermore , because mcmc is sequential in nature , it can be difficult to implement efficiently with modern parallel and distributed computing architectures ; see @xcite for general discussion about mcmc in challenging scenarios , and with parallel computing architectures .\nin recent years , a number of generic approximation methods have been developed for complex bayesian inference , for instance variational bayes ( cf .\nreviews * ? ? ?\n* ; * ? ? ?\n* ) , expectation propagation @xcite and laplace approximations @xcite .\nthese methods have been reported to provide accurate enough inferenc

In [23]:
from transformers import pipeline
from random import randrange

# load model and tokenizer from huggingface hub with pipeline
#summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)
summarizer=pipeline('summarization',model=model,device='cuda',tokenizer=tokenizer)

# select a random test sample
#sample = db['test'][randrange(len(db["test"]))]
sample=db['test'][10]['article']
print(f"article: \n{sample}\n\n\n")
# summarize dialogue
res = summarizer(sample)
print("..................................................Summarize Text..................................")
print(f"flan-t5-base summary:\n{res[0]['summary_text']}")

Token indices sequence length is longer than the specified maximum sequence length for this model (25591 > 512). Running this sequence through the model will result in indexing errors


article: 
markov chain monte carlo ( mcmc ) has become a standard tool in bayesian analysis .
the greatest benefit of mcmc is its generality  it is guaranteed to be consistent with virtually no assumptions on the underlying model . however , the practical applicability of mcmc generally depends on the dimension of the unknown variables , the number of data , and the computational resources available .
furthermore , because mcmc is sequential in nature , it can be difficult to implement efficiently with modern parallel and distributed computing architectures ; see @xcite for general discussion about mcmc in challenging scenarios , and with parallel computing architectures .
in recent years , a number of generic approximation methods have been developed for complex bayesian inference , for instance variational bayes ( cf .
reviews * ? ? ?
* ; * ? ? ?
* ) , expectation propagation @xcite and laplace approximations @xcite .
these methods have been reported to provide accurate enough infere

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.88 GiB. GPU 0 has a total capacty of 39.39 GiB of which 4.10 GiB is free. Including non-PyTorch memory, this process has 35.28 GiB memory in use. Of the allocated memory 33.89 GiB is allocated by PyTorch, and 907.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF