In [2]:
!pip install nltk rouge-score
!pip install rouge


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=362617a1822097b043f89e2e11cca738f7b8919f079747faa2ffdb30e4205c9e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [6]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import math

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import math

# Initialize tokenizer with left padding
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # Set padding to left

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

class MusixmatchDataset(Dataset):
    def __init__(self, filepath, tokenizer, top_words, max_length=768, max_samples=None):
        self.tokenizer = tokenizer
        self.examples = []

        with open(filepath, 'r', encoding='utf-8') as file:
            for i, line in enumerate(file):
                if max_samples is not None and i >= max_samples:
                    break
                if line.startswith('#') or line.startswith('%'):
                    continue
                parts = line.strip().split(',')
                lyrics = []
                for word_count in parts[2:]:
                    idx, cnt = map(int, word_count.split(':'))
                    word = top_words[idx - 1] if idx - 1 < len(top_words) else '[UNK]'
                    lyrics.extend([word] * cnt)

                lyrics_text = ' '.join(lyrics)
                encoded = tokenizer(lyrics_text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
                # Flatten the batch dimension here, if necessary
                input_ids = encoded.input_ids.squeeze()  # Remove unnecessary dimensions
                attention_mask = encoded.attention_mask.squeeze()  # Remove unnecessary dimensions

                self.examples.append({'input_ids': input_ids, 'attention_mask': attention_mask})

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]


def generate_text(input_ids, attention_mask):
    # Assuming input_ids and attention_mask are correctly batched
    generated_text_samples = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=50  # Generate 50 new tokens beyond the length of input_ids
    )
    return tokenizer.decode(generated_text_samples[0], skip_special_tokens=True)
def evaluate_model(test_loader):
    rouge = Rouge()
    bleu_scores = []
    rouge_scores = []
    perplexity_scores = []

    for data in test_loader:
        input_ids, attention_mask = data['input_ids'], data['attention_mask']

        # Debugging: Print shapes
        print("Input IDs shape:", input_ids.shape)
        print("Attention Mask shape:", attention_mask.shape)

        # Check if input_ids is a single tensor without a batch dimension
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)
            attention_mask = attention_mask.unsqueeze(0)

        generated_text = generate_text(input_ids, attention_mask)
        reference_text = tokenizer.decode(input_ids.squeeze(0), skip_special_tokens=True)

        bleu_score = sentence_bleu([reference_text.split()], generated_text.split())
        scores = rouge.get_scores(generated_text, reference_text)
        with torch.no_grad():
            # Additional debugging to ensure labels are correctly shaped
            print("Label IDs shape (for loss calculation):", input_ids.shape)
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
            perplexity = math.exp(loss.item())

        bleu_scores.append(bleu_score)
        rouge_scores.append(scores[0])
        perplexity_scores.append(perplexity)

    return {
        'Average BLEU': sum(bleu_scores) / len(bleu_scores),
        'Average ROUGE': {key: sum(score[key]['f'] for score in rouge_scores) / len(rouge_scores) for key in rouge_scores[0]},
        'Average Perplexity': sum(perplexity_scores) / len(perplexity_scores)
    }

# Function to extract and add words from the file
def add_top_words(file_path, top_words):
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('%'):
                line = line[1:].strip()
                words = line.split(',')
                top_words.extend(words)
                break

# Top words initialization
top_words = []
add_top_words("train.txt", top_words)  # Ensure you load from the correct file



# Create datasets and loaders
train_dataset = MusixmatchDataset('train.txt', tokenizer, top_words, max_samples=1000)
test_dataset = MusixmatchDataset('test.txt', tokenizer, top_words, max_samples=500)


from torch.utils.data import random_split


total_train_samples = len(train_dataset)
val_size = int(0.2 * total_train_samples)  # 20% for validation
train_size = total_train_samples - val_size  # Remaining 80% for training

# Randomly split the dataset into training and validation datasets
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

# Create DataLoaders for each subset
train_loader = DataLoader(train_subset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=1, shuffle=False)

#train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)


# Perform evaluation on the test dataset
evaluation_results = evaluate_model(test_loader)
print("Evaluation Results:", evaluation_results)


# Perform evaluation on the valid dataset
e2 = evaluate_model(val_loader)
print("Evaluation Results:", e2)






Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Evaluation Results: {'Average BLEU': 0.7883825974227959, 'Average ROUGE': {'rouge-1': 0.9875541616247343, 'rouge-2': 0.97965639502926, 'rouge-l': 0.9875541616247343}, 'Average Perplexity': 2019.1430840125938}
Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs shape: torch.Size([1, 768])
Attention Mask shape: torch.Size([1, 768])
Label IDs shape (for loss calculation): torch.Size([1, 768])
Evaluation Results: {'Average BLEU': 0.8091739761216526, 'Average ROUGE': {'rouge-1': 0.9892686876376278, 'rouge-2': 0.982717240182979, 'rouge-l': 0.9892686876376278}, 'Average Perplexity': 1870.8635052325128}


In [7]:
import torch
from torch import nn
from transformers import GPT2LMHeadModel
class CustomGPT2Model(nn.Module):
    def __init__(self, gpt2_model_name='gpt2', lstm_hidden_size=256, conv_channels=128):
        super(CustomGPT2Model, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model_name, output_hidden_states=True)
        self.vocab_size = self.gpt2.config.vocab_size
        self.lstm = nn.LSTM(self.gpt2.config.n_embd, lstm_hidden_size, batch_first=True)
        self.conv1d = nn.Conv1d(lstm_hidden_size, conv_channels, kernel_size=3, padding=1)
        self.adaptation = nn.Linear(conv_channels, self.vocab_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)


    def forward(self, input_ids, attention_mask=None):
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.hidden_states[-1]
        lstm_out, _ = self.lstm(last_hidden_state)
        lstm_out = lstm_out.permute(0, 2, 1)
        conv_out = self.conv1d(lstm_out)
        conv_out = conv_out.permute(0, 2, 1)
        logits = self.adaptation(conv_out)
        return logits


In [37]:
from torch.cuda.amp import GradScaler, autocast

def train_model(train_loader, model, optimizer, criterion, epochs=3, device='cuda'):
    scaler = GradScaler()  # Initialize the gradient scaler
    model.train()

    for epoch in range(epochs):
        for batch in train_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            # Using autocast for the forward pass to enable mixed precision
            with autocast():
                outputs = model(inputs, attention_mask=attention_mask)
                loss = criterion(outputs.transpose(1, 2), labels)  # Adjust the loss for the shape [batch, seq_length, vocab_size]

            # Backpropagation with scaled loss
            scaler.scale(loss).backward()

            # Gradient clipping can be included here within the scaled context
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step with scaler
            scaler.step(optimizer)
            scaler.update()  # Update the scaler

        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Assuming CUDA is available and preferable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomGPT2Model().to(device)

# Define optimizer and loss
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train the model
train_model(train_loader, model, optimizer, criterion)


In [48]:
# Assuming CUDA is available and preferable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomGPT2Model().to(device)

# Define optimizer and loss
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train the model
train_model(train_loader, model, optimizer, criterion)




Epoch 1, Loss: 2.7456443309783936
Epoch 2, Loss: 3.987399101257324
Epoch 3, Loss: 2.7079994678497314


In [39]:
def generate_text(model, input_ids, attention_mask):
    # Move input tensors to the same device as the model
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Call the generate method
    generated_text_samples = model.gpt2.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[1] + 50  # Assuming you want to generate 50 tokens more
    )

    return tokenizer.decode(generated_text_samples[0], skip_special_tokens=True)
def evaluate_model(test_loader, model):
    model.to(model.device)
    rouge = Rouge()
    bleu_scores = []
    rouge_scores = []
    perplexity_scores = []
    criterion = torch.nn.CrossEntropyLoss()

    for data in test_loader:
        input_ids = data['input_ids'].to(model.device)
        attention_mask = data['attention_mask'].to(model.device)

        generated_text = generate_text(model, input_ids, attention_mask)
        reference_text = tokenizer.decode(input_ids.squeeze(0), skip_special_tokens=True)

        logits = model(input_ids, attention_mask=attention_mask)
        loss = criterion(logits.view(-1, model.vocab_size), input_ids.view(-1))

        bleu_score = sentence_bleu([reference_text.split()], generated_text.split())
        scores = rouge.get_scores(generated_text, reference_text)
        perplexity = math.exp(loss.item())

        bleu_scores.append(bleu_score)
        rouge_scores.append(scores[0])
        perplexity_scores.append(perplexity)

    return {
        'Average BLEU': sum(bleu_scores) / len(bleu_scores),
        'Average ROUGE': {key: sum(score[key]['f'] for score in rouge_scores) / len(rouge_scores) for key in rouge_scores[0]},
        'Average Perplexity': sum(perplexity_scores) / len(perplexity_scores)
    }





evaluation_results = evaluate_model(test_loader, model)
print("Evaluation Results:", evaluation_results)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Evaluation Results: {'Average BLEU': 0.9451715977345719, 'Average ROUGE': {'rouge-1': 0.9397921401256496, 'rouge-2': 0.9514508211812858, 'rouge-l': 0.9397921401256496}, 'Average Perplexity': 57.57927617624613}


In [40]:

evaluation_results2 = evaluate_model(val_loader, model)
print("Evaluation Results:", evaluation_results2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Evaluation Results: {'Average BLEU': 0.9536284444905129, 'Average ROUGE': {'rouge-1': 0.9446309635276696, 'rouge-2': 0.9574271360760817, 'rouge-l': 0.9446309635276696}, 'Average Perplexity': 30.206633574107293}


In [57]:
model = CustomGPT2Model().to(device)
model.eval()  # Set the model to evaluation mode
seed_text = "And for a fortnight there we were, Forever running to you"

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
encoded_input = tokenizer.encode(seed_text, return_tensors='pt').to(device)
# Assuming the model has a method to handle generation directly
# Adjust max_length as needed for longer sequences
generated_outputs = generate_text(model,encoded_input,torch.ones_like(encoded_input))

print(generated_outputs)



# Decode the output tokens to text
generated_text = tokenizer.decode(generated_outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 1870,   329,   257, 46327,   612,   356,   547,    11, 28285,  2491,
           284,   345,    13,   198,    40,  1101,  7926,   314,  1422,   470,
           910,   340,   878,   475,   428,   318,   262,   717,   640,   326,
           616,  1438,   468,   587,  4750,   287,   597,   286,   534,  6851,
           290,   783,   994,   338,   644,  3022,    25,   775,   550,   281,
          4578,   546,   703,   881,  1637,   373,  1016,   656,   674,  1923,
           523,   618,  2130,  1965,   502,   611,   484,   714, 16565,   720,
            16,   393,   517,   788,   339,   561,   651,   465,   898,  4866,
             0,  1406,   706,   617,  5114,   351,   683,   357,   392,  1854,
             8,   625,   508,   815,   307, 29798,   511,  5153,   736,   319]],
       device='cuda:0')
And for a fortnight there we were, Forever running to you.
I'm sorry I didn't say it before but this is the first time that my name has been mentioned in any of your posts and now here

In [27]:
from transformers import pipeline
sentiment_analyzer = pipeline("sentiment-analysis")
tokenizer.padding_side = 'left'


def generate_text(model, input_ids, attention_mask):
    # Move input tensors to the same device as the model
    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Truncate input_ids and attention_mask to max length of 512 if longer
    max_length = 512
    if input_ids.size(1) > max_length:
        input_ids = input_ids[:, :max_length]
        attention_mask = attention_mask[:, :max_length]

    # Call the generate method with truncated input
    generated_text_samples = model.gpt2.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[1] + 50  # Generate up to 50 tokens more
    )

    return tokenizer.decode(generated_text_samples[0], skip_special_tokens=True)




def get_sentiment_scores(lyrics):
    lines = lyrics.strip().split('\n')
    sentiments = []
    for line in lines:
        if line.strip():  # Ensure the line has content
            # Directly use the sentiment analyzer pipeline which handles tokenization
            result = sentiment_analyzer(line)
            if result:
                sentiments.append(result[0])
    sentiment_scores = [sent['score'] * (1 if sent['label'] == 'POSITIVE' else -1) for sent in sentiments if 'score' in sent]
    return sentiment_scores

def evaluate_sentiment(test_loader, model, tokenizer):
    model.eval()
    model.to(model.device)
    all_sentiments = []

    for data in test_loader:
        input_ids = data['input_ids'].to(model.device)
        attention_mask = data['attention_mask'].to(model.device)

        # Make sure to truncate or pad input sequences to the model's max input size
        if input_ids.size(1) > 512:
            input_ids = input_ids[:, :512]
            attention_mask = attention_mask[:, :512]

        generated_text = generate_text(model, input_ids, attention_mask)
        sentiment_scores = get_sentiment_scores(generated_text)
        all_sentiments.append(sentiment_scores)

    return np.array(all_sentiments)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [28]:
import seaborn as sns
import matplotlib.pyplot as plt

sentiment_data = evaluate_sentiment(test_loader, model, tokenizer)
plt.figure(figsize=(10, 6))
sns.heatmap(sentiment_data, cmap='coolwarm', cbar=True)
plt.title('Emotional Progression in Song Lyrics')
plt.xlabel('Line Index')
plt.ylabel('Song Index')
plt.show()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but ri

RuntimeError: The size of tensor a (565) must match the size of tensor b (512) at non-singleton dimension 1