In [46]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch

In [25]:
# Load pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained('distilbert/distilgpt2')
finetuned_model = PeftModel.from_pretrained(model, "../output")
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilgpt2')

# Input text
input_text = "Once upon a time"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors='pt')
print(type(inputs['input_ids']))

output = finetuned_model.generate(inputs['input_ids'], max_length=50, num_return_sequences=1)

# Decode the generated tokens
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<class 'torch.Tensor'>
Once upon a time when the world was full of people, the world was full of people who were not in the world, but were in the world.
"I'm not a fan of the idea of a world where people are not in the


In [7]:
data_path = "../data/processed_data/test.json"


In [30]:
def process_data(tokenizer):
    dataset = load_dataset("json", data_files=data_path, split="train[:1500]")

    def format_example(example):
        instruction = f"Question: {example['Question']}\nAnalysis: {example['Complex_CoT']}"
        inputs = tokenizer(
            f"{instruction}\n<|endoftext|>",
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        return {"input_ids": inputs["input_ids"].squeeze(0), "attention_mask": inputs["attention_mask"].squeeze(0)}

    return dataset.map(format_example, remove_columns=dataset.column_names)

In [31]:
tokenizer.pad_token = tokenizer.eos_token
dataset = process_data(tokenizer)

Map: 100%|██████████| 1500/1500 [00:00<00:00, 2729.78 examples/s]


In [19]:
dataset?

[0;31mType:[0m        Dataset
[0;31mString form:[0m
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1500
})
[0;31mLength:[0m      1500
[0;31mFile:[0m        ~/.virtualenvs/llm_project/lib/python3.10/site-packages/datasets/arrow_dataset.py
[0;31mDocstring:[0m   A Dataset backed by an Arrow table.

In [20]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1500
})

In [47]:
dataloader = DataLoader(dataset, batch_size=8, collate_fn=lambda x: {
    "input_ids": torch.stack([torch.tensor(d["input_ids"]) for d in x]),
    "attention_mask": torch.stack([torch.tensor(d["attention_mask"]) for d in x])
})

In [None]:
for data in dataloader:
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    output = model.generate(input_ids = input_ids, attention_mask = attention_mask, max_new_tokens=100,
                temperature=0.7, pad_token_id=tokenizer.pad_token_id)
    
    print(output)
    break

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([[24361,    25, 16213,  ..., 50256, 50256, 50256],
        [24361,    25, 16213,  ..., 50256, 50256, 50256],
        [24361,    25, 16213,  ..., 50256, 50256, 50256],
        ...,
        [24361,    25, 16213,  ..., 50256, 50256, 50256],
        [24361,    25, 16213,  ..., 50256, 50256, 50256],
        [24361,    25, 16213,  ..., 50256, 50256, 50256]])


In [39]:
for example in dataset["input_ids"]:
    print(example)
    finetuned_model.generate(torch.tensor(example))
    break
    

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[24361, 25, 16213, 2736, 262, 1708, 6126, 329, 5465, 4046, 393, 5859, 3303, 25, 366, 14181, 2488, 18050, 45, 419, 261, 25, 1318, 318, 257, 24984, 51, 6006, 3185, 379, 262, 860, 12, 1157, 13257, 30, 843, 484, 3877, 13938, 13, 632, 338, 257, 2347, 12296, 345, 9372, 269, 34115, 13, 1867, 257, 27690, 526, 198, 32750, 25, 770, 6126, 4909, 3303, 326, 460, 307, 10090, 355, 5859, 3303, 13, 383, 6126, 468, 587, 11765, 416, 3294, 24708, 2024, 11, 351, 657, 1398, 4035, 340, 355, 5465, 4046, 11, 513, 355, 5859, 3303, 11, 290, 657, 355, 6159, 13, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

IndexError: tuple index out of range

In [36]:
type(dataset["input_ids"])

list