In [1]:
from datetime import datetime
import os
import sys

import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq


# %% [markdown]
# (If you have import errors, try restarting your Jupyter kernel)
# 

# %% [markdown]
# ### Load dataset
# 

# %%
from datasets import load_dataset
from datasets import load_dataset
train_dataset = load_dataset('json', data_files='/home/sam/finetune-llm-for-rag/datasets/viggo/gem-viggo-train-with-0-examples-random-False-emb_fn-text-embedding-ada-002.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='/home/sam/finetune-llm-for-rag/datasets/viggo/gem-viggo-test-with-0-examples-random-False-emb_fn-text-embedding-ada-002.jsonl', split='train')
print(train_dataset)
print(eval_dataset)

Downloading and preparing dataset json/default to /home/sam/.cache/huggingface/datasets/json/default-62a2819ae0a15382/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/sam/.cache/huggingface/datasets/json/default-62a2819ae0a15382/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.
Downloading and preparing dataset json/default to /home/sam/.cache/huggingface/datasets/json/default-8668502917b5e8a5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/sam/.cache/huggingface/datasets/json/default-8668502917b5e8a5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.
Dataset({
    features: ['gem_id', 'meaning_representation', 'target', 'references', 'full_prompt', 'inference_prompt'],
    num_rows: 5103
})
Dataset({
    features: ['gem_id', 'meaning_representation', 'target', 'references', 'full_prompt', 'inference_prompt'],
    num_rows: 1083
})


In [2]:
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [3]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# %% [markdown]
# Setup the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

# %%
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=700,
        padding=False,
        return_tensors=None,
    )
    # print("result is: ", result)
    print("length of input ids: ", len(result["input_ids"]))

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

# %% [markdown]
# And run convert each data_point into a prompt that I found online that works quite well:

# %%
def generate_and_tokenize_prompt(data_point):

    full_prompt = data_point["full_prompt"]
    print("length of full prompt: ", len(full_prompt))
    # print(full_prompt)
    return tokenize(full_prompt)

In [5]:
# tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
max_len_input = 0
for data_point in eval_dataset:
    tokenized = generate_and_tokenize_prompt(data_point)
    length_of_input_ids = len(tokenized["input_ids"])
    if length_of_input_ids > max_len_input:
        max_len_input = length_of_input_ids
    # break
print("max_len_input: ", max_len_input)

# tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

length of full prompt:  1019
length of input ids:  255
length of full prompt:  1049
length of input ids:  266
length of full prompt:  1020
length of input ids:  258
length of full prompt:  1249
length of input ids:  340
length of full prompt:  1272
length of input ids:  342
length of full prompt:  1276
length of input ids:  344
length of full prompt:  1091
length of input ids:  292
length of full prompt:  1059
length of input ids:  282
length of full prompt:  1136
length of input ids:  297
length of full prompt:  974
length of input ids:  248
length of full prompt:  963
length of input ids:  247
length of full prompt:  946
length of input ids:  246
length of full prompt:  866
length of input ids:  222
length of full prompt:  863
length of input ids:  221
length of full prompt:  856
length of input ids:  220
length of full prompt:  1027
length of input ids:  268
length of full prompt:  1082
length of input ids:  285
length of full prompt:  1021
length of input ids:  271
length of full p

length of input ids:  290
length of full prompt:  1142
length of input ids:  313
length of full prompt:  1073
length of input ids:  293
length of full prompt:  1108
length of input ids:  303
length of full prompt:  1060
length of input ids:  285
length of full prompt:  1156
length of input ids:  305
length of full prompt:  1081
length of input ids:  286
length of full prompt:  941
length of input ids:  248
length of full prompt:  931
length of input ids:  246
length of full prompt:  946
length of input ids:  253
length of full prompt:  1209
length of input ids:  316
length of full prompt:  1176
length of input ids:  309
length of full prompt:  1226
length of input ids:  317
length of full prompt:  885
length of input ids:  235
length of full prompt:  903
length of input ids:  239
length of full prompt:  919
length of input ids:  240
length of full prompt:  1033
length of input ids:  277
length of full prompt:  1004
length of input ids:  272
length of full prompt:  1001
length of input 

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Add EOS token and padding configurations
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Initialize variables



The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
print(tokenizer.bos_token)
print(tokenizer.eos_token)

<s>
</s>


In [15]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

In [26]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Tokenize a short prompt with padding and max_length set to 10
prompt = "Hello saudfjapwejsdjfioap  osdifwe89jaf asoidfa ssa p apfsdj oeia spda p apapap aifoias dfda ddoao sss aaooa "
result = tokenizer(
    prompt,
    truncation=True,
    max_length=5,
    padding=True,
    return_tensors=None,
)

# Display the tokenized IDs and check for padding token ID
print("Tokenized IDs:", result['input_ids'])
print("Pad Token ID:", tokenizer.pad_token_id)
print("length of input ids: ", len(result["input_ids"]))
# Check if pad token ID is present in the tokenized sequence
if tokenizer.pad_token_id in result['input_ids']:
    print("Pad token ID is present in the tokenized sequence.")
else:
    print("Pad token ID is not present in the tokenized sequence.")

Tokenized IDs: [1, 15043, 872, 566, 2]
Pad Token ID: 0
length of input ids:  5
Pad token ID is not present in the tokenized sequence.


In [16]:
tokenized_text = tokenize("hello mate")
print(tokenized_text)

{'input_ids': [1, 22172, 15358, 2], 'attention_mask': [1, 1, 1, 1], 'labels': [1, 22172, 15358, 2]}


In [7]:
# Sam's manual tests. We want to just investigate the different properties of tokenizers and what they do.
tokenized_text = tokenizer("hello mate", return_tensors="pt") # , padding='max_length', max_length=max_input_tokens+10
print(tokenized_text)

{'input_ids': tensor([[    1, 22172, 15358,     2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


In [4]:
base_model = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [5]:
base_text = "Add more text here to increase the token count. " * 100

max_new_tokens = 5
max_input_tokens = 0
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Add EOS token and padding configurations
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
# tokenizer.padding_size = "left"
while True:
    batch_model_inputs = tokenizer(base_text, return_tensors="pt", padding='max_length', max_length=max_input_tokens+10)
    batch_model_inputs.to("cuda")
    input_tokens = batch_model_inputs['input_ids'].shape[1]
    print("input tokens: ", input_tokens)
    if input_tokens > max_input_tokens:
        max_input_tokens = input_tokens
    
    output = model.generate(**batch_model_inputs, max_new_tokens=max_new_tokens)
    output_tokens = output.shape[1]
    # Check if the last tokens of the input are present in the output
    last_input_tokens = batch_model_inputs['input_ids'][0, -max_new_tokens:].tolist()
    if not all(token in output[0, -max_new_tokens*2:].tolist() for token in last_input_tokens):
        print(f"Context limit reached with input tokens: {input_tokens}")
        break

    base_text += " Add more text here to increase the token count. "*30


input tokens:  1003
input tokens:  1333
input tokens:  1663
input tokens:  1993
input tokens:  2323
input tokens:  2653
input tokens:  2983
input tokens:  3313
input tokens:  3643
input tokens:  3973
input tokens:  4303


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


input tokens:  4633
input tokens:  4963
input tokens:  5293
input tokens:  5623
input tokens:  5953


KeyboardInterrupt: 