In [1]:
! pip install datasets



In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer2= AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train")

In [3]:
dataset[0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [4]:
# Tokenize
tokens = tokenizer.tokenize(dataset[0]["text"])
print("Tokens:", tokens)
print("Number of tokens:", len(tokens))

# Convert tokens to input IDs
input_ids = tokenizer(dataset[0]["text"])["input_ids"]
print("Token IDs:", input_ids)
print("Number of token IDs:", len(input_ids))

Tokens: ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', "'", 's', 'new', '"', 'conan', '"', 'and', 'that', 'he', "'", 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarz', '##ene', '##gger', ',', 'jean', '-', 'cl', '##aud', 'van', 'dam', '##me', 'or', 'steven', 'sega', '##l', '.']
Number of tokens: 45
Token IDs: [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102]
Number of token IDs: 47


In [5]:
# Tokenize
tokens = tokenizer2.tokenize(dataset[0]["text"])
print("Tokens:", tokens)
print("Number of tokens:", len(tokens))

# Convert tokens to input IDs
input_ids = tokenizer2(dataset[0]["text"])["input_ids"]
print("Token IDs:", input_ids)
print("Number of token IDs:", len(input_ids))

Tokens: ['▁the', '▁rock', '▁is', '▁dest', 'ined', '▁to', '▁be', '▁the', '▁', '2', '1', 'st', '▁century', "'", 's', '▁new', '▁"', '▁con', 'an', '▁"', '▁and', '▁that', '▁he', "'", 's', '▁going', '▁to', '▁make', '▁a', '▁spl', 'ash', '▁even', '▁greater', '▁than', '▁ar', 'n', 'old', '▁sch', 'war', 'zen', 'eg', 'ger', '▁,', '▁je', 'an', '-', 'cl', 'aud', '▁van', '▁dam', 'me', '▁or', '▁st', 'even', '▁seg', 'al', '▁.']
Number of tokens: 57
Token IDs: [1, 1040, 5231, 1117, 3388, 2079, 1066, 1115, 1040, 29473, 29518, 29508, 1071, 6213, 29510, 29481, 1401, 1113, 1147, 1044, 1113, 1072, 1137, 1168, 29510, 29481, 2172, 1066, 1806, 1032, 8922, 1797, 1787, 7285, 1589, 1365, 29479, 1506, 2311, 7002, 3807, 1409, 1865, 1968, 2986, 1044, 29501, 1280, 6443, 2465, 3975, 1895, 1210, 1109, 9558, 4747, 1050, 1610]
Number of token IDs: 58


In [6]:
def tokenization(example):
    return tokenizer(example["text"])

dataset = dataset.map(tokenization, batched=True)

In [7]:
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
dataset.format['type']

'torch'

In [8]:
def format_mistral_prompt_with_context(example):
    context = example.get("context", "").strip()
    messages = example["messages"]
    
    prompt = ""
    if context:
        prompt += f"<s>[INST] Context: {context}\n"

    # Add user-assistant chat
    for i, msg in enumerate(messages):
        if msg["role"] == "user":
            if context and i == 0:
                prompt += f"User: {msg['content']} [/INST] "
            else:
                prompt += f"<s>[INST] {msg['content']} [/INST] "
        elif msg["role"] == "assistant":
            prompt += f"{msg['content']} </s>"

    return {"text": prompt.strip()}


In [9]:
dataset = [
  {
    "context": "This is the background for the conversation.",
    "messages": [
      {"role": "user", "content": "Hello, who are you?"},
      {"role": "assistant", "content": "I'm an AI assistant."},
      {"role": "user", "content": "What can you do?"},
      {"role": "assistant", "content": "I can help you with many tasks."}
    ]
  },
  {
    "context": "",
    "messages": [
      {"role": "user", "content": "Tell me a joke."},
      {"role": "assistant", "content": "Why did the chicken cross the road? To get to the other side!"}
    ]
  }
]

single_prompt = format_mistral_prompt_with_context(dataset[0])

single_prompt

{'text': "<s>[INST] Context: This is the background for the conversation.\nUser: Hello, who are you? [/INST] I'm an AI assistant. </s><s>[INST] What can you do? [/INST] I can help you with many tasks. </s>"}

In [10]:
chat_data = tokenizer2.apply_chat_template(dataset[0]['messages'], tokenize=False)
chat_data

"<s>[INST] Hello, who are you?[/INST] I'm an AI assistant.</s>[INST] What can you do?[/INST] I can help you with many tasks.</s>"