# Fine-tune Mistral pipeline

In [1]:
# Execution-environment switches. The path-selection cell below only handles
# the case where exactly one of the two flags is True.
personal, idun = True, False

## Load dataset

In [2]:
import os

# Exactly one environment flag must be set. Previously, setting both (or
# neither) left `output_dir` undefined and the failure only surfaced as a
# NameError on the HF_HOME assignment.
if personal == idun:
    raise ValueError(
        "Set exactly one of `personal` / `idun` to True "
        f"(got personal={personal!r}, idun={idun!r})."
    )

if personal:
    # Personal machine: keep models under the user's home directory.
    output_dir = os.path.expanduser('~/models')
else:
    # Cluster work area; the path is already absolute.
    output_dir = '/cluster/work/eliashk/models'

# Point the Hugging Face cache at the chosen directory. This is done before the
# Hugging Face imports below so that they see the value.
os.environ['HF_HOME'] = output_dir

from datasets import load_dataset

# Smallest dataset variant (10 samples), stored as JSON Lines.
tiny_dataset_path = 'single_output_10_samples.jsonl'

# Load through the generic 'json' builder; the rows end up in the 'train' split.
raw_dataset = load_dataset(path='json', data_files=tiny_dataset_path)

## Create tokenizer

In [3]:
from transformers import AutoTokenizer

# Base model to fine-tune; `checkpoint` is the name the rest of the notebook uses.
base_model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
checkpoint = base_model_id

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the two text columns of the training split separately.
train_split = raw_dataset['train']
tokenized_sentences_1 = tokenizer(train_split['input'])
tokenized_sentences_2 = tokenizer(train_split['output'])

In [4]:
# Tokenize the first (input, output) pair as a single sequence and show the
# resulting tokens to see how the text is split.
first_example = raw_dataset['train'][0]
inputs = tokenizer(first_example['input'], first_example['output'])
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['<s>',
 '▁<',
 'domain',
 '.',
 'p',
 'dd',
 'l',
 '>:',
 '▁(',
 'define',
 '▁(',
 'domain',
 '▁brief',
 'case',
 ')',
 '▁(',
 ':',
 'require',
 'ments',
 '▁:',
 'ad',
 'l',
 ')',
 '▁(',
 ':',
 'types',
 '▁portable',
 '▁location',
 ')',
 '▁(',
 ':',
 'pred',
 'icates',
 '▁(',
 'at',
 '▁?',
 'y',
 '▁-',
 '▁portable',
 '▁?',
 'x',
 '▁-',
 '▁location',
 ')',
 '▁(',
 'in',
 '▁?',
 'x',
 '▁-',
 '▁portable',
 ')',
 '▁(',
 'is',
 '-',
 'at',
 '▁?',
 'x',
 '▁-',
 '▁location',
 '))',
 '▁(',
 ':',
 'action',
 '▁move',
 '▁:',
 'parameters',
 '▁(',
 '?',
 'm',
 '▁?',
 'l',
 '▁-',
 '▁location',
 ')',
 '▁:',
 'pre',
 'condition',
 '▁',
 '▁(',
 'is',
 '-',
 'at',
 '▁?',
 'm',
 ')',
 '▁:',
 'effect',
 '▁(',
 'and',
 '▁(',
 'is',
 '-',
 'at',
 '▁?',
 'l',
 ')',
 '▁(',
 'not',
 '▁(',
 'is',
 '-',
 'at',
 '▁?',
 'm',
 '))',
 '▁(',
 'forall',
 '▁(',
 '?',
 'x',
 '▁-',
 '▁portable',
 ')',
 '▁(',
 'when',
 '▁(',
 'in',
 '▁?',
 'x',
 ')',
 '▁(',
 'and',
 '▁(',
 'at',
 '▁?',
 'x',
 '▁?',
 'l',
 ')',
 '▁(',
 

## Check maximum sequence length in the dataset
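### Mistral 7B v0.1's sliding-window attention allows sequences of up to 4096*32 tokens; the Instruct-v0.2 checkpoint used here has a 32k-token context window and no sliding-window attention.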

In [5]:
# <s> is the start-of-sequence token. No end-of-sequence token is used, so <s>
# is reused as the padding token.
tokenizer.pad_token = tokenizer.bos_token

print(tokenizer.pad_token)

# Length of the longest tokenized (input, output) pair in the training split.
# Iterating over rows avoids re-reading the full 'input' and 'output' columns
# on every step; default=0 keeps the result at 0 for an empty split.
max_length = max(
    (
        len(tokenizer(example['input'], example['output'])['input_ids'])
        for example in raw_dataset['train']
    ),
    default=0,
)

print("Maximum sequence length:", max_length)

<s>
Maximum sequence length: 2373


In [6]:
def tokenize_function(example):
    """Tokenize the 'input' and 'output' fields of `example` as a pair.

    Relies on the notebook-level `tokenizer` and `max_length`; sequences longer
    than `max_length` are truncated. Returns whatever the tokenizer returns.
    """
    encoded = tokenizer(
        example['input'],
        example['output'],
        truncation=True,
        max_length=max_length,
    )
    return encoded

In [7]:
# Tokenize in batches; the tokenizer outputs are added as new columns next to
# the raw 'input' / 'output' text.
tokenized_datasets = raw_dataset.map(function=tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})

### Dynamic padding

In [8]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer)

In [9]:
batch_size = 5 # dataset contains 10, 20, 30, 40 or 50 samples

# Take the first `batch_size` rows and keep only the tokenizer outputs; the raw
# text columns are dropped before collation.
samples = {
    column: values
    for column, values in tokenized_datasets['train'][:batch_size].items()
    if column not in ('input', 'output')
}
[len(ids) for ids in samples['input_ids']]

[460, 1506, 1991, 533, 2325]

In [10]:
# Collate the selected samples into one padded batch and inspect tensor shapes.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([5, 2325]), 'attention_mask': torch.Size([5, 2325])}

## Fine-tuning with Trainer API
#### Using standard training arguments

In [11]:
# AutoModelForSequenceClassification stays imported in case other cells still
# reference it; it is no longer used to build `model`.
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)

# Default training arguments; only the output directory is set.
training_args = TrainingArguments(output_dir=output_dir)

# The dataset holds (input, output) text pairs and no class-label column, so
# the model is fine-tuned as a causal language model. A sequence-classification
# head would need class-id labels that this dataset does not provide.
# NOTE(review): the recorded run stopped while loading checkpoint shards --
# possibly out of memory; consider a reduced-precision load. TODO confirm.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Causal-LM training needs a `labels` entry in every batch. With mlm=False this
# collator pads dynamically and derives labels from input_ids (padding
# positions are ignored by the loss). `data_collator` is rebound here so the
# Trainer cell below picks up the language-modeling collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

: 

In [None]:
from transformers import Trainer

# Wire model, arguments, training split, collator and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()