# Installations and client initialisation

In [None]:
!pip install -q mistralai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
import os

# Read the Mistral API key from the environment so the secret never has to be
# pasted into (and saved with) the notebook. Falls back to an empty string,
# matching the previous placeholder, when MISTRAL_API_KEY is not set.
api_key = os.environ.get("MISTRAL_API_KEY", "")

# Single client instance used by every cell in this notebook.
client = MistralClient(api_key=api_key)

# Finetuning mistral-small-latest on 50 data points

## Checking the data format

In [None]:
# download the validation and reformat script
!wget https://raw.githubusercontent.com/mistralai/mistral-finetune/main/utils/reformat_data.py

# validate and reform the training data
!python reformat_data.py /content/drive/MyDrive/train_data.jsonl

# validate and reform the validation data
!python reformat_data.py /content/drive/MyDrive/validation_data.jsonl

--2024-06-28 17:35:30--  https://raw.githubusercontent.com/mistralai/mistral-finetune/main/utils/reformat_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3381 (3.3K) [text/plain]
Saving to: ‘reformat_data.py.2’


2024-06-28 17:35:30 (44.2 MB/s) - ‘reformat_data.py.2’ saved [3381/3381]



## Uploading the files with the client

In [None]:
# Upload the training split; the returned object's id is referenced when creating jobs.
with open("/content/drive/MyDrive/train_data.jsonl", "rb") as train_file:
    ultrachat_chunk_train = client.files.create(
        file=("train_data.jsonl", train_file)
    )

# Upload the validation split the same way.
with open("/content/drive/MyDrive/validation_data.jsonl", "rb") as validation_file:
    ultrachat_chunk_eval = client.files.create(
        file=("validation_data.jsonl", validation_file)
    )

## Estimating number of train steps

In [None]:
# Scaling factor for the step estimate: training steps are computed below as
# this value multiplied by the training file size in MB.
approximate_epochs = 100

def get_size_in_mb(file_path: str) -> float:
    """Return the size of the file at ``file_path`` in megabytes.

    A megabyte here is the decimal unit (1 MB = 1000 * 1000 bytes).

    Args:
        file_path: Path of the file to measure.

    Returns:
        The file size in MB as a float.

    Raises:
        OSError: Propagated from ``os.path.getsize`` if the file does not
            exist or is inaccessible.
    """
    bytes_per_mb = 1000 * 1000
    return os.path.getsize(file_path) / bytes_per_mb

# Measure the training file, then scale its size (in MB) into a step count.
size_file = get_size_in_mb("/content/drive/MyDrive/train_data.jsonl")
training_steps = int(size_file * approximate_epochs)

# Report both numbers so the estimate can be checked by eye.
print("File Size:", size_file, "mb")
print("Training steps:", training_steps)

File Size: 0.269834 mb
Training steps: 26


## Creating a fine-tuning job and visualising it in Weights & Biases (W&B)

In [None]:
from mistralai.models.jobs import WandbIntegrationIn, TrainingParameters

# Read the Weights & Biases key from the environment instead of pasting it here;
# falls back to an empty string when WANDB_API_KEY is not set.
wandb_api_key = os.environ.get("WANDB_API_KEY", "")

# Launch a fine-tuning job for mistral-small-latest on the uploaded files.
created_jobs = client.jobs.create(
    model="mistral-small-latest",
    training_files=[ultrachat_chunk_train.id],
    validation_files=[ultrachat_chunk_eval.id],
    hyperparameters=TrainingParameters(
        # Use the step count estimated from the training file size rather than
        # a hard-coded copy of it, so the job follows the data if the file changes.
        training_steps=training_steps,
        learning_rate=0.0001,
    ),
    integrations=[
        # Attach a Weights & Biases integration for the project/run named below.
        WandbIntegrationIn(
            project="test_api",
            run_name="test",
            api_key=wandb_api_key,
        ).dict()
    ]
)

In [None]:
# Print the job record returned by the create call.
print(created_jobs)

id='9672d988-d6c9-4007-816f-ce6895c94ba6' hyperparameters=TrainingParameters(training_steps=26, learning_rate=0.0001) fine_tuned_model=None model='mistral-small-latest' status='QUEUED' job_type='FT' created_at=1719596804 modified_at=1719596804 training_files=['a0f75376-861c-4c96-bf8a-14a58f246221'] validation_files=['527c7872-a89d-4e40-9dc1-b7ca2123fc0f'] object='job' integrations=[WandbIntegration(type='wandb', project='test_api', name=None, run_name='test')]


# Retrieve the finetuned mistral-small-latest model

In [None]:
# Fetch the current record of the job created above and print it
retrieved_jobs = client.jobs.retrieve(created_jobs.id)
print(retrieved_jobs)

id='9672d988-d6c9-4007-816f-ce6895c94ba6' hyperparameters=TrainingParameters(training_steps=26, learning_rate=0.0001) fine_tuned_model='ft:mistral-small-latest:38a3e9c5:20240628:9672d988' model='mistral-small-latest' status='SUCCESS' job_type='FT' created_at=1719596804 modified_at=1719597269 training_files=['a0f75376-861c-4c96-bf8a-14a58f246221'] validation_files=['527c7872-a89d-4e40-9dc1-b7ca2123fc0f'] object='job' integrations=[WandbIntegration(type='wandb', project='test_api', name=None, run_name='test')] events=[Event(name='status-updated', data={'status': 'SUCCESS'}, created_at=1719597269), Event(name='status-updated', data={'status': 'RUNNING'}, created_at=1719596805), Event(name='status-updated', data={'status': 'QUEUED'}, created_at=1719596804)] checkpoints=[Checkpoint(metrics=Metric(train_loss=0.063598, valid_loss=0.553382, valid_mean_token_accuracy=1.467522), step_number=20, created_at=1719597134), Checkpoint(metrics=Metric(train_loss=0.353003, valid_loss=0.478452, valid_mean

# Finetuning open-mistral-7b on 50 data points



In [None]:
from mistralai.models.jobs import WandbIntegrationIn, TrainingParameters

# Read the Weights & Biases key from the environment instead of pasting it here;
# falls back to an empty string when WANDB_API_KEY is not set.
wandb_api_key = os.environ.get("WANDB_API_KEY", "")

# Launch a fine-tuning job for open-mistral-7b on the same uploaded files.
created_jobs = client.jobs.create(
    model="open-mistral-7b",
    training_files=[ultrachat_chunk_train.id],
    validation_files=[ultrachat_chunk_eval.id],
    hyperparameters=TrainingParameters(
        # NOTE(review): 30 steps here differs from the estimate computed from the
        # training file size earlier in the notebook; confirm this is intentional.
        training_steps=30,
        learning_rate=0.0001,
    ),
    integrations=[
        # Attach a Weights & Biases integration for the project/run named below.
        WandbIntegrationIn(
            project="test_api",
            run_name="test",
            api_key=wandb_api_key,
        ).dict()
    ]
)

In [None]:
# Print the job record returned by the create call.
print(created_jobs)

id='a0588788-2d7f-4aa8-acb0-5f46e29ff13a' hyperparameters=TrainingParameters(training_steps=30, learning_rate=0.0001) fine_tuned_model=None model='open-mistral-7b' status='QUEUED' job_type='FT' created_at=1719606853 modified_at=1719606853 training_files=['a0f75376-861c-4c96-bf8a-14a58f246221'] validation_files=['527c7872-a89d-4e40-9dc1-b7ca2123fc0f'] object='job' integrations=[WandbIntegration(type='wandb', project='test_api', name=None, run_name='test')]


# Retrieve the finetuned open-mistral-7b model

In [None]:
# Fetch the current record of the job created above and print it
retrieved_jobs = client.jobs.retrieve(created_jobs.id)
print(retrieved_jobs)

id='a0588788-2d7f-4aa8-acb0-5f46e29ff13a' hyperparameters=TrainingParameters(training_steps=30, learning_rate=0.0001) fine_tuned_model='ft:open-mistral-7b:38a3e9c5:20240628:a0588788' model='open-mistral-7b' status='SUCCESS' job_type='FT' created_at=1719606853 modified_at=1719607068 training_files=['a0f75376-861c-4c96-bf8a-14a58f246221'] validation_files=['527c7872-a89d-4e40-9dc1-b7ca2123fc0f'] object='job' integrations=[WandbIntegration(type='wandb', project='test_api', name=None, run_name='test')] events=[Event(name='status-updated', data={'status': 'SUCCESS'}, created_at=1719607068), Event(name='status-updated', data={'status': 'RUNNING'}, created_at=1719606853), Event(name='status-updated', data={'status': 'QUEUED'}, created_at=1719606853)] checkpoints=[Checkpoint(metrics=Metric(train_loss=0.171175, valid_loss=0.663745, valid_mean_token_accuracy=1.58419), step_number=30, created_at=1719607035), Checkpoint(metrics=Metric(train_loss=0.250904, valid_loss=0.620961, valid_mean_token_accu

# Conclusion: we chose the fine-tuned open-mistral-7b because it has lower latency than the fine-tuned mistral-small-latest while producing output of comparable quality