# Finetuning a Black-box Embedding Adapter

## Generate Corpus

In [1]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

In [2]:
# Source PDFs for each split: the Lyft filing is used for training and the
# Uber filing for validation.
TRAIN_FILES = [
    "../../../examples/data/10k/lyft_2021.pdf",
]
VAL_FILES = [
    "../../../examples/data/10k/uber_2021.pdf",
]

# JSON paths for the train / validation corpora.
# NOTE(review): neither path is referenced by the cells visible here — confirm
# whether they are still needed.
TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [3]:
def load_corpus(files, verbose=False):
    """Read the given files and split the resulting documents into nodes.

    Args:
        files: File paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When truthy, print the file list and the document / node
            counts, and forward the flag as ``show_progress`` to the parser.

    Returns:
        Whatever ``SimpleNodeParser.get_nodes_from_documents`` returns for the
        loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # The parser is built with its default settings.
    corpus_nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(corpus_nodes)} nodes")
    return corpus_nodes

We use a very naive train/val split: the Lyft corpus is the training dataset and the Uber corpus is the validation dataset.

In [None]:
# Parse each split into nodes; verbose=True makes load_corpus print the file
# list and the document / node counts as it goes.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

### Generate synthetic queries

Now, we use an LLM (gpt-3.5-turbo) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [1]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
# Question generation is the expensive, LLM-backed step of this notebook.
# Each dataset is written to disk as soon as it has been generated, so a
# failure while generating the validation set does not discard the training
# set that was already produced.
train_dataset = generate_qa_embedding_pairs(train_nodes)
train_dataset.save_json("train_dataset.json")

val_dataset = generate_qa_embedding_pairs(val_nodes)
val_dataset.save_json("val_dataset.json")

In [2]:
# [Optional] Load previously saved datasets from JSON instead of regenerating
# them. The file names match the ones written with save_json in this notebook.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

## Run Embedding Finetuning

In [5]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model

# Embedding model handed to the adapter finetune engine as its base.
# NOTE(review): the "local:" prefix presumably selects a locally-run model —
# confirm against resolve_embed_model.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

# Only train_dataset is passed to the engine; val_dataset is used solely by
# the evaluation cells.
finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model_output_test",  # output location given to the engine
    epochs=2,
    verbose=True
)

In [6]:
# Run the finetuning job configured on the engine (epochs=2, verbose=True);
# the captured output logs the current loss for each epoch.
finetune_engine.finetune()

[36;1m[1;3m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 0] Current loss: 1.6461929082870483
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6520378589630127
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.420575499534607
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6287320852279663
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.541299819946289
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.855969786643982
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.8498220443725586
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5851109027862549
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6538136005401611
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.648180603981018
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6761770248413086
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.7432136535644531
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5535331964492798
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6446977853775024
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5990689992904663
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6190745830535889
[0m[36;1m[1;3

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 1] Current loss: 1.6267112493515015
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6393356323242188
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.402120590209961
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6083265542984009
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5210767984390259
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.8446979522705078
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.8354556560516357
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.565800428390503
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6345064640045166
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6302909851074219
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6577794551849365
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.7276394367218018
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5358024835586548
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6254596710205078
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5831323862075806
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6007719039916992
[0m[36;1m[1

In [7]:
# Fetch the finetuned embedding model from the engine; it is evaluated in the
# cells that follow.
embed_model = finetune_engine.get_finetuned_model()

In [8]:
# Inspect the weight matrix of the adapter's linear layer.
# NOTE(review): `_adapter` is a private attribute, so this lookup may break
# across library versions.
embed_model._adapter.linear.weight

Parameter containing:
tensor([[ 1.0005e+00, -5.3062e-04,  3.2115e-04,  ..., -1.0009e-04,
          2.4751e-04,  1.5288e-04],
        [-1.4233e-04,  1.0004e+00,  1.2545e-04,  ..., -4.3216e-04,
          3.6817e-04,  2.2970e-04],
        [ 1.4492e-04, -5.1278e-05,  1.0008e+00,  ...,  9.4560e-05,
          7.7865e-05, -1.1534e-04],
        ...,
        [ 7.5322e-05, -5.5692e-04, -9.5354e-05,  ...,  1.0002e+00,
          3.6540e-04,  1.9941e-04],
        [ 3.2894e-04,  1.9668e-04, -1.3121e-04,  ...,  5.3004e-04,
          1.0002e+00, -2.6122e-04],
        [-4.7346e-05,  3.4990e-04, -1.2351e-04,  ..., -4.2361e-05,
          2.6185e-04,  1.0005e+00]], requires_grad=True)

## Evaluate Finetuned Model

In [12]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

from eval_utils import evaluate, display_results

In [11]:
# Baseline 1: OpenAI embedding model (default constructor arguments), scored
# on the validation dataset.
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [02:41<00:00,  4.89it/s]


In [12]:
# Show hit rate and MRR for the OpenAI baseline on the validation dataset.
display_results(["ada"], [ada_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ada,0.870886,0.729367


In [7]:
# Baseline 2: the same model identifier used as the finetuning base, without
# the adapter.
# NOTE(review): a model-name string (not a model object) is passed here —
# presumably evaluate resolves it; confirm in eval_utils.
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:23<00:00, 34.00it/s]


In [8]:
# Show hit rate and MRR for the un-finetuned bge baseline on the validation dataset.
display_results(["bge"], [bge_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,bge,0.787342,0.643038


In [13]:
# Score the finetuned model on the same validation dataset as the baselines.
ft_val_results = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:21<00:00, 37.15it/s]


In [14]:
# Show hit rate and MRR for the finetuned model on the validation dataset.
display_results(["ft"], [ft_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.792405,0.644241


In [24]:
# TMP (look at train results): score the OpenAI baseline on the training
# dataset, for comparison with the validation numbers.
# NOTE(review): temporary cell; `ada` is re-created here even though an
# earlier cell already defines it.

ada = OpenAIEmbedding()
ada_train_results = evaluate(train_dataset, ada)

Generating embeddings:   0%|          | 0/334 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 668/668 [02:06<00:00,  5.28it/s]


In [25]:
# Show hit rate and MRR for the OpenAI baseline on the training dataset.
display_results(["ada"], [ada_train_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ada,0.883234,0.684107


In [10]:
# Score the finetuned model on the dataset it was trained on.
ft_train_results = evaluate(train_dataset, embed_model)

Generating embeddings:   0%|          | 0/334 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 668/668 [00:16<00:00, 39.78it/s]


In [11]:
# Show hit rate and MRR for the finetuned model on the training dataset.
display_results(["ft"], [ft_train_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.806886,0.634531
