# Finetuning a Black-box Embedding Adapter

## Generate Corpus

In [1]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

In [2]:
# Input PDFs for each split; these lists are passed to load_corpus() below.
# The Lyft 2021 file is the training corpus, the Uber 2021 file the validation corpus.
TRAIN_FILES = ["../../../examples/data/10k/lyft_2021.pdf"]
VAL_FILES = ["../../../examples/data/10k/uber_2021.pdf"]

# NOTE(review): these two paths are not referenced anywhere else in the visible
# notebook -- confirm whether they are still needed.
TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [3]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print the file list, the document count and the
            node count, and show the parser's progress bar.

    Returns:
        The nodes produced by ``SimpleNodeParser`` from the loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    # Read every input file into documents.
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Chunk the documents into nodes using the parser's default settings.
    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

We do a very naive train/val split by having the Lyft corpus as the train dataset, and the Uber corpus as the val dataset.

In [4]:
# Parse each split into nodes; verbose=True prints the file list plus the
# document and node counts for each split.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['../../../examples/data/10k/lyft_2021.pdf']
Loaded 238 docs


Parsing documents into nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Parsed 349 nodes
Loading files ['../../../examples/data/10k/uber_2021.pdf']
Loaded 307 docs


Parsing documents into nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Parsed 418 nodes


### Generate synthetic queries

Now, we use an LLM (gpt-3.5-turbo) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [1]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
# Generate (question, context) pairs for each split and write each dataset to
# disk as soon as it is ready. Generation calls the LLM once per text chunk, so
# saving the training set before starting on the validation set means a failure
# in the second run cannot leave the first one unsaved.
train_dataset = generate_qa_embedding_pairs(train_nodes)
train_dataset.save_json("train_dataset.json")

val_dataset = generate_qa_embedding_pairs(val_nodes)
val_dataset.save_json("val_dataset.json")

In [2]:
# [Optional] Load
# Reload the datasets from the JSON files written by the save_json calls in the
# generation cell above, instead of regenerating them.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

## Run Embedding Finetuning

In [11]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings import resolve_embed_model
import torch

# Base embedding model handed to the adapter finetune engine.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    # Presumably where the trained adapter is written -- TODO confirm. Earlier
    # experiments used other paths (model_output_test, model_output_test_v2,
    # model_output_test_bias, model_output_no_bias_4, model_output_no_bias_5).
    model_output_path="model_output_bias_6",
    # The "bias term" / "no bias term" evaluation cells further down refer to
    # runs with this flag switched on / off.
    bias=True,
    # Earlier runs used epochs=2.
    epochs=4,
    verbose=True,
    # To experiment with a different optimizer, uncomment and adjust:
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.01}
)

In [12]:
# Run finetuning with the settings configured on the engine above; the captured
# output shows the current loss being logged throughout each epoch.
finetune_engine.finetune()

[36;1m[1;3m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 0] Current loss: 1.6461929082870483
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6520378589630127
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.4205775260925293
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6287790536880493
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5413802862167358
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.855971336364746
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.8499667644500732
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5853407382965088
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6540038585662842
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6485378742218018
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6767240762710571
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.7436771392822266
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.5540363788604736
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6454252004623413
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.59978449344635
[0m[36;1m[1;3m> [Epoch 0] Current loss: 1.6202552318572998
[0m[36;1m[1;

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 1] Current loss: 1.6278995275497437
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.637595772743225
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.3984235525131226
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6125218868255615
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5274169445037842
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.844879388809204
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.8354038000106812
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5656654834747314
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.633568525314331
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6286585330963135
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.6574878692626953
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.7275493144989014
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.534127950668335
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.620722770690918
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.578970193862915
[0m[36;1m[1;3m> [Epoch 1] Current loss: 1.5955476760864258
[0m[36;1m[1;3m>

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 2] Current loss: 1.607826590538025
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.6231151819229126
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.3773081302642822
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.596382737159729
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.514119267463684
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.8348331451416016
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.8226051330566406
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.5489269495010376
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.6163933277130127
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.6122478246688843
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.6416947841644287
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.7144181728363037
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.51851224899292
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.6015201807022095
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.563701868057251
[0m[36;1m[1;3m> [Epoch 2] Current loss: 1.5776357650756836
[0m[36;1m[1;3m>

Iteration:   0%|          | 0/67 [00:00<?, ?it/s]

[36;1m[1;3m> [Epoch 3] Current loss: 1.5954265594482422
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.6144605875015259
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.3646655082702637
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.586377501487732
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5061616897583008
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.8287767171859741
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.8149572610855103
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5390233993530273
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.6061809062957764
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.6024984121322632
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.6323038339614868
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.7065999507904053
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5093562602996826
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5903894901275635
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5551114082336426
[0m[36;1m[1;3m> [Epoch 3] Current loss: 1.5675818920135498
[0m[36;1m[

In [16]:
# Fetch the finetuned model from the engine; every "ft" evaluation below is run
# against whatever this variable holds at the time.
embed_model = finetune_engine.get_finetuned_model()

In [17]:
# Debugging aid, intentionally left commented out: uncomment to inspect the
# adapter's bias attribute (presumably the learned bias of its linear layer).
# embed_model._adapter.linear.bias

## Evaluate Finetuned Model

In [7]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

from eval_utils import evaluate, display_results

In [15]:
# Baseline 1: OpenAI embedding model (default constructor arguments), scored on
# the validation dataset.
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [02:41<00:00,  4.88it/s]


In [16]:
# Tabulate hit_rate and mrr for the OpenAI baseline on the validation set.
display_results(["ada"], [ada_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ada,0.870886,0.729156


In [7]:
# Baseline 2: the bge-small-en model without an adapter. This is the same model
# string that is resolved as the base model for finetuning; here evaluate() is
# given the string itself rather than an embedding object.
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:23<00:00, 34.00it/s]


In [8]:
# Tabulate hit_rate and mrr for the un-finetuned bge baseline on the validation set.
display_results(["bge"], [bge_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,bge,0.787342,0.643038


In [36]:
# Finetuned model (base model + trained adapter) on the validation dataset.
ft_val_results = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:21<00:00, 36.73it/s]


In [37]:
# Tabulate hit_rate and mrr for the finetuned model on the validation set.
display_results(["ft"], [ft_val_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.792405,0.644241


In [24]:
# TMP (look at train results)
# Same OpenAI baseline as for validation, but scored on the training dataset.

ada = OpenAIEmbedding()
ada_train_results = evaluate(train_dataset, ada)

Generating embeddings:   0%|          | 0/334 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 668/668 [02:06<00:00,  5.28it/s]


In [25]:
# Tabulate hit_rate and mrr for the OpenAI baseline on the training set.
display_results(["ada"], [ada_train_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ada,0.883234,0.684107


In [23]:
# Finetuned model scored on the training dataset it was finetuned on.
ft_train_results = evaluate(train_dataset, embed_model)

Generating embeddings:   0%|          | 0/334 [00:00<?, ?it/s]

100%|██████████████████████████████████████| 668/668 [00:16<00:00, 39.60it/s]


In [24]:
# Tabulate hit_rate and mrr for the finetuned model on the training set.
display_results(["ft"], [ft_train_results])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.781437,0.619736


In [None]:
## TMP: with epochs=4
# NOTE(review): `embed_model` is whatever the finetune cells above last produced,
# so this measures the epochs=4 variant only if finetuning was re-run with that
# setting right before this cell. The call itself is identical for every variant.
ft_val_results_v2 = evaluate(val_dataset, embed_model)

In [46]:
# The table label is plain "ft"; this table is the epochs=4 run (ft_val_results_v2).
display_results(["ft"], [ft_val_results_v2])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.797468,0.648945


In [10]:
## TMP: with bias term
# NOTE(review): depends on the engine having been re-created with the bias flag
# on and finetuned again before this cell; the evaluate() call does not change.
ft_val_results_v3 = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████| 790/790 [00:22<00:00, 35.91it/s]


In [11]:
# The table label is plain "ft"; this table is the "with bias term" run (ft_val_results_v3).
display_results(["ft"], [ft_val_results_v3])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.787342,0.636751


In [13]:
## TMP: with no bias term
# NOTE(review): depends on the engine having been re-created with the bias flag
# off and finetuned again before this cell; the evaluate() call does not change.
ft_val_results_v4 = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████| 790/790 [00:22<00:00, 35.63it/s]


In [14]:
# The table label is plain "ft"; this table is the "no bias term" run (ft_val_results_v4).
display_results(["ft"], [ft_val_results_v4])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.787342,0.636118


In [21]:
## TMP: with no bias term + 4 epochs
# NOTE(review): reflects the adapter state at the time the cell was run; the
# settings named above must have been applied in the finetune cells beforehand.
ft_val_results_v5 = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████| 790/790 [00:20<00:00, 38.55it/s]


In [22]:
# The table label is plain "ft"; this table is the "no bias term + 4 epochs" run (ft_val_results_v5).
display_results(["ft"], [ft_val_results_v5])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.778481,0.631751


In [8]:
## TMP: with no bias term + 4 epochs (try again)
# Repeat of the "no bias term + 4 epochs" measurement after another finetuning
# run; as with the other variants, the result depends on the current `embed_model`.
ft_val_results_v6 = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:22<00:00, 35.74it/s]


In [9]:
# The table label is plain "ft"; this table is the repeated "no bias term + 4 epochs" run (ft_val_results_v6).
display_results(["ft"], [ft_val_results_v6])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.8,0.661983


In [18]:
## TMP: with bias term + 4 epochs (try again)
# This variant matches the settings currently written in the engine cell
# (bias=True, epochs=4); the result still depends on the current `embed_model`.
ft_val_results_v7 = evaluate(val_dataset, embed_model)

Generating embeddings:   0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 790/790 [00:21<00:00, 37.12it/s]


In [19]:
# The table label is plain "ft"; this table is the repeated "bias term + 4 epochs" run (ft_val_results_v7).
display_results(["ft"], [ft_val_results_v7])

Unnamed: 0,retrievers,hit_rate,mrr
0,ft,0.798734,0.662152
