# Embedding Metadata for Improved Retrieval 
Based on the Haystack tutorial: [Embedding Metadata for Improved Retrieval](https://haystack.deepset.ai/tutorials/39_embedding_metadata_for_improved_retrieval)

working

In [1]:
#!pip install haystack-ai wikipedia sentence-transformers

In [2]:
from haystack.telemetry import tutorial_running
# Call Haystack's telemetry helper with this tutorial's number (39).
tutorial_running(39)

In [6]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

In [7]:
# Document embedder constructed with meta_fields_to_embed=["url"]; no model is passed,
# so whatever default the library defines is used.
# NOTE(review): `embedder` is not referenced by any later cell visible here — it looks
# like a standalone illustration of the parameter; confirm before relying on it.
embedder = SentenceTransformersDocumentEmbedder(meta_fields_to_embed=["url"])

In [8]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import ComponentDevice


def create_indexing_pipeline(document_store, metadata_fields_to_embed=None, model="thenlper/gte-large"):
    """Build a clean -> split -> embed -> write indexing pipeline.

    Args:
        document_store: Document store handed to the ``DocumentWriter``. The
            writer is configured with ``DuplicatePolicy.OVERWRITE``.
        metadata_fields_to_embed: Optional list of metadata keys, forwarded
            unchanged to the embedder as ``meta_fields_to_embed``.
        model: Sentence Transformers model name for the document embedder.
            Defaults to the model this notebook used before the parameter
            existed. Queries run against the resulting store should be
            embedded with the same model.

    Returns:
        A ``Pipeline`` (not yet run) with components named ``"cleaner"``,
        ``"splitter"``, ``"embedder"`` and ``"writer"``, connected in that
        order. Feed documents to ``"cleaner"``.
    """
    # Ordered stages: each one is connected to the stage that follows it.
    stages = (
        ("cleaner", DocumentCleaner()),
        # Chunks of two sentences each.
        ("splitter", DocumentSplitter(split_by="sentence", split_length=2)),
        (
            "embedder",
            SentenceTransformersDocumentEmbedder(model=model, meta_fields_to_embed=metadata_fields_to_embed),
        ),
        ("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)),
    )

    indexing_pipeline = Pipeline()
    for name, component in stages:
        indexing_pipeline.add_component(name, component)

    names = [name for name, _ in stages]
    for upstream, downstream in zip(names, names[1:]):
        indexing_pipeline.connect(upstream, downstream)

    return indexing_pipeline


In [None]:
import wikipedia
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Titles of the Wikipedia articles to fetch.
some_bands = """The Beatles,The Cure""".split(",")

# One Document per article: the page text becomes the content, and the page's
# title and URL are kept as metadata. Pages are fetched lazily, one at a time,
# with auto_suggest disabled.
raw_docs = [
    Document(content=wiki_page.content, meta={"title": wiki_page.title, "url": wiki_page.url})
    for wiki_page in (wikipedia.page(title=band, auto_suggest=False) for band in some_bands)
]

In [10]:
# Display the fetched documents to check their content preview and metadata.
raw_docs

[Document(id=6494ed77e8183f074846b2a731be7c3b77138973972ef3777ca109b838ac085f, content: 'The Beatles were an English rock band formed in Liverpool in 1960, comprising John Lennon, Paul McCa...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles'}),
 Document(id=a341fa65c42f4a503c9d13caacf97b2a8241a39b7a26fb92e3af40d27528eba6, content: 'The Cure are an English rock band formed in 1978 in Crawley, West Sussex. Throughout numerous lineup...', meta: {'title': 'The Cure', 'url': 'https://en.wikipedia.org/wiki/The_Cure'})]

In [None]:
# Two independent in-memory stores, both configured for cosine similarity, so the
# same documents can be indexed twice and compared: one store for the pipeline
# without embedded metadata, one for the pipeline with it.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
document_store_with_embedded_metadata = InMemoryDocumentStore(embedding_similarity_function="cosine")

In [None]:
# Baseline: no metadata fields are passed to the embedder.
indexing_pipeline = create_indexing_pipeline(document_store=document_store)
# Variant: the "title" metadata field is passed to the embedder; results go to the second store.
indexing_with_metadata_pipeline = create_indexing_pipeline(
    document_store=document_store_with_embedded_metadata, metadata_fields_to_embed=["title"]
)

In [9]:
# Index the same raw documents through both pipelines, entering at the "cleaner" component.
# Only the second call is the cell's last expression, so only its return value is displayed.
indexing_pipeline.run({"cleaner": {"documents": raw_docs}})
indexing_with_metadata_pipeline.run({"cleaner": {"documents": raw_docs}})

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

{'writer': {'documents_written': 544}}

In [11]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# One pipeline that embeds the query once and queries both document stores.
retrieval_pipeline = Pipeline()
# The query embedder uses the same model name ("thenlper/gte-large") that
# create_indexing_pipeline uses for the document embedder.
retrieval_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model="thenlper/gte-large"))
# Retriever over the store indexed without embedded metadata (top 3 hits, scale_score disabled).
retrieval_pipeline.add_component(
    "retriever", InMemoryEmbeddingRetriever(document_store=document_store, scale_score=False, top_k=3)
)
# Retriever over the store indexed with embedded metadata, configured identically otherwise.
retrieval_pipeline.add_component(
    "retriever_with_embeddings",
    InMemoryEmbeddingRetriever(document_store=document_store_with_embedded_metadata, scale_score=False, top_k=3),
)

# Feed the single query embedding to both retrievers. The last connect() call is the
# cell's final expression, so its return value (the pipeline) is what the cell displays.
retrieval_pipeline.connect("text_embedder", "retriever")
retrieval_pipeline.connect("text_embedder", "retriever_with_embeddings")


<haystack.core.pipeline.pipeline.Pipeline object at 0x7fdf8e702790>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - retriever_with_embeddings: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - text_embedder.embedding -> retriever_with_embeddings.query_embedding (List[float])

In [12]:
# Run one query; `result` is indexed below by retriever component name
# ("retriever" and "retriever_with_embeddings"), each holding a "documents" list.
result = retrieval_pipeline.run({"text_embedder": {"text": "Have the Beatles ever been to Bangor?"}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Hits from the "retriever" component, which searches the store indexed
# without any metadata fields passed to the embedder.
print("Retriever Results:\n")
for doc in result["retriever"]["documents"]:
    print(doc)

Retriever Results:

Document(id=aa2594f43e90c7816ea301f106a350c54270e1b4505bc089efd186a18a58b00c, content: ' The band flew to Florida, where they appeared on The Ed Sullivan Show a second time, again before 7...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'a16ec70ab1bbfacaab011d7f4e14fdac08e16a7ff80b0e1082e5ca558b78a0df'}, score: 0.8637015943076489)
Document(id=40fdb3a39cc7d76540ab0c84fe09d5ff0ce25f2f02fcc705ebfe6c94676559fb, content: 'During the 1964 US tour, the group were confronted with racial segregation in the country at the tim...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'a16ec70ab1bbfacaab011d7f4e14fdac08e16a7ff80b0e1082e5ca558b78a0df'}, score: 0.8558953210750357)
Document(id=81fd72b03a0cf60dc245136b89ff0e4476813d02fa31147d63bd9bbba1035778, content: 'The Beatles were an English rock band formed in Liverpool in 1960, comprising John Lennon, Paul McCa...', meta: {'title'

In [14]:
# Hits from the "retriever_with_embeddings" component, which searches the store
# indexed with the "title" metadata field passed to the embedder.
print("Retriever with Embeddings Results:\n")
for doc in result["retriever_with_embeddings"]["documents"]:
    print(doc)

Retriever with Embeddings Results:

Document(id=b5ee04e82c8670b261f5e9f2dc86e64eb8338f29b87e6be46a8bb34a11dbe13a, content: ' The next day, they travelled to Bangor for his Transcendental Meditation retreat. On 27 August, the...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'a16ec70ab1bbfacaab011d7f4e14fdac08e16a7ff80b0e1082e5ca558b78a0df'}, score: 0.8879933229812993)
Document(id=7f5f35be1e4554195491842772c116d41b044b071588d481c1c801dd3a92f500, content: '" City officials relented and agreed to allow an integrated show. The group also cancelled their res...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'a16ec70ab1bbfacaab011d7f4e14fdac08e16a7ff80b0e1082e5ca558b78a0df'}, score: 0.8637997720993015)
Document(id=aa2594f43e90c7816ea301f106a350c54270e1b4505bc089efd186a18a58b00c, content: ' The band flew to Florida, where they appeared on The Ed Sullivan Show a second time, again before 7...'