# Retrieval
Hybrid retrieval combining semantic (vector-embedding) search with full-text search, followed by cross-encoder reranking.

In [1]:
from modules.document_retriever import DocumentRetriever
from modules.notebook_helpers import display_search_results

# Shared retriever instance, reused by the search calls in the cells below.
dr = DocumentRetriever()

def retrieval_query(query, *, k=20, top_k=3):
    """Run the retrieve-and-rerank search for ``query`` and display the hits.

    Args:
        query: Search text handed to ``dr.search_rerank``.
        k: Forwarded unchanged as ``k`` to ``dr.search_rerank``. Presumably
            the number of candidates fetched before reranking -- TODO confirm
            against ``DocumentRetriever``.
        top_k: Forwarded unchanged as ``top_k`` to ``dr.search_rerank``.
            Presumably the number of results kept after reranking -- TODO
            confirm against ``DocumentRetriever``.

    Returns:
        None. The rows returned by the search are passed to
        ``display_search_results`` for rendering.
    """
    rows = dr.search_rerank(query, k=k, top_k=top_k)
    display_search_results(rows)

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


In [2]:
# German query; in English: "When did Pele die?"
retrieval_query("Wann starb Pele?")

In [3]:
# Question asked in a different language (English) than the German queries
retrieval_query("Was there a fire in Chur GR?")

In [4]:
# German query; in English: "Did ZSC win against Davos?"
retrieval_query("Hat der ZSC gegen Davos gewonnen?")

In [5]:
# English-language query
retrieval_query("Was US-President Trump at the WEF in Davos?")

In [6]:
# German query; in English: "When was a bank robbed in Trimmis?"
retrieval_query("Wann wurde in Trimmis eine Bank überfallen?")

In [12]:
# English-language query
retrieval_query("Does Migros sell Feldschlösschen?")

# Comments on full-text search (FTS) vs. semantic similarity (vector) search
## PostgreSQL full-text search
- Based on [tsvector](https://www.postgresql.org/docs/current/datatype-textsearch.html)
- Lexical (token-based) information retrieval system
- Efficient normalization, indexing, and retrieval
- Restricted to a single language: query and documents must share it (German)
- **Which documents contain these words, and how important are they?**
## Semantic similarity search
- Based on embedding models ([multilingual-e5-large](https://arxiv.org/pdf/2402.05672))
- Allows cross-language queries
- **Which documents are semantically similar in meaning?**

Short queries (e.g. single keywords) have better recall with full-text search, whereas long queries (e.g. questions) work much better with semantic similarity search.

*Conclusion*: Run both searches, deduplicate the combined results, and feed them to the reranker.



In [7]:
# Example: short full-text search (FTS) query consisting of one very specific
# keyword (the name of a small village) ==> matching result
r = dr.fts_search("Mastrils", k=2)
display_search_results(r)

In [8]:
# Example: the same short, very specific keyword (name of a small village)
# sent through vector similarity search ==> nonsense results
r = dr.vector_similarity_search("Mastrils", k=2)
display_search_results(r)

In [9]:
# Example: long, semantic query in another language, sent through
# full-text search (FTS) ==> no matches
r = dr.fts_search("When did the Rolling Stones publish a song? ", k=2)
display_search_results(r)

In [10]:
# Example: long, semantic query in another language, sent through
# vector similarity search ==> relevant matches
r = dr.vector_similarity_search("When did the Rolling Stones publish a song? ", k=2)
display_search_results(r)