In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19 llama-index-retrievers-bm25

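Note: the BM25 retriever ships as a separate package. The install cell above already includes it, but if you manage your environment yourself you will need to run the following: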

`pip install llama-index-retrievers-bm25`

In [1]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch asyncio so nested event loops are allowed; the notebook kernel already
# runs a loop and later cells request async execution (use_async=True).
nest_asyncio.apply()

# Load variables from a .env file. Called without arguments so python-dotenv
# searches for the file itself; an empty-string path is treated as "no file"
# and silently loads nothing.
load_dotenv()

# Make the project helper modules (utils, prompts) importable.
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store


In [2]:
# Use the key from the environment when present; `.get` returns None instead of
# raising KeyError for an unset variable, so the interactive prompt is reachable.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [3]:
# Use the URL from the environment when present; `.get` returns None instead of
# raising KeyError for an unset variable, so the interactive prompt is reachable.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [4]:
# Use the key from the environment when present; `.get` returns None instead of
# raising KeyError for an unset variable, so the interactive prompt is reachable.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [5]:
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from utils import setup_llm, setup_embed_model

# Configure the chat model through the project helper.
# NOTE(review): presumably this registers the model on llama_index `Settings`
# (later cells read `Settings.embed_model`) — confirm in ../helpers/utils.py.
setup_llm(provider="openai", model="gpt-4o", api_key=OPENAI_API_KEY)

# Configure the embedding model the same way, using the same OpenAI key.
setup_embed_model(
    provider="openai",
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY,
)

In [6]:
import random
from llama_index.core.storage.docstore import SimpleDocumentStore
from utils import get_documents_from_docstore, group_documents_by_author, sample_documents

# Load the source documents through the project helper.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Seed Python's RNG before sampling so the selection is repeatable across runs
# (assumes `sample_documents` draws from the `random` module — TODO confirm).
random.seed(42)

# Bucket the documents per author, then sample from those buckets.
documents_by_author = group_documents_by_author(documents)

# NOTE(review): whether `num_samples` is per author or overall is defined by the helper — verify.
senpai_documents = sample_documents(documents_by_author, num_samples=10)

In [7]:
# Build a fresh docstore that holds only the sampled documents.
smol_senpai_docstore = SimpleDocumentStore()
smol_senpai_docstore.add_documents(senpai_documents)

## Setup Qdrant Vector Store

In [None]:
from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store.simple_index_store import SimpleIndexStore
from llama_index.core.settings import Settings
from utils import setup_vector_store

# Name of the Qdrant collection used by this notebook.
COLLECTION_NAME = "rr-fusion"

# Create the Qdrant-backed vector store via the project helper, with hybrid
# search enabled.
rr_fusion_vector_store = setup_vector_store(
    QDRANT_URL,
    QDRANT_API_KEY,
    COLLECTION_NAME,
    enable_hybrid=True,
)

# Bundle the sampled docstore, a fresh index store and the Qdrant vector store
# into one storage context for index construction.
rr_fusion_storage_context = StorageContext.from_defaults(
    docstore=smol_senpai_docstore,
    index_store=SimpleIndexStore(),
    vector_store=rr_fusion_vector_store,
)

### Ingest with a docstore

In [9]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore

from utils import ingest 

# Chunk documents with a sentence-aware splitter (chunk_size=256, overlap=16).
sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=16)

# Build the index over the sampled documents. The transformations run in order:
# split first, then embed with the globally configured embedding model.
index = VectorStoreIndex.from_documents(
    documents=senpai_documents,
    embed_model=Settings.embed_model,
    storage_context=rr_fusion_storage_context,
    transformations=[sentence_splitter, Settings.embed_model],
)

# A brief word on vector store query modes

The `vector_store_query_mode` in LlamaIndex determines the type of search to be performed. Here's a brief description of each mode:

 - `default`: This mode performs a dense vector search: it retrieves the nodes whose vectors are most similar to the query vector. Dense embeddings are a numerical representation of a piece of text, stored as a long list of numbers, and can capture rich semantics across the entire piece of text. `alpha=0.75` is used by default.

 - `hybrid`: This mode performs a hybrid search. It combines vector search with traditional search methods. `alpha` parameter determines weighting (`alpha = 0` -> bm25, `alpha = 1` -> vector search). 

 - `semantic_hybrid`: Semantic hybrid search combines text search with vector embeddings. Text search provides keyword matching and lexical retrieval. Vector embeddings allow finding documents with similar meaning, even if they don't contain exact keyword matches. This mode incorporates semantic reranking to hybrid search results to improve search relevance.

 - `sparse`: Most of the elements in a sparse vector are zero, with only a few key values being non-zero. These sparse vectors are great at capturing specific keywords and similar small details. You need to use a specialized embedding model to create sparse vectors. 
   - `FastEmbed` has a few choices for sparse text embedding models; for example, you can pass in `prithvida/Splade_PP_en_v1` as the model name when you run `setup_embed_model` if you want to use them.
   - We didn't use a sparse vector here, so we won't see this in action.
   - Note: if you try this, you'll need to set the `sparse_top_k` argument, which represents how many nodes will be retrieved from each dense and sparse query. For example, if `sparse_top_k=5` is set, 5 nodes are retrieved using sparse vectors and 5 nodes using dense vectors.

 - `text_search`: Text search looks for exact keyword matches between the query and documents.

 - `similarity_top_k`: controls the final number of returned nodes. A fusion algorithm is applied to rank and order the nodes from different vector spaces, `similarity_top_k=2` means the top two nodes after fusion are returned.

 - `hybrid_top_k`: return top k results from `hybrid` search. `similarity_top_k` is used for dense search top k

In [10]:
QUERY_STRING = "How can I create my own luck?"

def test_retrievers(query=QUERY_STRING, index=index, **kwargs):
    """Retrieve nodes for ``query`` and print each node's score and text.

    Args:
        query: Query string handed to the retriever. Defaults to ``QUERY_STRING``.
        index: Index to build the retriever from. The default is bound to the
            notebook-level ``index`` at the time this function is defined.
        **kwargs: Forwarded unchanged to ``index.as_retriever`` (for example
            ``vector_store_query_mode``, ``similarity_top_k``, ``alpha``).

    Returns:
        None. Results are printed to stdout.
    """
    retriever_engine = index.as_retriever(**kwargs)
    retrieved_docs = retriever_engine.retrieve(query)
    print(f"Retrieved {len(retrieved_docs)} nodes.")
    print("\n")
    for node in retrieved_docs:
        # A node's score is optional; formatting None with ".2f" would raise
        # TypeError, so fall back to a placeholder label.
        score_label = "n/a" if node.score is None else f"{node.score:.2f}"
        print(f"Score: {score_label} - {node.text}...\n-----\n")
    
# Retriever keyword arguments per experiment, keyed by a display label.
# Insertion order is the order in which the experiments are run.
# NOTE(review): confirm the configured vector store honours `semantic_hybrid`
# and `text_search` rather than silently falling back to another mode.
mode_kwargs = {
    'default': dict(vector_store_query_mode='default', similarity_top_k=3),
    'bm25': dict(vector_store_query_mode='hybrid', alpha=0.0, hybrid_top_k=3),
    'hybrid': dict(vector_store_query_mode='hybrid', alpha=0.25, hybrid_top_k=3),
    'semantic_hybrid': dict(vector_store_query_mode='semantic_hybrid', alpha=0.75, hybrid_top_k=3),
    # 'sparse': {"sparse_top_k":5},
    'text_search': dict(vector_store_query_mode='text_search', similarity_top_k=3),
}

# Run the same query once per configured mode, with a banner before and after.
for mode_name, retriever_kwargs in mode_kwargs.items():
    print(f"Retrieving nodes using: {mode_name} retrieval")
    test_retrievers(**retriever_kwargs)
    print(f"Retrieval with {mode_name} complete...")
    print("\n")

Retrieving nodes using: default retrieval
Retrieved 3 nodes.


Score: 0.37 - became extremely successful. You just had to give them a long enough timescale. It never happens in the timescale you want, or they want, but it does happen. Apply specific knowledge with leverage and eventually, you will get what you deserve. It takes timeeven once you have all of these pieces in place, there is an indeterminate amount of time you have to put in. If youre counting, youll run out of patience before success actually arrives. Everybody wants to get rich immediately, but the world is an efficient place; immediate doesnt work. You do have to put in the time. You do have to put in the hours, and so I think you have to put yourself in the position with the specific knowledge, with accountability, with leverage, with the authentic skill set you have, to be the best in the world at what you do. You have to enjoy it and keep doing it, keep doing it, and keep doing it....
-----

Score: 0.37 - TIME ACTIO

# Hybrid Fusion Retriever

The Hybrid Fusion Retriever combines semantic and keyword-based approaches. It uses a [BM25-based retriever](https://en.wikipedia.org/wiki/Okapi_BM25) alongside a semantic index. [BM25](https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/retrievers/llama-index-retrievers-bm25/llama_index/retrievers/bm25/base.py) is a ranking function used by search engines to estimate the relevance of documents to a given search query.

#### How it works

The system follows a three-step process:

- **Query Generation/Rewriting**: It creates multiple queries from the original user query to better match the user's intent and improve the precision and recall of the retrieved results.

- **Retrieval**: It performs the retrieval for each query over an ensemble of retrievers.

- **Reranking/Fusion**: It combines the results from all queries and applies a reranking step to fuse the top relevant results.

#### ℹ️ Useful knowledge to have as a RAG practitioner

##### Index Fusion Mode

We set the mode to `reciprocal_rerank`. The system merges its index with a BM25 based retriever. This allows it to understand both the semantic relationships (meaningful connections between words) and keywords in the input queries. Other modes are `relative_score`, `dist_based_score`, `simple` .

  - [`reciprocal_rerank`](https://github.com/run-llama/llama_index/blob/f116d75557d6867ed2cc61811a1c2f0b0c4d4ddb/llama-index-core/llama_index/core/retrievers/fusion_retriever.py#L99): Reciprocal rank is a measure of how early a relevant item appears in a ranked list. Lower ranks correspond to higher relevance. This mode fuses the results from multiple sources by giving higher importance to nodes that appear earlier in the rankings across those sources.

  - [`relative_score`](https://github.com/run-llama/llama_index/blob/f116d75557d6867ed2cc61811a1c2f0b0c4d4ddb/llama-index-core/llama_index/core/retrievers/fusion_retriever.py#L135): It scales each score to a range from 0 to 1 using min-max scaling. Then it multiplies each scaled score by a retriever-specific weight. After that, it divides each score by the total number of queries. Basically, it scales, weights, and combines scores from multiple retrieval sources.

  - `dist_based_score`: Same as `relative_score`, but, instead of using the minimum and maximum scores directly, the function calculates them based on the mean and standard deviation of the scores. This reduces the impact of outliers on the scaling process.

  - `simple`: re-orders results based on original scores


##### **[Reciprocal Rerank Algorithm](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf)**

 Since both retrievers calculate a score for the relevance of results, the system uses the reciprocal rerank algorithm to reshuffle the results. This is done without employing additional models or excessive computation, making the process more efficient.
 
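  - 🧮 **Rank Calculation**: For each unique node, calculate its reciprocal rank from each list where it appears. The reciprocal rank of a node in a list is defined as 1 divided by its position in that list (e.g., a node at rank 3 has a reciprocal rank of 1/3). In practice, reciprocal rank fusion adds a smoothing constant $k$ to the denominator, i.e. $1/(k + \text{rank})$ (the paper linked above uses $k = 60$), which is why the fused scores printed below are small (around 0.03) rather than close to 1.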

  - 📊 **Score Aggregation**: Sum up the reciprocal ranks for each node across all lists in which it appears. This aggregated score represents the overall relevance of the node, taking into account its performance across multiple retrieval scenarios.

  - 🥇🥈🥉 **Reordering**: Finally, reorder all nodes based on their aggregated scores, from highest to lowest. This re-ranking step prioritizes nodes that consistently appear in higher ranks across multiple lists, thus likely to be more relevant to the query.

In [11]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core import PromptTemplate
from prompts import QUESTION_GEN_PROMPT

# Dense retriever over the vector index; returns one node per query.
vector_retriever = index.as_retriever(similarity_top_k=1)

# Keyword (BM25) retriever built from the index's docstore; also one node per query.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore,
    similarity_top_k=1,
)

# Custom query-generation prompt. It only takes effect if the
# `query_gen_prompt` argument below is uncommented.
QUERY_GEN_PROMPT_TEMPLATE = PromptTemplate(QUESTION_GEN_PROMPT)

# Fuse both retrievers with reciprocal-rank reranking over the generated queries.
# NOTE(review): each child retriever returns a single node per query, so the
# fused list will likely be much shorter than similarity_top_k=15 — confirm
# that this is intended.
retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=15,
    num_queries=3,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    # query_gen_prompt=QUERY_GEN_PROMPT_TEMPLATE,
)

In [12]:
# Run the fusion retriever on a sample question. The retriever was built with
# verbose=True, so the generated query variants are printed as a side effect.
nodes_with_scores = retriever.retrieve(
    "How can I stop wasting energy on projecting a facade and focus on expanding my potential as a human being?"
)

Generated queries:
1. Techniques to stop projecting a facade and embrace authenticity
2. Strategies for personal growth and maximizing human potential


In [13]:
# Print every fused result: score rounded to two decimals, then the node text.
for node in nodes_with_scores:
    print(f"Score: {node.score:.2f} - {node.text}...\n-----\n")

Score: 0.03 - TIME ACTIONS RESULTS CHOOSING TO GROW YOURSELF I dont believe in specific goals. Scott Adams famously said, Set up systems, not goals. Use your judgment to figure out what kinds of environments you can thrive in, and then create an environment around you so youre statistically likely to succeed. The current environment programs the brain, but the clever brain can choose its upcoming environment. Im not going to be the most successful person on the planet, nor do I want to be. I just want to be the most successful version of myself while working the least hard possible. I want to live in a way that if my life played out 1,000 times, Naval is successful 999 times. Hes not a billionaire, but he does pretty well each time. He may not have nailed life in every regard, but he sets up systems so hes failed in very few places....
-----

Score: 0.02 - Motivation Your mind determines the effect. - Everyone - no matter who he is or where - must know from childhood that whatever occu

In [None]:
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.query_pipeline import InputComponent

from utils import create_query_pipeline
from prompts import HYPE_ANSWER_GEN_PROMPT

# Entry point of the pipeline: carries the user's query into the chain.
input_component = InputComponent()

# Answer-generation prompt, wrapped as a template for the query engine.
HYPE_ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(HYPE_ANSWER_GEN_PROMPT)

# Query engine on top of the fusion retriever, using the custom QA prompt and
# the COMPACT_ACCUMULATE response mode.
rr_fusion_query_engine = RetrieverQueryEngine.from_args(
    retriever,
    response_mode=ResponseMode.COMPACT_ACCUMULATE,
    use_async=True,
    text_qa_template=HYPE_ANSWER_GEN_PROMPT_TEMPLATE,
)

# Chain: input -> query engine, assembled by the project helper.
# NOTE(review): `create_query_pipeline` comes from ../helpers/utils.py; how it
# wires the chain is not visible here.
rr_fusion_chain = [input_component, rr_fusion_query_engine]

rr_fusion_query_pipeline = create_query_pipeline(rr_fusion_chain)