In [1]:
from util import load_config, set_embedding, set_llm
from ingestion import IngestionEngine
from retrieval import RetrievalEngine
from prompt_settings import PrompSettingsFactory
from pinecone import Pinecone
from typing import Dict, List
from data import Dependency
import os
import json
import backoff


def transform(x: Dict) -> Dependency:
    dependency = Dependency(
        project=x["project"],
        option_name=x["option_name"],
        option_value=x["option_value"],
        option_type=x["option_type"].split(".")[-1],
        option_file=x["option_file"],
        option_technology=x["option_technology"],
        dependent_option_name=x["dependent_option_name"],
        dependent_option_value=x["dependent_option_value"],
        dependent_option_type=x["dependent_option_type"].split(".")[-1],
        dependent_option_file=x["dependent_option_file"],
        dependent_option_technology=x["dependent_option_technology"]
    )

    return dependency


def scrape(ingestion_engine, retrieval_str, num_websites):
    print(f"Start Scraping {num_websites} documents.")
    docs = ingestion_engine.docs_from_web(
        query_str=retrieval_str, 
        num_websites=num_websites
    )
    
    print("Documents found: ", len(docs))

    for d in docs:
        d.metadata["index_name"] = "web-search"

    ingestion_engine.index_documents(
        index_name="web-search",
        documents=docs,
        delete_index=True
    )


@backoff.on_exception(backoff.expo, Exception, max_tries=10)
def retrieve(retrieval_engine, index_name, retrieval_str):
    nodes = retrieval_engine.retrieve(
            index_name=index_name,
            query_str=retrieval_str
    )

    if not nodes:
        raise Exception("Nodes are empty.")

    return nodes


def run_retrieval(config: Dict, index_name: str):
    
    # set up embedding, llm, and pinecone client
    pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    set_llm(inference_model_name=None)
    set_embedding(embed_model_name=config["embedding_model"])
    dimension = config["embedding_dimension"]

    # set up ingestion engine
    ingestion_engine = IngestionEngine(
        pinecone_client=pinecone_client,
        dimension=dimension,
        splitting=config["splitting"],
        extractors=config["extractors"]
    )

    # set up retrieval engine
    retrieval_engine = RetrievalEngine(
        pinecone_client=pinecone_client,
        rerank=config["rerank"],
        top_k=config["top_k"],
        top_n=config["top_n"],
        alpha=config["alpha"]
    )

    # set up prompt settings
    prompt_settings = PrompSettingsFactory.get_prompt_settings(tool_name=config["tool_name"])
    
    with open("../data/evaluation/config6/all_dependencies_all_updated.json", "r", encoding="utf-8") as src:
        all_data = json.load(src)

    with open("../data/evaluation/config6/all_dependencies_web-search_updated.json", "r", encoding="utf-8") as src:
        web_data = json.load(src)
    
    for entry, web_entry in zip(all_data, web_data):

        if not len(web_entry["context"]) == 0:
            continue
        
        print(f"Index {entry['index']}")


        dependency = transform(x=entry["dependency"])
        retrieval_str = prompt_settings.get_retrieval_prompt(dependency=dependency)

        # scrape web
        if index_name == "all":
            scrape(
                ingestion_engine=ingestion_engine,
                retrieval_str=retrieval_str,
                num_websites=config["num_websites"]
            )

            try:
                retrieved_nodes = retrieve(
                    retrieval_engine=retrieval_engine,
                    index_name="web-search",
                    retrieval_str=retrieval_str
                )
            except Exception:
                retrieved_nodes = []

            context_str = "\n\n".join([source_node.node.get_content() for source_node in retrieved_nodes])
            context = [
                {
                    "content": node.get_content(),
                    "score": str(node.get_score()),
                    "index": node.metadata["index_name"] if "index_name" in node.metadata else None,
                    "id": str(node.node_id)
                } for node in retrieved_nodes
            ]

            web_entry["context_str"] = context_str
            web_entry["context"] = context


        retrieved_nodes = retrieve(
            retrieval_engine=retrieval_engine,
            index_name=index_name,
            retrieval_str=retrieval_str
        )
        
        context_str = "\n\n".join([source_node.node.get_content() for source_node in retrieved_nodes])

        context = [
            {
                "content": node.get_content(),
                "score": str(node.get_score()),
                "index": node.metadata["index_name"] if "index_name" in node.metadata else None,
                "id": str(node.node_id)
            } for node in retrieved_nodes
        ]
        
        entry["context_str"] = context_str
        entry["context"] = context
        
        print(f"Done with index {entry['index']}.")
   
    all_output_file = f"../data/evaluation/config6/all_dependencies_all_updated.json"
    with open(all_output_file, "w", encoding="utf-8") as dest:
        json.dump(all_data, dest, indent=2)


    web_output_file = "../data/evaluation/config6/all_dependencies_web-search_updated.json"
    with open(web_output_file, "w", encoding="utf-8") as dest:
        json.dump(web_data, dest, indent=2)

In [2]:
config_file = "../retrieval_config.toml"
index_name = "all"
target_index = []


# load config
config = load_config(config_file=config_file)

os.environ["PINECONE_API_KEY"] = config["pinecone_key"]
print("Pinecone Key: ", os.getenv("PINECONE_API_KEY"))
    
# TODO


run_retrieval(
    config=config, 
    index_name=index_name
)

Pinecone Key:  4bc3fa0d-a789-4187-aa8f-d6b17d0ea6a3
LLM is explicitly disabled. Using MockLLM.
Set Qwen Embedding.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Index 34
Start Scraping 3 documents.
Documents found:  3


Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/30 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done with index 34.
Index 80
Start Scraping 3 documents.
Documents found:  3


Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/72 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/72 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done with index 80.
Index 362
Start Scraping 3 documents.
Documents found:  0


Parsing nodes: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Upserted vectors: 0it [00:00, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done with index 362.
Index 381
Start Scraping 3 documents.
Documents found:  0


Parsing nodes: 0it [00:00, ?it/s]

Generating embeddings: 0it [00:00, ?it/s]

Upserted vectors: 0it [00:00, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Done with index 381.
