In [1]:
from util import load_config, set_embedding, get_embedding_dimension, set_llm
from dotenv import load_dotenv
from pinecone import Pinecone
from llama_index.core import Settings
from ingestion import IngestionEngine
from retrieval import RetrievalEngine
from prompt_settings import PrompSettingsFactory
import os

# Locations of the retrieval configuration and the environment file.
config_file = "../retrieval_config.toml"
env_file = "../.env"

# Retrieval/ingestion settings used by every later cell.
config = load_config(config_file=config_file)

# Load environment variables (including PINECONE_API_KEY) from the env file.
load_dotenv(dotenv_path=env_file)

# Only report whether the key is present. Printing the key itself would store
# the secret in the notebook's saved output, where it leaks on share/commit.
print("Pinecone Key: ", "<redacted>" if os.getenv("PINECONE_API_KEY") else "<missing>")

Pinecone Key:  <redacted>


In [2]:
# Pinecone client authenticated with the API key taken from the environment.
pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# No inference model is requested here; the logged output of this cell reports
# that the LLM is disabled and a MockLLM is used instead.
set_llm(inference_model_name=None)
# Embedding model is selected by name from the config file.
set_embedding(embed_model_name=config["embedding_model"])
# Embedding dimension is read from the config file.
# NOTE(review): get_embedding_dimension is imported but not used here — confirm
# the configured value matches the selected embedding model.
dimension = config["embedding_dimension"]

LLM is explicitly disabled. Using MockLLM.
Set Qwen Embedding.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Ingestion engine: shares the Pinecone client and embedding dimension, and
# takes its splitting/extractor options straight from the config.
ingestion_engine = IngestionEngine(
    pinecone_client=pinecone_client,
    dimension=dimension,
    **{option: config[option] for option in ("splitting", "extractors")},
)

# Retrieval engine: every tuning option is forwarded from the config under the
# same keyword name.
retrieval_engine = RetrievalEngine(
    pinecone_client=pinecone_client,
    **{option: config[option] for option in ("rerank", "top_k", "top_n", "alpha")},
)

# Prompt settings are chosen by the tool name given in the config.
prompt_settings = PrompSettingsFactory.get_prompt_settings(
    tool_name=config["tool_name"],
)

In [4]:
from data import Dependency
import pandas as pd

def transform(row: pd.Series) -> Dependency:
    """Build a ``Dependency`` from one row of the evaluation data.

    Each listed column is passed to ``Dependency`` under the same keyword name.
    The two ``*option_type`` columns are shortened to the text after the last
    ``"."`` before being passed on.

    Args:
        row: A row providing all of the columns listed below.

    Returns:
        The ``Dependency`` constructed from the row's values.
    """
    columns = (
        "project",
        "option_name",
        "option_value",
        "option_type",
        "option_file",
        "option_technology",
        "dependent_option_name",
        "dependent_option_value",
        "dependent_option_type",
        "dependent_option_file",
        "dependent_option_technology",
    )

    kwargs = {}
    for column in columns:
        value = row[column]
        # Matches exactly "option_type" and "dependent_option_type".
        if column.endswith("option_type"):
            value = value.split(".")[-1]
        kwargs[column] = value

    return Dependency(**kwargs)

In [12]:
def scrape(dependency: Dependency) -> None:
    """Fetch web documents for a dependency and index them under "web-search".

    The dependency's retrieval prompt is used as the web query, and the number
    of websites comes from ``config["num_websites"]``. Every fetched document
    is tagged with ``index_name`` metadata before indexing.

    Args:
        dependency: The dependency whose retrieval prompt drives the web search.
    """
    target_index = "web-search"
    search_query = prompt_settings.get_retrieval_prompt(dependency=dependency)

    web_docs = ingestion_engine.docs_from_web(
        query_str=search_query,
        num_websites=config["num_websites"],
    )

    # Record the index on each document so it can be read back from the
    # metadata of retrieved nodes.
    for web_doc in web_docs:
        web_doc.metadata["index_name"] = target_index

    # delete_index=True is passed on every call — presumably so the index only
    # holds the documents of the current dependency; verify in IngestionEngine.
    ingestion_engine.index_documents(
        index_name=target_index,
        documents=web_docs,
        delete_index=True,
    )

In [13]:
import pandas as pd
import json

# Index to retrieve from; it is also embedded in the output file name.
index_name = "web-search"
eval_data_file = "../data/evaluation/all_dependencies.csv"
output_file = f"../data/evaluation/all_dependencies_{index_name}.json"
df = pd.read_csv(eval_data_file)

# Web content is scraped only when the web-search index takes part in the
# retrieval. The previous check compared against the builtin function `all`
# instead of the string "all", so that branch could never be taken.
needs_scrape = index_name in ("web-search", "all")

# One record per dependency: prompts plus the retrieved context.
queries = []
for index, row in df.iterrows():
    dependency = transform(row=row)

    system_str = prompt_settings.get_system_str(dependency=dependency)
    task_str = prompt_settings.get_task_str(dependency=dependency)
    retrieval_str = prompt_settings.get_retrieval_prompt(dependency=dependency)

    # Refresh the web-search index for this dependency before retrieving.
    if needs_scrape:
        scrape(dependency=dependency)

    retrieved_nodes = retrieval_engine.retrieve(
        index_name=index_name,
        query_str=retrieval_str
    )

    # Plain-text context: contents of all retrieved nodes, blank-line separated.
    context_str = "\n\n".join([source_node.node.get_content() for source_node in retrieved_nodes])

    # Structured context: per-node content, score, originating index and id.
    context = [
        {
            "content": node.get_content(),
            "score": node.get_score(),
            # None when the node carries no index_name metadata.
            "index": node.metadata.get("index_name"),
            "id": node.node_id
        } for node in retrieved_nodes
    ]

    queries.append({
        "index": index,
        "dependency": dependency.to_dict(),
        "system_str": system_str,
        "task_str": task_str,
        "context_str": context_str,
        "context": context
    })


# Persist all records as a single JSON document.
with open(output_file, "w", encoding="utf-8") as dest:
    json.dump(queries, dest, indent=2)

Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]