In [10]:
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables from the `.env` file in the current user's home
# directory. Resolving the path via `Path.home()` avoids a hard-coded,
# machine-specific absolute path so the notebook also runs on other machines.
load_dotenv(Path.home() / ".env")

True

In [23]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from llama_index.schema import Document
from llama_index.evaluation import generate_question_context_pairs
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
)
from llama_index.vector_stores import DeepLakeVectorStore
from llama_index.node_parser import SimpleNodeParser
from llama_index.llms import OpenAI
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
import random
from langchain.embeddings.openai import OpenAIEmbeddings
import pandas as pd
from llama_index.evaluation import RetrieverEvaluator


In [4]:
def get_all_links(url, timeout=10):
    """Collect the absolute URLs of all hyperlinks found on a web page.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` may block indefinitely on an unresponsive
            host.

    Returns:
        A list of URLs in document order (duplicates are kept), each resolved
        against `url`. An empty list when the response status is not 200.

    Raises:
        requests.RequestException: propagated from `requests.get`, e.g. on a
            connection error or when `timeout` is exceeded.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Keep only anchors that carry a non-empty href; relative targets are
    # turned into absolute URLs by joining them with the page address.
    return [
        urljoin(url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"]
    ]

In [7]:
def load_documents(url, max_links=10):
    """Download the pages linked from `url` and convert them to llama-index documents.

    Args:
        url: Page whose outgoing links are fetched.
        max_links: Maximum number of links (taken from the start of the link
            list) to download. Pass `None` to download every link. Defaults to
            10, the limit that was previously hard-coded.

    Returns:
        A list of llama-index `Document` objects, one per downloaded page, with
        the HTML converted to plain text. Empty when no links were found.
    """
    all_links = get_all_links(url)
    if max_links is not None:
        all_links = all_links[:max_links]
    if not all_links:
        # Nothing to download; skip the loader/transformer round trip.
        return []

    loader = AsyncHtmlLoader(all_links)
    html_docs = loader.load()

    # Strip the HTML markup, then convert from LangChain's document type to
    # the llama-index one.
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents(html_docs)
    return [Document.from_langchain_format(doc) for doc in docs_transformed]


# Load the pages linked from the Deep Lake documentation page at this URL.
docs = load_documents("https://docs.deeplake.ai/en/latest/")
# Bare last expression: renders the loaded documents as the cell output.
docs

Fetching pages: 100%|##########| 10/10 [00:05<00:00,  1.99it/s]


[Document(id_='454bec35-eeb8-46d6-af40-3e2923bccc16', embedding=None, metadata={'source': 'https://docs.deeplake.ai/en/latest/', 'title': 'Deep Lake API Reference — Deep Lake 3.8.20 documentation', 'language': 'en'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='latest\n\nGetting Started\n\n  * Installation\n\nKey Concepts\n\n  * Datasets\n  * Vector Store\n  * Tensors\n  * Htypes\n  * Compressions\n  * PyTorch and Tensorflow Support\n  * Utility Functions\n\nIntegrations\n\n  * Weights and Biases\n  * MMDetection\n\nHigh-Performance Features\n\n  * Dataloader\n  * Sampler\n  * Tensor Query Language\n  * Random Split\n  * Deep Memory\n\nAPI Reference\n\n  * deeplake\n  * deeplake.VectorStore\n  * deeplake.core\n  * deeplake.core.dataset\n  * deeplake.core.tensor\n  * deeplake.api\n  * deeplake.auto\n  * deeplake.util\n  * deeplake.client.log\n  * deeplake.core.transform\n  * deeplake.core.vectorstore.deep_memory\n  * deeplake.random.seed\n\n__D

In [11]:
# Deep Lake dataset that stores the text, metadata, embeddings and ids.
deeplake_dataset_path = "hub://srishtysuman2919/deeplake_docs_deepmemory2"

vector_store = DeepLakeVectorStore(
    dataset_path=deeplake_dataset_path,
    # Flip to True to overwrite the existing dataset.
    overwrite=False,
    runtime={"tensor_db": True},
)

Your Deep Lake dataset has been successfully created!


 

In [12]:
def create_storage_and_service_contexts(vector_store, docs=None, populate_vector_store=True):
    """Build the llama-index contexts and, optionally, the nodes to index.

    Args:
        vector_store: Vector store that the storage context wraps.
        docs: Documents to split into nodes. `None` (the default) is treated
            as an empty list; a `None` sentinel is used instead of a mutable
            `[]` default.
        populate_vector_store: When True, `docs` are split into chunks of 512
            and returned as nodes; when False, no nodes are produced.

    Returns:
        A tuple `(service_context, storage_context, nodes, llm)`.
    """
    nodes = []
    if populate_vector_store:
        node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
        nodes = node_parser.get_nodes_from_documents(docs or [])

        # By default the node ids are random uuids. Assign deterministic ids
        # so that the same node gets the same id on every run.
        for idx, node in enumerate(nodes):
            node.id_ = f"node_{idx}"

    llm = OpenAI(model="gpt-3.5-turbo-instruct")
    service_context = ServiceContext.from_defaults(llm=llm)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return service_context, storage_context, nodes, llm

In [13]:
# Split the loaded documents into nodes and build the contexts used for indexing.
service_context, storage_context, nodes, llm = create_storage_and_service_contexts(
    vector_store=vector_store,
    docs=docs,
    populate_vector_store=True,
)

In [14]:
# Build the index over the nodes; the data is stored in the Deep Lake vector store.
vector_index = VectorStoreIndex(
    nodes,
    service_context=service_context,
    storage_context=storage_context,
)

# Deep Memory has to be requested through `vector_store_kwargs`, as the other
# retriever/query-engine cells in this notebook do. A bare `deep_memory=True`
# keyword is not forwarded to the vector store query.
deep_memory_retriever = vector_index.as_retriever(
    similarity_top_k=4,
    vector_store_kwargs={"deep_memory": True},
)

Uploading data to deeplake dataset.


100%|██████████| 55/55 [00:04<00:00, 11.26it/s]
/

Dataset(path='hub://srishtysuman2919/deeplake_docs_deepmemory2', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (55, 1)      str     None   
 metadata     json      (55, 1)      str     None   
 embedding  embedding  (55, 1536)  float32   None   
    id        text      (55, 1)      str     None   


 

    Training Deep Memory
    
    We need queries and relevance, together with the corpus data (the data that we want to query). The corpus data was already populated in the previous section; here, we will generate the questions and their relevance.

    questions - a list of strings, where each string represents a query.

    relevance - contains links to the ground truth for each question. There might be several docs that contain an answer to the given question. Because of this, relevance is List[List[tuple[str, float]]], where the outer list represents queries and the inner list relevant documents. The tuple contains a str, float pair where the string represents the id of the source doc (corresponds to the id tensor in the dataset), while the float corresponds to how much the current document is related to the question.

In [None]:
def create_train_test_datasets(number_of_samples=600, llm=None, nodes=None, save=False):
    """Generate train/test question-context datasets from a random subset of nodes.

    Args:
        number_of_samples: Number of nodes to sample. If it exceeds the number
            of available nodes, all nodes are used instead of raising
            `ValueError` from `random.sample`.
        llm: Language model passed to `generate_question_context_pairs`.
        nodes: Nodes to sample from. `None` is treated as an empty list.
        save: When True, both datasets are written to JSON files named after
            the *requested* `number_of_samples`.

    Returns:
        A tuple `(train_qa_dataset, test_qa_dataset)`; the first 80% of the
        sampled nodes go to the train set, the remainder to the test set.
    """
    nodes = [] if nodes is None else nodes

    # Never ask for more samples than there are nodes.
    sample_size = min(number_of_samples, len(nodes))
    random_indices = random.sample(range(len(nodes)), sample_size)

    split_at = int(len(random_indices) * 0.8)
    train_nodes = [nodes[i] for i in random_indices[:split_at]]
    test_nodes = [nodes[i] for i in random_indices[split_at:]]

    train_qa_dataset = generate_question_context_pairs(train_nodes, llm=llm, num_questions_per_chunk=1)
    test_qa_dataset = generate_question_context_pairs(test_nodes, llm=llm, num_questions_per_chunk=1)

    # [optional] save
    if save:
        train_qa_dataset.save_json(f"deeplake_docs_{number_of_samples}_train.json")
        test_qa_dataset.save_json(f"deeplake_docs_{number_of_samples}_test.json")
    return train_qa_dataset, test_qa_dataset

In [19]:
# Sample 10 nodes, generate one question per node, and save both splits as JSON.
train_qa_dataset, test_qa_dataset = create_train_test_datasets(
    number_of_samples=10,
    llm=llm,
    nodes=nodes,
    save=True,
)

100%|██████████| 8/8 [01:42<00:00, 12.83s/it]
100%|██████████| 2/2 [00:46<00:00, 23.27s/it]


In [24]:
# Reload the datasets from the JSON files written for the 10-sample run.
train_json_path = "deeplake_docs_10_train.json"
test_json_path = "deeplake_docs_10_test.json"

train_qa_dataset = EmbeddingQAFinetuneDataset.from_json(train_json_path)
test_qa_dataset = EmbeddingQAFinetuneDataset.from_json(test_json_path)

In [25]:
def create_query_relevance(qa_dataset):
    """Convert a llama-index QA dataset to the format used for Deep Memory training.

    Args:
        qa_dataset: Object exposing `queries` (mapping query id -> query text)
            and `relevant_docs` (mapping query id -> list of relevant doc ids).

    Returns:
        A tuple `(queries, relevance)` of equal length. `queries[i]` is the
        query text and `relevance[i]` is a list of `(doc_id, 1)` tuples, one
        for each document relevant to that same query.

    Raises:
        KeyError: if a query id has no entry in `relevant_docs`.
    """
    queries = []
    relevance = []
    # Drive both lists from the same query id so that position i always refers
    # to the same query, regardless of the key order of `relevant_docs`.
    for query_id, text in qa_dataset.queries.items():
        queries.append(text)
        relevance.append([(doc_id, 1) for doc_id in qa_dataset.relevant_docs[query_id]])
    return queries, relevance

In [26]:
# Convert both splits into (queries, relevance) pairs.
train_queries, train_relevance = create_query_relevance(qa_dataset=train_qa_dataset)
test_queries, test_relevance = create_query_relevance(qa_dataset=test_qa_dataset)

In [28]:
embeddings = OpenAIEmbeddings()

# Start a Deep Memory training job on the dataset behind the vector store,
# using the training queries and their relevance labels.
deep_memory_api = vector_store._vectorstore.deep_memory
job_id = deep_memory_api.train(
    queries=train_queries,
    relevance=train_relevance,
    embedding_function=embeddings.embed_documents,
)

Starting DeepMemory training job


Your Deep Lake dataset has been successfully created!


 

Preparing training data for deepmemory:


Creating 85 embeddings in 1 batches of size 85:: 100%|██████████| 1/1 [00:22<00:00, 22.70s/it]


DeepMemory training job started. Job ID: 65c5d4b03ccbda4c0dba81ed


    DeepMemory Evaluation

In [None]:
# Evaluate Deep Memory on the held-out test queries. The underlying Deep Lake
# store is reached through `_vectorstore`, the same attribute the training
# cell uses.
# NOTE(review): presumably the training job started above must have finished
# before this evaluation is meaningful — confirm.
recalls = vector_store._vectorstore.deep_memory.evaluate(
    queries=test_queries,
    relevance=test_relevance,
    embedding_function=embeddings.embed_documents,
)

Use RetrieverEvaluator to examine the MRR (Mean Reciprocal Rank) and hit rates.

In [22]:
def display_results(eval_results):
    """Summarise retrieval evaluation results as one row per retriever.

    Args:
        eval_results: Mapping from a retriever label to an iterable of result
            objects, each exposing a `metric_vals_dict` that contains
            "hit_rate" and "mrr" entries.

    Returns:
        A pandas DataFrame with the columns "retrievers", "hit_rate" and
        "mrr", holding the mean of each metric per label in the mapping's
        iteration order. Any number of retrievers is supported.
    """
    summary_rows = []
    for name, eval_result in eval_results.items():
        full_df = pd.DataFrame([er.metric_vals_dict for er in eval_result])
        summary_rows.append(
            {
                "retrievers": name,
                "hit_rate": full_df["hit_rate"].mean(),
                "mrr": full_df["mrr"].mean(),
            }
        )

    # Explicit columns keep the frame's shape stable even when there are no rows.
    return pd.DataFrame(summary_rows, columns=["retrievers", "hit_rate", "mrr"])

Evaluating performance of retrieval with deep memory:

In [29]:

deep_memory_retriever = vector_index.as_retriever( similarity_top_k=10, vector_store_kwargs={"deep_memory": True})
dm_retriever_evaluator = RetrieverEvaluator.from_metric_names(["mrr", "hit_rate"], retriever=deep_memory_retriever)

dm_eval_results = await dm_retriever_evaluator.aevaluate_dataset(test_qa_dataset, retriever=dm_retriever_evaluator)

In [30]:
naive_retriever = vector_index.as_retriever(similarity_top_k=10)
naive_retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=naive_retriever
)

naive_eval_results = await naive_retriever_evaluator.aevaluate_dataset(
    test_qa_dataset, retriever=naive_retriever
)

In [31]:
# Label each result set. The previous label template produced
# "with with Deep Memory ..." / "without with Deep Memory ...".
eval_results = {
    "with Deep Memory top-10 eval": dm_eval_results,
    "without Deep Memory top-10 eval": naive_eval_results,
}

display_results(eval_results)

Unnamed: 0,retrievers,hit_rate,mrr
0,with with Deep Memory top-10 eval,0.363636,0.129347
1,without with Deep Memory top-10 eval,0.909091,0.407197


Deep Memory Inference

In [32]:
# Ask a question with Deep Memory enabled for retrieval.
deep_memory_question = "How can you connect your own storage to the deeplake?"

query_engine = vector_index.as_query_engine(vector_store_kwargs={"deep_memory": True})
response = query_engine.query(deep_memory_question)
print(response)



You can connect your own storage to the deeplake by using the Vector Store feature, which allows you to store and access your data from any storage provider. You can also use the deeplake API to connect to your own storage and access your data.


In [33]:
# Ask the same question with Deep Memory disabled, for comparison.
baseline_question = "How can you connect your own storage to the deeplake?"

query_engine = vector_index.as_query_engine(vector_store_kwargs={"deep_memory": False})
response = query_engine.query(baseline_question)
print(response)


You can connect your own storage to deeplake by using the deeplake.VectorStore class, which allows you to create a custom vector store that can be used to store and retrieve data from your own storage system. This can be done by implementing the necessary methods and interfaces in your custom vector store class.
