## ActiveLoop's Deep Memory

Idea: train a small NN to improve retrieval

**Note:** Deep Memory training is a paid feature.

In [5]:
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

import nest_asyncio
nest_asyncio.apply()

import random

from llama_parse import LlamaParse

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings
)
from llama_index.core.node_parser import (
    SimpleNodeParser,
    MarkdownElementNodeParser
)

from llama_index.vector_stores.deeplake import DeepLakeVectorStore

from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

'''
HF models 
pip install llama-index-llms-huggingface

from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
'''

In [6]:
# Global configuration (acts as the defaults for LlamaIndex components).
# `Settings` replaces the old ServiceContext configuration.
llm = OpenAI(model="gpt-3.5-turbo-0125")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register the models globally. Without this, components that are not handed
# a model explicitly (e.g. VectorStoreIndex) fall back to the library defaults
# and the `embed_model` configured above is never actually used.
# NOTE(review): if the Deep Lake dataset was already populated with a different
# embedding model, recreate it so all stored vectors come from the same model.
Settings.llm = llm
Settings.embed_model = embed_model

# Seed the global RNG so the random train/test node sampling further down is
# reproducible after "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)

In [7]:
# Convert the PDF files to markdown with LlamaParse
# (up to 1000 pages per day are free).
parser = LlamaParse(verbose=True, result_type="markdown")
file_extractor = {".pdf": parser}

# Alternative input directory: "./data"
reader = SimpleDirectoryReader("./sample_data", file_extractor=file_extractor)
documents = reader.load_data(show_progress=True)

Loading files:   0%|          | 0/2 [00:00<?, ?file/s]

Started parsing the file under job_id 82ef18f9-a08d-41dd-82cb-8010d276b8af


Loading files:  50%|█████     | 1/2 [00:04<00:04,  4.61s/file]

Started parsing the file under job_id cc90d21c-de01-48bf-99bf-58674a9b7639


Loading files: 100%|██████████| 2/2 [00:10<00:00,  5.25s/file]


In [8]:
# Inspect the loaded Document objects (the repr shows each document's metadata and text).
documents

[Document(id_='ada756f2-fd03-4eb3-9155-2fda132a1201', embedding=None, metadata={'file_path': '/Users/sofy/Workspace/finRAG/sample_data/microsoft_sec-10-k_2022_small.pdf', 'file_name': 'microsoft_sec-10-k_2022_small.pdf', 'file_type': 'application/pdf', 'file_size': 947693, 'creation_date': '2024-04-01', 'last_modified_date': '2024-04-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='## UNITED STATES SECURITIES AND EXCHANGE COMMISSION FORM 10-K\n\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Fiscal Year Ended June 30, 2022\n\nCommission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON 91-1144442 (STATE OF INCORPORATION) (I.R.S. ID)\n\nONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080

In [9]:
# Preview the first 1000 characters of the first parsed document's text.
documents[0].text[:1000]

'## UNITED STATES SECURITIES AND EXCHANGE COMMISSION FORM 10-K\n\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Fiscal Year Ended June 30, 2022\n\nCommission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON 91-1144442 (STATE OF INCORPORATION) (I.R.S. ID)\n\nONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor\n\n|Title of each class|Trading Symbol|Name of exchange on which registered|\n|---|---|---|\n|Common stock, $0.00000625 par value per share|MSFT|NASDAQ|\n|3.125% Notes due 2028|MSFT|NASDAQ|\n|2.625% Notes due 2033|MSFT|NASDAQ|\n\nSecurities registered pursuant to Section 12(g) of the Act: NONE\n\nIndicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\n\nIndicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. Yes ☐ No ☒\n\nIndicate by check mark whe

In [None]:
# parse markdown result into LlamaIndex nodes (text and index nodes)
# nodes are smaller chunks from the documents

#node_parser = MarkdownElementNodeParser(llm=llm, num_workers=4, verbose=True)
#nodes = node_parser.get_nodes_from_documents(documents)

In [11]:
# Activeloop (Deep Lake) dataset location
my_activeloop_org_id = "sofdog"
my_activeloop_dataset_name = "finRAG"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Vector store that will back the index over the documents
deeplake_options = {
    "dataset_path": dataset_path,
    "overwrite": False,  # set to True to overwrite the existing dataset
    "runtime": {"tensor_db": True},
}
vector_store = DeepLakeVectorStore(**deeplake_options)



Your Deep Lake dataset has been successfully created!


 

In [14]:
def create_nodes(vector_store, docs=None, populate_vector_store=True):
    """Split documents into nodes and build a storage context for a vector store.

    Args:
        vector_store: Vector store passed to ``StorageContext.from_defaults``.
        docs: Documents to split into nodes. ``None`` (the default) is treated
            as an empty list.
        populate_vector_store: When True, ``docs`` are split with a
            ``SimpleNodeParser`` using ``chunk_size=512``. When False, no
            parsing happens and an empty node list is returned.

    Returns:
        A ``(storage_context, nodes)`` tuple.
    """
    # Use a None sentinel instead of a mutable `[]` default, which would be
    # shared between calls.
    if docs is None:
        docs = []

    nodes = []
    if populate_vector_store:
        node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
        nodes = node_parser.get_nodes_from_documents(docs)

    # By default node ids are random UUIDs. Assign positional ids instead so
    # the same ids are produced on every run.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context, nodes

In [15]:
# Chunk the documents into nodes and wrap the vector store in a storage context.
storage_context, nodes = create_nodes(
    vector_store,
    docs=documents,
    # populate_vector_store=False,  # empty vector store
)

# Build the index over the nodes, then expose it as a Deep Memory retriever.
vector_index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
retriever_options = {"similarity_top_k": 4, "deep_memory": True}
deep_memory_retriever = vector_index.as_retriever(**retriever_options)

Uploading data to deeplake dataset.


100%|██████████| 152/152 [00:02<00:00, 55.59it/s] 
/

Dataset(path='hub://sofdog/finRAG', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
   text       text      (152, 1)      str     None   
 metadata     json      (152, 1)      str     None   
 embedding  embedding  (152, 1536)  float32   None   
    id        text      (152, 1)      str     None   


 

In [17]:
def create_train_test_datasets(
    number_of_samples=500, llm=None, nodes=None, save=False, seed=None
):
    """Sample nodes at random and generate train/test question-context datasets.

    The sampled nodes are split 80/20 into train and test sets, and one
    question per node is generated with ``generate_question_context_pairs``.

    Args:
        number_of_samples: Number of nodes to sample. Capped at ``len(nodes)``
            so that asking for more samples than exist does not fail.
        llm: LLM passed to ``generate_question_context_pairs``.
        nodes: Non-empty sequence of nodes to sample from.
        save: When True, both datasets are written to
            ``finRAG_{number_of_samples}_train.json`` and
            ``finRAG_{number_of_samples}_test.json``.
        seed: Optional seed for a reproducible sample. When None, the global
            ``random`` module state is used.

    Returns:
        A ``(train_qa_dataset, test_qa_dataset)`` tuple.

    Raises:
        ValueError: If ``nodes`` is None or empty.
    """
    if not nodes:
        raise ValueError("`nodes` must be a non-empty sequence of nodes to sample from")

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more than the nodes available.
    sample_size = min(number_of_samples, len(nodes))

    # A private generator keeps a seeded call from disturbing global RNG state.
    rng = random if seed is None else random.Random(seed)
    random_indices = rng.sample(range(len(nodes)), sample_size)

    # First 80% of the shuffled sample is for training, the rest for testing.
    split_at = int(len(random_indices) * 0.8)
    train_nodes = [nodes[i] for i in random_indices[:split_at]]
    test_nodes = [nodes[i] for i in random_indices[split_at:]]

    train_qa_dataset = generate_question_context_pairs(
        train_nodes, llm=llm, num_questions_per_chunk=1
    )
    test_qa_dataset = generate_question_context_pairs(
        test_nodes, llm=llm, num_questions_per_chunk=1
    )

    if save:
        # File names keep the *requested* sample count so callers that rebuild
        # the path from the same argument still find the files.
        train_qa_dataset.save_json(f"finRAG_{number_of_samples}_train.json")
        test_qa_dataset.save_json(f"finRAG_{number_of_samples}_test.json")
    return train_qa_dataset, test_qa_dataset

In [20]:
n_samples = 100
train_path = f"finRAG_{n_samples}_train.json"
test_path = f"finRAG_{n_samples}_test.json"

try:
    # Reuse previously generated datasets when both files exist: generating
    # the question/context pairs calls the LLM once per node, which is slow
    # and costs money. Delete the JSON files to force regeneration.
    train_qa_dataset = EmbeddingQAFinetuneDataset.from_json(train_path)
    test_qa_dataset = EmbeddingQAFinetuneDataset.from_json(test_path)
except FileNotFoundError:
    # Cache miss: generate the datasets and save them for the next run.
    train_qa_dataset, test_qa_dataset = create_train_test_datasets(
        number_of_samples=n_samples, llm=llm, nodes=nodes, save=True
    )

  0%|          | 0/80 [00:00<?, ?it/s]

100%|██████████| 80/80 [01:58<00:00,  1.48s/it]
100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


In [21]:
def create_query_relevance(qa_dataset):
    """Convert a llama-index QA dataset to the format used for deep memory training.

    Args:
        qa_dataset: Object exposing ``queries`` (query id -> query text) and
            ``relevant_docs`` (query id -> list of relevant document ids).

    Returns:
        A ``(queries, relevance)`` tuple of equally long lists, where
        ``relevance[i]`` is ``[(doc_id, 1)]`` for the first relevant document
        of ``queries[i]``.

    Raises:
        KeyError: If a query id has no entry in ``relevant_docs``.
    """
    relevant_docs = qa_dataset.relevant_docs
    queries = []
    relevance = []
    # Walk the queries once and look up relevance by query id, so the two
    # output lists stay aligned even when the two dicts are ordered differently.
    for query_id, text in qa_dataset.queries.items():
        queries.append(text)
        relevance.append([(relevant_docs[query_id][0], 1)])
    return queries, relevance

In [22]:
# Convert both QA datasets into (queries, relevance) lists for deep memory training.
(train_queries, train_relevance), (test_queries, test_relevance) = (
    create_query_relevance(qa_dataset)
    for qa_dataset in (train_qa_dataset, test_qa_dataset)
)

After creating the synthetic dataset of question + query + relevance, the next step is to train Deep Memory, which can then be used for search. However, I realised that Deep Memory is a paid service :(