In [1]:
# One-time setup, currently disabled: uncomment the two shell commands below to create
# the data directory and download the sample essay into it.
# ! mkdir -p 'data/srishty/'
# ! curl 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -o 'data/srishty/srishty_essay.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75042  100 75042    0     0   134k      0 --:--:-- --:--:-- --:--:--  135k


In [1]:
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables (presumably the API keys used later) from ~/.env.
# The path is resolved from the current user's home directory rather than a
# hard-coded absolute path, and the file is loaded exactly once; the return
# value is reused for the status print.
env_loaded = load_dotenv(Path.home() / ".env")
print(env_loaded)

True


In [26]:
from llama_index.node_parser import SimpleNodeParser
from llama_index import SimpleDirectoryReader
from llama_index import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores import DeepLakeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms import OpenAI
import deeplake
from openai import OpenAI
import random
from tqdm import tqdm
from langchain.embeddings.openai import OpenAIEmbeddings
from llama_index.llms import OpenAI

Create the Llama-index nodes/chunks

In [3]:
# Read every file found under the data directory; SimpleDirectoryReader chooses a
# suitable file reader from each file's extension.
directory_reader = SimpleDirectoryReader("./data/srishty/")
documents = directory_reader.load_data()

In [4]:
# Parser that splits text into chunks, preferring complete sentences.
# (SimpleNodeParser is deprecated in favour of SentenceSplitter.)
node_parser = SimpleNodeParser.from_defaults(
    chunk_size=512,
)

In [5]:
# Split the loaded documents into nodes (chunks) using the parser configured above.
# These nodes are what gets embedded and stored in the vector store later on.
nodes = node_parser.get_nodes_from_documents(documents)
# nodes

In [6]:
# Node ids default to random UUIDs, which differ between runs. Overwrite them with
# position-based ids ("node_0", "node_1", ...) so the same chunk keeps the same id.
for position, chunk in enumerate(nodes):
    chunk.id_ = "node_" + str(position)

In [7]:
# Sanity check: report how many documents were loaded and how many chunks the
# parser produced for the configured chunk size.
print(f"Number of Documents: {len(documents)}")
print(f"Number of nodes: {len(nodes)} with the current chunk size of {node_parser.chunk_size}")

Number of Documents: 1
Number of nodes: 57 with the current chunk size of 512


Create a local Deep Lake vector store 

In [10]:
# Create a DeepLakeVectorStore locally to store the vectors.
# NOTE(review): overwrite=True presumably replaces any dataset already present at
# this path on every run - confirm against the Deep Lake documentation.
dataset_path = "./data/srishty/deep_lake_db"
vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)



In [8]:
# LLM that will answer questions with the retrieved context.
# NOTE: `OpenAI` must be the llama_index LLM class here. The imports cell imports a
# class of the same name from both `llama_index.llms` and `openai`; the last import
# (llama_index) is the one bound to the name.
llm = OpenAI(model="gpt-3.5-turbo-1106")
# Embedding model with default settings, used to embed the nodes for the index.
embed_model = OpenAIEmbedding()


In [11]:
# Storage context backed by the local Deep Lake vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Service context bundling the answering LLM with the embedding model.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

In [12]:
# Embed all nodes and write them into the vector store, showing a progress bar.
vector_index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 57/57 [00:02<00:00, 21.98it/s]


Uploading data to deeplake dataset.


100%|██████████| 57/57 [00:00<00:00, 443.63it/s]

Dataset(path='./data/srishty/deep_lake_db', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (57, 1)      str     None   
 metadata     json      (57, 1)      str     None   
 embedding  embedding  (57, 1536)  float32   None   
    id        text      (57, 1)      str     None   





Upload the local Vector Store to Activeloop's platform and convert it into a managed database.

In [15]:
# Locations used for the upload:
#   local            - the on-disk dataset (same path the local vector store was created at)
#   hub_path         - destination for the plain copy on Activeloop
#   hub_managed_path - destination for the managed copy
local = "./data/srishty/deep_lake_db"
hub_path = "hub://srishtysuman2919/optimization_srishty"
hub_managed_path = "hub://srishtysuman2919/optimization_srishty_managed"

In [14]:
# First upload our local vector store: copy the on-disk dataset to `hub_path`.
# overwrite=True - presumably replaces a dataset already present at the destination.
# The call is the cell's last expression, so the resulting dataset is displayed.
deeplake.deepcopy(local, hub_path, overwrite=True)

Copying dataset: 96%|█████████▋| 27/28 [00:44<00:01


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/srishtysuman2919/optimization_srishty
Your Deep Lake dataset has been successfully created!


Dataset(path='hub://srishtysuman2919/optimization_srishty', tensors=['embedding', 'id', 'metadata', 'text'])

In [14]:
# Load the uploaded dataset back from Activeloop. Reuse `hub_path` instead of
# repeating the URL so this always refers to the dataset that was just copied.
ds = deeplake.load(hub_path)

/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/srishtysuman2919/optimization_srishty



-

hub://srishtysuman2919/optimization_srishty loaded successfully.



 

In [16]:
# Create a managed vector store under a different name: copy the hub dataset to
# `hub_managed_path`, passing runtime={"tensor_db": True} (presumably the flag that
# makes the copy a managed database - confirm against the Deep Lake docs).
deeplake.deepcopy(hub_path, hub_managed_path, overwrite=True, runtime={"tensor_db": True})

Copying dataset: 96%|█████████▋| 27/28 [01:07<00:02


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/srishtysuman2919/optimization_srishty_managed
Your Deep Lake dataset has been successfully created!


Dataset(path='hub://srishtysuman2919/optimization_srishty_managed', tensors=['embedding', 'id', 'metadata', 'text'])

Instantiate a Vector Store with the managed dataset that we just created.

In [17]:
# Open the managed dataset as a vector store. overwrite=False keeps the existing
# data and read_only=True opens it without write access.
db = DeepLakeVectorStore(dataset_path=hub_managed_path, overwrite=False, read_only=True)

Deep Lake Dataset in hub://srishtysuman2919/optimization_srishty_managed already exists, loading from the storage


Fetching our docs and ids from the vector store.

In [20]:
# Fetch the chunk texts and their ids from the managed dataset.
# NOTE: `_vectorstore` is a private attribute of DeepLakeVectorStore and may change
# between library versions.
deeplake_dataset = db._vectorstore.dataset
docs = deeplake_dataset.text.data(fetch_chunks=True, aslist=True)["value"]
ids = deeplake_dataset.id.data(fetch_chunks=True, aslist=True)["value"]

# Texts and ids are paired by position when queries are generated, so a length
# mismatch would silently attach wrong relevance labels. Fail loudly instead.
assert len(docs) == len(ids), f"text/id count mismatch: {len(docs)} texts vs {len(ids)} ids"
print(len(docs))

57


Generating a synthetic training dataset.

In [22]:
# The imports cell ends by binding the name `OpenAI` to llama_index's LLM wrapper,
# which does not expose the `chat.completions.create` call used by
# generate_question. Import the official SDK client under its own alias so the
# correct class is instantiated regardless of import order.
from openai import OpenAI as OpenAIClient

client = OpenAIClient()

def generate_question(text):
    """Ask the chat model for one question that `text` can answer.

    Args:
        text: The context passage sent to the model as the user message.

    Returns:
        The generated question, or the sentinel string "No question generated"
        when the request fails or the model returns no content. Callers compare
        against this exact sentinel.
    """
    # Built with implicit concatenation so that no source indentation leaks into
    # the prompt (a backslash continuation inside the literal would embed it).
    system_prompt = (
        "You are a world class expert for generating questions based on provided context. "
        "You make sure the question can be answered by the text."
    )
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        question = response.choices[0].message.content
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt still stops
        # a long-running generation loop. Report the cause instead of hiding it.
        print(f"Question generation failed: {exc!r}")
        return "No question generated"

    # An empty or missing reply is treated like a failed generation.
    return question if question else "No question generated"

In [24]:
def generate_queries(docs: list[str], ids: list[str], n: int, max_attempts: "int | None" = None):
    """Generate `n` synthetic questions, each labelled with the id of its source text.

    A text is drawn at random (with replacement) from `docs`, a question is
    generated for it, and the id at the same position in `ids` is recorded as
    the single relevant document for that question.

    Args:
        docs: Candidate context passages.
        ids: Identifiers aligned by position with `docs`.
        n: Number of questions to produce.
        max_attempts: Upper bound on generation attempts, failed ones included.
            Defaults to 20 * n. Without a bound, a persistently failing
            generator would keep this loop running forever.

    Returns:
        A tuple (questions, relevances). `questions` is a list of `n` strings
        and `relevances` a list of `n` one-element lists of (id, 1) pairs.

    Raises:
        ValueError: If questions are requested but `docs` is empty.
        RuntimeError: If `max_attempts` is exhausted before `n` questions exist.
    """
    if n > 0 and not docs:
        raise ValueError("docs must not be empty when n > 0")
    if max_attempts is None:
        max_attempts = 20 * max(n, 0)

    questions = []
    relevances = []
    attempts = 0
    # The context manager closes the progress bar even if an error escapes.
    with tqdm(total=n) as pbar:
        while len(questions) < n:
            if attempts >= max_attempts:
                raise RuntimeError(
                    f"Only {len(questions)} of {n} questions generated after {attempts} attempts"
                )
            attempts += 1

            # 1. randomly draw a piece of text and its relevance id
            r = random.randrange(len(docs))
            text, label = docs[r], ids[r]

            # 2. generate a question; skip the draw if generation failed
            question = generate_question(text)
            if question == "No question generated":
                print("No question generated")
                continue

            questions.append(question)
            relevances.append([(label, 1)])
            pbar.update(1)

    return questions, relevances

Launch the query generation process with a desired size of 40 queries/questions. 

In [25]:
# Build the synthetic training set: 40 generated questions, each paired with the
# id of the chunk it was generated from. Then show the count and a sample question.
questions, relevances = generate_queries(docs=docs, ids=ids, n=40)
print(len(questions))
print(questions[0])

 65%|██████▌   | 26/40 [07:55<04:58, 21.35s/it]

No question generated


100%|██████████| 40/40 [14:00<00:00, 21.02s/it]

40
What was the distinctive thing about Y Combinator (YC) and how did the founders come up with the batch model concept?





Launch Deep Memory Training

In [28]:
# Embedding wrapper (from langchain) whose `embed_documents` method is handed to
# Deep Memory to embed the training queries.
openai_embeddings = OpenAIEmbeddings()

# Start a Deep Memory training job on the managed dataset with the generated
# questions and their relevance labels. The returned job id is kept so the job
# can be looked up later.
job_id = db._vectorstore.deep_memory.train(
    queries=questions,
    relevance=relevances,
    embedding_function=openai_embeddings.embed_documents,
)

Starting DeepMemory training job


  warn_deprecated(


Your Deep Lake dataset has been successfully created!


 

Preparing training data for deepmemory:


Creating 40 embeddings in 1 batches of size 40:: 100%|██████████| 1/1 [00:20<00:00, 20.24s/it]


DeepMemory training job started. Job ID: 65c5c5503ccbda4c0dba81e8


In [30]:
# During training you can check the status of the training run.
# Re-run this cell to refresh; the job is identified by the id returned from train().
db._vectorstore.deep_memory.status(job_id=job_id)

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/srishtysuman2919/optimization_srishty_managed
--------------------------------------------------------------
|                  65c5c5503ccbda4c0dba81e8                  |
--------------------------------------------------------------
| status                     | pending                       |
--------------------------------------------------------------
| progress                   | None                          |
--------------------------------------------------------------
| results                    | not available yet             |
--------------------------------------------------------------




Run a Deep Memory-enabled inference by setting deep_memory=True

In [None]:
query = "What are the main things Paul worked on before college?"

# LLM and embedding model used to answer the query (llama_index classes).
llm = OpenAI(model="gpt-3.5-turbo-1106")
embed_model = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Re-open the managed dataset in read-only mode.
db = DeepLakeVectorStore(dataset_path=hub_managed_path, overwrite=False, read_only=True)

# from_vector_store derives its storage context from the store it is given, so no
# storage_context is passed. The earlier one wraps the *local* store, not `db`;
# supplying it here would contradict the managed store and clash with the keyword
# that from_vector_store sets itself.
vector_index = VectorStoreIndex.from_vector_store(
    db,
    service_context=service_context,
    show_progress=True,
)

# deep_memory=True is forwarded to the vector store so retrieval uses Deep Memory.
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,
    vector_store_kwargs={"deep_memory": True},
)

In [32]:
# Run the query through the Deep Memory-enabled query engine and print only the
# answer text of the response.
response_vector = query_engine.query(query)
print(response_vector.response)

Paul worked on several different things before college, including writing essays, launching a software project, and working on an online store builder.



Now, let's run a quantitative evaluation on another set of synthetically generated test queries.

In [33]:
# Generate validation queries: a second synthetic set of 40 questions, drawn from
# the same chunks by the same generator as the training set.
validation_questions, validation_relevances = generate_queries(docs, ids, n=40)

# Launch the evaluation function. The recorded output reports recall at several
# cut-offs both with and without the Deep Memory model.
# NOTE(review): the last recorded status check in this notebook shows the training
# job as "pending" - confirm the job has completed before relying on the
# "with model" numbers.
recalls = db._vectorstore.deep_memory.evaluate(
    queries=validation_questions,
    relevance=validation_relevances,
    embedding_function=openai_embeddings.embed_documents,
)
recalls

 70%|███████   | 28/40 [08:35<04:13, 21.13s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 72%|███████▎  | 29/40 [12:46<16:29, 89.91s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 75%|███████▌  | 30/40 [19:41<31:16, 187.64s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 78%|███████▊  | 31/40 [34:18<59:10, 394.51s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 80%|████████  | 32/40 [41:28<54:00, 405.04s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 82%|████████▎ | 33/40 [48:44<48:19, 414.24s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 85%|████████▌ | 34/40 [55:57<41:59, 419.88s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 88%|████████▊ | 35/40 [1:03:11<35:21, 424.25s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 90%|█████████ | 36/40 [1:10:08<28:08, 422.10s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 92%|█████████▎| 37/40 [1:17:23<21:17, 425.77s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 95%|█████████▌| 38/40 [1:24:38<14:17, 428.76s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


 98%|█████████▊| 39/40 [1:31:52<07:10, 430.15s/it]

No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated
No question generated


100%|██████████| 40/40 [1:39:10<00:00, 148.76s/it]


Embedding queries took 2.21 seconds
---- Evaluating without Deep Memory ---- 
Recall@1:	  55.0%
Recall@3:	  72.5%
Recall@5:	  77.5%
Recall@10:	  90.0%
Recall@50:	  100.0%
Recall@100:	  100.0%
---- Evaluating with Deep Memory ---- 
Recall@1:	  42.5%
Recall@3:	  60.0%
Recall@5:	  70.0%
Recall@10:	  85.0%
Recall@50:	  100.0%
Recall@100:	  100.0%


{'with model': {'recall@1': 0.425,
  'recall@3': 0.6,
  'recall@5': 0.7,
  'recall@10': 0.85,
  'recall@50': 1.0,
  'recall@100': 1.0},
 'without model': {'recall@1': 0.55,
  'recall@3': 0.725,
  'recall@5': 0.775,
  'recall@10': 0.9,
  'recall@50': 1.0,
  'recall@100': 1.0}}