In [96]:
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

True

In [100]:
# Uncomment if you are in a Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse  # pip install llama-parse

parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

file_extractor = {".pdf": parser}
documents = LlamaParse(result_type="markdown").load_data("../data/Gmail - ASW followup.pdf")

Started parsing the file under job_id 29fc98f8-6628-455b-a423-9a9d0aeb2599


In [101]:
print(documents[0].text)

# Email Conversation

## Gmail - ASW followup

From: Mehul Khetrapal <mehulkhetrapal1@gmail.com>

Subject: ASW followup

Messages: 5

Richy Chen <richychentl@gmail.com> - Sat, Feb 3, 2024 at 11:01 AM

To: Mehul Khetrapal <mehulkhetrapal1@gmail.com>

Hey Mehul,
Hope you had a fruitful ASW.
Here are our standard terms for manufacturing hoodies.

|Cost|$10 per garment|
|---|---|
|Minimum order quantity|1000|
|Turnaround|3 months|
|Design service|Included|

Best,
Richy

Mehul Khetrapal <mehulkhetrapal1@gmail.com> - Sat, Feb 3, 2024 at 11:02 AM

To: Richy Chen <richychentl@gmail.com>

Hey Richy,
We are aiming for $7.50 per unit production. Can we remove the design service to get the price lower?
[Quoted text hidden]
--
Mehul Khetrapal
University of Southern California ‘20 ‘21
B.S. Computational Neuroscience
M.S. Biomedical Engineering
https://www.linkedin.com/in/mehul-khetrapal/

Richy Chen <richychentl@gmail.com> - Sat, Feb 3, 2024 at 11:04 AM

To: Mehul Khetrapal <mehulkhetrapal1@gmail.co

In [102]:
import re 
from datetime import datetime

emails = re.split(".*<.*>.*\n*To:.*<.*>", documents[0].text)[1:]
headers = re.findall(".*<.*>.*\n*To:.*<.*>", documents[0].text)

emails_with_metadata = []

for email, header in zip(emails, headers):
    sender = re.findall(".*<.*>", header)[0]
    receiver = re.findall("To:.*<.*>", header)[0][4:]
    date = re.findall("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (0?[1-9]|[1-3][0-9]), ([2][0-9]+) at ([0-1]?[0-9]|2[0-3]):([0-5][0-9]+) (AM|PM)", header)[0]
    date = datetime.strptime(f'{date[0]} {date[1]}, {date[2]} {date[3]}:{date[4]} {date[5]}', '%b %d, %Y %I:%M %p')
    emails_with_metadata.append((email, {"sender": sender, "receiver": receiver, "date": date.strftime("%m/%d/%Y, %H:%M")}))

In [103]:
from llama_index.text_splitter import SentenceSplitter

text_splitter = SentenceSplitter()

In [104]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(emails_with_metadata):
    cur_text_chunks = text_splitter.split_text(doc[0])
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [105]:
text_chunks

['Hey Mehul,\nHope you had a fruitful ASW.\nHere are our standard terms for manufacturing hoodies.\n\n|Cost|$10 per garment|\n|---|---|\n|Minimum order quantity|1000|\n|Turnaround|3 months|\n|Design service|Included|\n\nBest,\nRichy',
 'Hey Richy,\nWe are aiming for $7.50 per unit production. Can we remove the design service to get the price lower?\n[Quoted text hidden]\n--\nMehul Khetrapal\nUniversity of Southern California ‘20 ‘21\nB.S. Computational Neuroscience\nM.S. Biomedical Engineering\nhttps://www.linkedin.com/in/mehul-khetrapal/',
 "We can remove the design service to give you a discount. I'm not sure we can do 7.50. Let me get back to you on that.\nBest,\nRichy\n[Quoted text hidden]",
 'OK, we can do $8\n[Quoted text hidden]\n\nMehul Khetrapal <mehulkhetrapal1@gmail.com> - Sat, Feb 3, 2024 at 11:08 AM\n\nLink to the conversation\n---\n2/3/24, 11:09 AM                                                                     Gmail - ASW followup\n\nTo: Richy Chen <richychentl@gmail

In [106]:
from llama_index.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = emails_with_metadata[doc_idxs[idx]]
    node.metadata = src_doc[1]
    nodes.append(node)

In [107]:
import requests
import os
import json

upload_text = [i.text for i in nodes]

headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
embedding_endpoint = "https://api-inference.huggingface.co/pipeline/feature-extraction/BAAI/bge-small-en-v1.5"
embedding_payload = {
    "inputs": upload_text,
    "options": {
        "wait_for_model": True
    }
}
embedding_response = requests.post(embedding_endpoint, headers=headers, json=embedding_payload)
embeddings = list(json.loads(embedding_response.text))

In [108]:
for node, embedding in zip(nodes, embeddings):
    node.embedding = embedding
nodes

[TextNode(id_='81feeb58-a9d4-48c3-9198-970c8ae4781c', embedding=[-0.03649403899908066, 0.0162503719329834, 0.014731932431459427, -0.006188157480210066, 0.02331206016242504, 0.025560304522514343, 0.02934805490076542, -0.0026194658130407333, -0.02901148982346058, -0.008420626632869244, 0.02072172425687313, -0.03085785172879696, -0.03155037760734558, 0.019041717052459717, 0.09139528125524521, 0.006646739784628153, 0.056629374623298645, -0.043623365461826324, -0.06624197959899902, 0.04085138812661171, 0.06008777767419815, -0.0566716343164444, -0.05115705728530884, -0.024586331099271774, -0.007946134544909, -0.05128130316734314, -0.015880916267633438, -0.015058241784572601, -0.04831197112798691, -0.1511586457490921, 0.01870417781174183, -0.012419324368238449, 0.02714277058839798, 0.012271657586097717, -0.001939713372848928, 0.011911867186427116, -0.0023973186034709215, -0.01654396951198578, 0.01308090053498745, 0.0290573351085186, -0.03319508582353592, 0.03849608823657036, -0.02967843785881

In [109]:
from llama_index.vector_stores import AstraDBVectorStore

astra_db_store = AstraDBVectorStore(
    token=os.getenv('ASTRA_DB_APPLICATION_TOKEN'),
    api_endpoint=os.getenv('ASTRA_DB_API_ENDPOINT'),
    collection_name="midas_collection",
    embedding_dimension=384,
)

astra_db_store.add(nodes)



  astra_db_store = AstraDBVectorStore(


['81feeb58-a9d4-48c3-9198-970c8ae4781c',
 '99f36e66-73a8-48e7-a97e-9cd1c0e08ea6',
 'd7518fa0-0517-46d6-9f36-ae850a57bd3f',
 'e8051152-c52e-410f-835a-dc8b70022551']

In [110]:
nodes

[TextNode(id_='81feeb58-a9d4-48c3-9198-970c8ae4781c', embedding=[-0.03649403899908066, 0.0162503719329834, 0.014731932431459427, -0.006188157480210066, 0.02331206016242504, 0.025560304522514343, 0.02934805490076542, -0.0026194658130407333, -0.02901148982346058, -0.008420626632869244, 0.02072172425687313, -0.03085785172879696, -0.03155037760734558, 0.019041717052459717, 0.09139528125524521, 0.006646739784628153, 0.056629374623298645, -0.043623365461826324, -0.06624197959899902, 0.04085138812661171, 0.06008777767419815, -0.0566716343164444, -0.05115705728530884, -0.024586331099271774, -0.007946134544909, -0.05128130316734314, -0.015880916267633438, -0.015058241784572601, -0.04831197112798691, -0.1511586457490921, 0.01870417781174183, -0.012419324368238449, 0.02714277058839798, 0.012271657586097717, -0.001939713372848928, 0.011911867186427116, -0.0023973186034709215, -0.01654396951198578, 0.01308090053498745, 0.0290573351085186, -0.03319508582353592, 0.03849608823657036, -0.02967843785881

# Retrieval

In [111]:
query_str = "Can you tell me the final price agreed upon by the buyer and seller?"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_TOKEN')}"}
embedding_endpoint = "https://api-inference.huggingface.co/pipeline/feature-extraction/BAAI/bge-small-en-v1.5"
embedding_payload = {
    "inputs": [query_str],
    "options": {
        "wait_for_model": True
    }
}
embedding_response = requests.post(embedding_endpoint, headers=headers, json=embedding_payload)
embeddings = list(json.loads(embedding_response.text))

In [118]:
from llama_index.vector_stores import VectorStoreQuery

query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"
vector_store_query = VectorStoreQuery(
    query_embedding= embeddings[0], similarity_top_k=4, mode=query_mode
)

In [119]:
from llama_index.vector_stores import AstraDBVectorStore
vector_store = AstraDBVectorStore(
    token=os.getenv('ASTRA_DB_APPLICATION_TOKEN'),
    api_endpoint=os.getenv('ASTRA_DB_API_ENDPOINT'),
    collection_name="midas_collection",
    embedding_dimension=384,
)

  vector_store = AstraDBVectorStore(


In [120]:
query_result = vector_store.query(vector_store_query)
print(query_result.nodes[0].get_content())

We can remove the design service to give you a discount. I'm not sure we can do 7.50. Let me get back to you on that.
Best,
Richy
[Quoted text hidden]


In [121]:
query_result.nodes

[TextNode(id_='d7518fa0-0517-46d6-9f36-ae850a57bd3f', embedding=None, metadata={'sender': 'Richy Chen <richychentl@gmail.com>', 'receiver': 'Mehul Khetrapal <mehulkhetrapal1@gmail.com>', 'date': '02/03/2024, 11:04'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="We can remove the design service to give you a discount. I'm not sure we can do 7.50. Let me get back to you on that.\nBest,\nRichy\n[Quoted text hidden]", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='99f36e66-73a8-48e7-a97e-9cd1c0e08ea6', embedding=None, metadata={'sender': 'Mehul Khetrapal <mehulkhetrapal1@gmail.com>', 'receiver': 'Richy Chen <richychentl@gmail.com>', 'date': '02/03/2024, 11:02'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Hey Richy,\nWe are aiming for $7.50 per unit production. Can we remove the design servi