In [1]:
!pip install pyzotero chromadb thepipe-api tqdm PyMuPDF



In [2]:
from pyzotero import zotero
from thepipe.scraper import scrape_file
from thepipe.chunker import chunk_by_page
from openai import OpenAI
from tqdm import tqdm
import chromadb
import time
import os
from google.colab import userdata

# Copy credentials from the Colab secret store into environment variables so the
# client constructors below can read them from os.environ.
#
# Never paste literal keys into the notebook, not even in commented-out lines:
# notebook text is shared and committed as-is.
# NOTE(review): literal keys were previously present here in comments; treat
# them as compromised and rotate them with each provider.
REQUIRED_SECRETS = (
    "ZOTERO_USER_ID",
    "ZOTERO_API_KEY",
    "THEPIPE_API_KEY",
    "LLM_SERVER_API_KEY",
    "LLM_SERVER_BASE_URL",  # e.g. an OpenAI-compatible endpoint URL
)

for secret_name in REQUIRED_SECRETS:
    # presumably userdata.get raises if the secret is missing or notebook
    # access has not been granted -- confirm against the Colab docs
    os.environ[secret_name] = userdata.get(secret_name)

In [3]:
# Initialize ChromaDB: a persistent client stored under ./chromadb, with one
# collection that holds the text chunks extracted from the Zotero PDFs.
chroma_client = chromadb.PersistentClient(path="chromadb")
collection = chroma_client.get_or_create_collection(name="zotero_papers")

# Initialize LLM client (OpenAI-compatible endpoint configured via environment)
llm_client = OpenAI(
    base_url=os.environ["LLM_SERVER_BASE_URL"],
    api_key=os.environ["LLM_SERVER_API_KEY"],
)

# Initialize Zotero client for user (use group id and "group" for group libraries).
# Credentials are read with os.environ[...] (not .get) so a missing variable
# fails here with a KeyError instead of passing None into the client.
zot = zotero.Zotero(
    library_id=os.environ["ZOTERO_USER_ID"],
    library_type="user",
    api_key=os.environ["ZOTERO_API_KEY"],
)

In [5]:
from thepipe.scraper import scrape_file
from thepipe.chunker import chunk_by_page
from tqdm import tqdm
import time
import fitz

# Show which fitz module and version were imported.
print(f"fitz path = {fitz.__file__}")
print(f"fitz version = {fitz.__version__}")

# Create 'pdfs' directory if it doesn't exist; downloaded attachments go there.
os.makedirs("pdfs", exist_ok=True)

userid = os.getenv("ZOTERO_USER_ID")
print(f"zotero user id = {userid}")

# Fetch all collections. zot.everything() follows the API's pagination; a bare
# zot.collections() call only returns a single page of results.
collections = zot.everything(zot.collections())
print(f"Number of collections: {len(collections)}")
# Print collection keys and names so the key below can be picked from the list
for col in collections:
    print(f"Collection Name: {col['data']['name']}, Key: {col['key']}")


# Key of the collection to ingest (one of the keys printed above).
COLLECTION_KEY = "GV4MMALE"
# Fetch every item in that collection (everything() pages through all results)
items = zot.everything(zot.collection_items(COLLECTION_KEY))
for item in items:
    print(f"item_key = {item['data']['key']}")
print(f"Number of items: {len(items)}")


# Ingest every PDF attachment in `items`: download it, split it into per-page
# chunks with thepipe, and index the chunk text in the Chroma collection.
#
# Chunk ids are derived from the Zotero item key and the chunk's position, and
# chunks are written with upsert(), so re-running this cell overwrites the same
# entries instead of appending duplicates to the persistent collection.
# NOTE: entries indexed earlier under other ids are not replaced by this scheme.
UPSERT_BATCH_SIZE = 100  # keep each write to the collection comfortably small

for item in tqdm(items):
    item_data = item['data']

    # Only items whose contentType is application/pdf are processed.
    if item_data.get('contentType') != 'application/pdf':
        continue

    item_key = item_data['key']
    filename = item_data.get('filename')

    # Skip attachments without a usable .pdf filename. Files whose name starts
    # with "Dietary" are also deliberately excluded from this run.
    if not filename or not filename.endswith('.pdf') or filename.startswith("Dietary"):
        continue

    # Use only the final path component so a filename containing directory
    # separators cannot point outside pdfs/.
    file_path = os.path.join("pdfs", os.path.basename(filename))

    # Download first, then write: a failed download leaves no empty file behind.
    pdf_bytes = zot.file(item_key)
    with open(file_path, 'wb') as pdf_file:
        pdf_file.write(pdf_bytes)
    tqdm.write(f"Downloaded: {filename}")

    # Scrape the file into one chunk per page
    chunks = scrape_file(file_path, ai_extraction=True, text_only=True, local=True, chunking_method=chunk_by_page)
    tqdm.write(f"Scraped {len(chunks)} chunks from {filename}")

    # Collect the chunks for this file so they can be written in batches.
    documents, metadatas, ids = [], [], []
    for chunk_index, chunk in enumerate(chunks):
        chunk_text = '\n'.join(chunk.texts)
        if not chunk_text.strip():
            # Nothing to retrieve from a blank chunk; do not index it.
            continue
        documents.append(chunk_text)
        metadatas.append({"source": chunk.path})
        ids.append(f"{item_key}:{chunk_index}")

    # Add chunks to collection
    for start in range(0, len(ids), UPSERT_BATCH_SIZE):
        stop = start + UPSERT_BATCH_SIZE
        collection.upsert(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )
    tqdm.write(f"Added {len(documents)} chunks to collection")

fitz path = /usr/local/lib/python3.11/dist-packages/fitz/__init__.py
fitz version = 1.25.3
zotero user id = 16493983
Number of collections: 1
Collection Name: tfp2021, Key: GV4MMALE
item_key = ERWBTXFU
item_key = EB2HPJPU
item_key = TA75FWBI
Number of items: 3


  0%|          | 0/3 [00:00<?, ?it/s]

Downloaded: 04-19-SNAP.pdf
Scraped 12 chunks from 04-19-SNAP.pdf


 67%|██████▋   | 2/3 [00:49<00:24, 24.72s/it]

Added 12 chunks to collection
Downloaded: 2021 - Thrifty Food Plan, 2021.pdf
Scraped 125 chunks from 2021 - Thrifty Food Plan, 2021.pdf


100%|██████████| 3/3 [06:56<00:00, 138.72s/it]

Added 125 chunks to collection





In [9]:
from typing import Dict

query = "How many modeling categories are there in Thrifty Food Plan?"
response = None

def ask(
    query: str,
    n_results: int = 3,
    model: str = "meta-llama/llama-3.1-405b-instruct",
    temperature: float = 0.2,
):
  """Answer `query` with retrieval-augmented generation.

  The `n_results` most relevant chunks are fetched from the module-level Chroma
  `collection`, wrapped in <Document source='...'> tags so the model can cite
  them, and sent together with the question to `llm_client`.

  Args:
    query: The user's question; used both for retrieval and in the prompt.
    n_results: Number of chunks to retrieve as context.
    model: Model identifier passed to the chat-completions endpoint.
    temperature: Sampling temperature passed to the chat-completions endpoint.

  Returns:
    The response object returned by `llm_client.chat.completions.create`.
    This is not a dict: callers read `response.choices[0].message.content`.
  """
  # Query the collection for the chunks most relevant to the question
  results = collection.query(
      query_texts=[query],
      n_results=n_results,
  )

  # Results are nested per query text; index 0 is the single query sent above.
  # Each retrieved chunk is tagged with its source so the answer can cite it.
  context = "".join(
      f"<Document source='{metadata['source']}'>\n{text}\n</Document>\n"
      for metadata, text in zip(results['metadatas'][0], results['documents'][0])
  )

  # Prepare messages for the LLM server
  messages = [
      {"role": "system", "content": "You are a helpful scientific assistant. Use the provided context to answer the user's question."},
      {"role": "user", "content": f"Context:\n{context}\nUser query: {query}"}
  ]

  # Call the chat-completions API and hand the raw response back to the caller
  return llm_client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
  )

In [10]:
# Example query for retrieval-augmented generation
query = "What is Thrifty Food Plan?"

# Run the pipeline, then take the answer text from the first returned choice
response = ask(query)
first_choice = response.choices[0]
response_text = first_choice.message.content
print("LLM generation:", response_text)

LLM generation: The Thrifty Food Plan is the United States Department of Agriculture's (USDA) lowest-cost food plan that describes the cost of a healthy, practical diet on a limited budget. It is used to determine the maximum amount of benefits that participants in the Supplemental Nutrition Assistance Program (SNAP) can receive. The plan is updated every five years to reflect changes in food prices, dietary recommendations, and other factors. The Thrifty Food Plan, 2021 is the latest update, which represents a reevaluation of the existing assumptions and methods used in past editions, informed by input from subject-matter experts and a review of existing literature.


In [12]:
# Ask about the Dietary Guidelines and show the model's answer
query = "What is Dietary Guidelines for Americans?"
response = ask(query)
answer_message = response.choices[0].message
response_text = answer_message.content
print("LLM generation:", response_text)

LLM generation: The Dietary Guidelines for Americans is a set of recommendations for healthy eating patterns for Americans aged 2 and older. It is published jointly by the U.S. Department of Agriculture (USDA) and the U.S. Department of Health and Human Services (HHS) every five years. The guidelines provide evidence-based recommendations for healthy eating patterns, including the types and amounts of foods to consume, as well as foods to limit or avoid. The guidelines aim to promote healthy eating habits and reduce the risk of chronic diseases such as obesity, heart disease, and type 2 diabetes. The 2020-2025 edition of the Dietary Guidelines for Americans is referenced in the provided text.


In [13]:
# Ask what a Modeling Category is and show the model's answer
query = "What is a Modeling Category in the context of Thrifty Food Plan?"
response = ask(query)
top_choice = response.choices[0]
response_text = top_choice.message.content
print("LLM generation:", response_text)

LLM generation: In the context of the Thrifty Food Plan, a Modeling Category refers to a specific group of foods and beverages that are used as inputs in the mathematical optimization model to generate the Thrifty Food Plan Market Baskets. These categories are created based on the food group and subgroups in the Dietary Guidelines for Americans, 2020-2025, and are used to represent a nutritious diet. The Modeling Categories are then used to create the Market Basket Categories, which are the actual categories of foods and beverages that make up the Thrifty Food Plan Market Baskets. For example, the Modeling Category "processed soy products" is part of the larger Market Basket Category "nuts, seeds, and soy products".


In [14]:
# Ask how many Modeling Categories exist and show the model's answer
query = "How many Modeling Categories are there in the Thrifty Food Plan?"
response = ask(query)
reply = response.choices[0].message
response_text = reply.content
print("LLM generation:", response_text)

LLM generation: The provided context does not explicitly state the number of Modeling Categories in the Thrifty Food Plan. However, it does mention that the Market Basket Categories were created based on the Modeling Categories and in consideration of the food group and subgroups in the Dietary Guidelines for Americans, 2020-2025.

To find the exact number of Modeling Categories, you may need to refer to Appendix 1 of the Thrifty Food Plan document, which is titled "Thrifty Food Plan, 2021 Modeling Categories and Description of Foods" (page 53). This appendix likely provides a detailed list and description of the Modeling Categories used in the Thrifty Food Plan.
