In [1]:
import json
from langchain.docstore.document import Document
import glob
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [2]:
def load_okw_json(path):
    documents = []
    for file_path in glob.glob(f'{path}/*'):
        with open(file_path, 'r') as f:
            data = json.load(f)
            content = create_document_content(data)
            doc = Document(page_content=content, metadata={'title':data.get('title', '')})
            documents.append(doc)
    return documents
        

In [3]:
def create_document_content(data):
    content_parts = []
    content_parts.append(f"Title: {data.get('title', '')}")
    content_parts.append(f"Description: {data.get('description', '')}")
    content_parts.append(f"Keywords: {', '.join(data.get('keywords', []))}")
    
    # Process Inventory Atoms
    inventory_atoms = data.get('inventory-atoms', [])
    if inventory_atoms:
        inventory_descriptions = []
        for atom in inventory_atoms:
            atom_desc = f"{atom.get('identifier', '')}: {atom.get('description', '')}"
            inventory_descriptions.append(atom_desc)
        content_parts.append(f"Inventory Atoms: {', '.join(inventory_descriptions)}")
    
    # Process Product Atoms
    product_atoms = data.get('product-atoms', [])
    if product_atoms:
        product_descriptions = []
        for atom in product_atoms:
            atom_desc = f"{atom.get('identifier', '')}: {atom.get('description', '')}"
            product_descriptions.append(atom_desc)
        content_parts.append(f"Product Atoms: {', '.join(product_descriptions)}")
    
    # Process Tool List Atoms
    tool_list_atoms = data.get('tool-list-atoms', [])
    if tool_list_atoms:
        tool_descriptions = []
        for atom in tool_list_atoms:
            atom_desc = f"{atom.get('identifier', '')}: {atom.get('description', '')}"
            tool_descriptions.append(atom_desc)
        content_parts.append(f"Tool List Atoms: {', '.join(tool_descriptions)}")
    
    # Process Processes
    processes = data.get('processes', [])
    if processes:
        process_descriptions = [atom.get('identifier', '') for atom in processes]
        content_parts.append(f"Processes: {', '.join(process_descriptions)}")
    
    return '\n'.join(content_parts)

In [4]:
directory_path = './OKWs/'
documents = load_okw_json(directory_path)


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 30
    )

splits = text_splitter.split_documents(documents)

In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings
)

retriever = vectorstore.as_retriever(search_kwargs={'k': 5})


In [12]:
question = 'Is there any bakery near me that can give me organic flour? Or that produces fermented bread.'

In [18]:
initial_docs = retriever.get_relevant_documents(question)
documents_texts = [doc.page_content for doc in initial_docs]

data = {
    'model': 'jina-reranker-v2-base-en',
    'query': question,
    'top_n': 5,
    'documents': documents_texts
}


In [14]:
import requests

url = 'https://api.jina.ai/v1/rerank'
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer jina_049cee63961b4bbaa869766604db4f30dSAnSs8PZbzANy4jBtbtYz-KIOea'
}

response = requests.post(url, headers=headers, json=data)
rerank_results = response.json()



In [15]:
# Process the reranker response
if 'results' in rerank_results:
    reranked_docs = []
    for result in rerank_results['results']:
        index = result['index']
        reranked_doc = initial_docs[index]
        reranked_docs.append(reranked_doc)
else:
    # Handle error
    print("Error in reranking:", rerank_results)
    reranked_docs = initial_docs  # Fallback to initial docs

Error in reranking: {'detail': 'Model jina-reranker-v2-base-en not found'}


In [16]:
# Option 1: Simple list of entities
entities = []
for doc in reranked_docs:
    title = doc.metadata.get('title', 'No Title')
    description = doc.page_content
    entities.append({'title': title, 'description': description})

print("Entities matching your requirements:")
for entity in entities:
    print(f"Title: {entity['title']}")
    print(f"Description: {entity['description']}\n")


Entities matching your requirements:
Title: Bread & Butter Bakery
Description: Description: A bakery specializing in sourdough bread and handcrafted pastries using locally

Title: Bread & Butter Bakery
Description: Keywords: bakery, artisanal bread, pastries, local ingredients

Title: Bread & Butter Bakery
Description: Inventory Atoms: Organic Flour: Flour milled from organically grown grains.

Title: Bread & Butter Bakery
Description: Title: Bread & Butter Bakery

Title: Bread & Butter Bakery
Description: Product Atoms: Sourdough Loaves: Bread made through natural fermentation.

