In [8]:
import os
import pandas as pd
import weaviate
from langchain_community.document_loaders import CSVLoader
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain_ollama import OllamaEmbeddings

In [9]:
# Root folder for the notebook's data files, relative to the working directory.
DATA_DIR = '../data'

# Name of the sub-folder under DATA_DIR that is created below.
UPLOAD_DIR = "uploads"

UPLOAD_PATH = os.path.join(DATA_DIR, UPLOAD_DIR)

# exist_ok=True makes re-running this cell a no-op when the folder already exists.
os.makedirs(UPLOAD_PATH, exist_ok=True)

In [10]:
# Location of the cheese dataset inside DATA_DIR.
CHEESES_CSV_FILE_PATH = os.path.join(DATA_DIR, "cheeses.csv")
# Weaviate index and tenant names; both are handed to the vector store in later cells.
INDEX_NAME = 'cheeses'
TENANT_NAME = 'cheeses'

# Loaded with pandas only to preview the columns here; the documents that get
# indexed are read separately from the same file by CSVLoader.
df = pd.read_csv(CHEESES_CSV_FILE_PATH)

display(df.head())

Unnamed: 0,cheese,url,milk,country,region,family,type,fat_content,calcium_content,texture,rind,color,flavor,aroma,vegetarian,vegan,synonyms,alt_spellings,producers
0,Aarewasser,https://www.cheese.com/aarewasser/,cow,Switzerland,,,semi-soft,,,buttery,washed,yellow,sweet,buttery,False,False,,,Jumi
1,Abbaye de Belloc,https://www.cheese.com/abbaye-de-belloc/,sheep,France,Pays Basque,,"semi-hard, artisan",,,"creamy, dense, firm",natural,yellow,burnt caramel,lanoline,True,False,Abbaye Notre-Dame de Belloc,,
2,Abbaye de Belval,https://www.cheese.com/abbaye-de-belval/,cow,France,,,semi-hard,40-46%,,elastic,washed,ivory,,aromatic,False,False,,,
3,Abbaye de Citeaux,https://www.cheese.com/abbaye-de-citeaux/,cow,France,Burgundy,,"semi-soft, artisan, brined",,,"creamy, dense, smooth",washed,white,"acidic, milky, smooth","barnyardy, earthy",False,False,,,
4,Abbaye de Tamié,https://www.cheese.com/tamie/,cow,France,Savoie,,"soft, artisan",,,"creamy, open, smooth",washed,white,"fruity, nutty","perfumed, pungent",False,False,,"Tamié, Trappiste de Tamie, Abbey of Tamie",


In [11]:
# Load the CSV as documents; source_column="url" makes each row's url the
# document's metadata["source"] (visible in the records printed below).
loader = CSVLoader(file_path=CHEESES_CSV_FILE_PATH, source_column="url")
data = loader.load()

# Sanity check: show the first two documents and the total number loaded.
for record in data[:2]:
    print(record)
display(len(data))

page_content='cheese: Aarewasser
url: https://www.cheese.com/aarewasser/
milk: cow
country: Switzerland
region: NA
family: NA
type: semi-soft
fat_content: NA
calcium_content: NA
texture: buttery
rind: washed
color: yellow
flavor: sweet
aroma: buttery
vegetarian: FALSE
vegan: FALSE
synonyms: NA
alt_spellings: NA
producers: Jumi' metadata={'source': 'https://www.cheese.com/aarewasser/', 'row': 0}
page_content='cheese: Abbaye de Belloc
url: https://www.cheese.com/abbaye-de-belloc/
milk: sheep
country: France
region: Pays Basque
family: NA
type: semi-hard, artisan
fat_content: NA
calcium_content: NA
texture: creamy, dense, firm
rind: natural
color: yellow
flavor: burnt caramel
aroma: lanoline
vegetarian: TRUE
vegan: FALSE
synonyms: Abbaye Notre-Dame de Belloc
alt_spellings: NA
producers: NA' metadata={'source': 'https://www.cheese.com/abbaye-de-belloc/', 'row': 1}


1187

In [12]:
# Connect to a local Weaviate instance using the client's default settings.
# NOTE(review): no matching weaviate_client.close() is visible in this notebook —
# confirm the connection is released when the session ends.
weaviate_client = weaviate.connect_to_local()

# Embedding model shared by indexing and querying.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [13]:
# Smoke test: embed one question and print the first three components of the
# resulting vector to confirm the embedding model responds.
input_text = "Where is Provolone from?"
vector = embeddings.embed_query(input_text)
print(vector[:3])

[0.043801915, 0.040308535, -0.142904]


In [14]:
def update_vector_store(client, embeddings, documents, tenant=None, index_name=None):
    """Embed `documents` and store them in a Weaviate index.

    Args:
        client: Weaviate client handed to the vector store.
        embeddings: Embedding model used to vectorise the documents.
        documents: Documents to index.
        tenant: Optional tenant to write into; passed through unchanged.
        index_name: Index to write into. When omitted, the notebook-level
            INDEX_NAME is used, which is what this function always used before.

    Returns:
        The WeaviateVectorStore returned by `from_documents`.

    NOTE(review): every call sends `documents` through `from_documents` again;
    confirm whether re-running the ingest cell duplicates objects in the index.
    """
    # Resolve the default at call time so the current INDEX_NAME is picked up.
    target_index = INDEX_NAME if index_name is None else index_name

    vector_store = WeaviateVectorStore.from_documents(
        documents,
        embeddings,
        client=client,
        index_name=target_index,
        tenant=tenant,
    )

    print("✅ Knowledge base updated successfully!")
    return vector_store


In [15]:
# Index every loaded cheese document under the configured tenant and keep the
# resulting store for the query cells that follow.
db = update_vector_store(
    client=weaviate_client,
    embeddings=embeddings,
    documents=data,
    tenant=TENANT_NAME,
)

2025-Mar-15 03:53 AM - langchain_weaviate.vectorstores - INFO - Tenant cheeses does not exist in index cheeses. Creating tenant.


✅ Knowledge base updated successfully!


In [16]:
# Query the store directly (no LLM involved) to inspect what retrieval returns.
query = "Soft cheese"
docs = db.similarity_search(query, tenant=TENANT_NAME)

# Print the full text of each result
for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    # (slice with doc.page_content[:100] for a short preview instead)
    print(doc.page_content)


Document 1:
cheese: Bath Soft Cheese
url: https://www.cheese.com/bath-soft/
milk: cow
country: England
region: South West England
family: Brie
type: soft
fat_content: NA
calcium_content: NA
texture: creamy
rind: bloomy
color: ivory
flavor: citrusy, lemony, mushroomy
aroma: aromatic, grassy
vegetarian: NA
vegan: NA
synonyms: NA
alt_spellings: Bath Soft
producers: The Bath Soft Cheese Co.

Document 2:
cheese: Bath Soft Cheese Truffled
url: https://www.cheese.com/bath-soft-cheese-truffled/
milk: cow
country: United Kingdom
region: NA
family: Brie
type: soft
fat_content: NA
calcium_content: NA
texture: buttery, soft-ripened
rind: NA
color: white
flavor: NA
aroma: NA
vegetarian: NA
vegan: NA
synonyms: NA
alt_spellings: NA
producers: The Bath Soft Cheese Co.

Document 3:
cheese: Brefu Bach
url: https://www.cheese.com/brefu-bach/
milk: sheep
country: Wales
region: NA
family: NA
type: soft
fat_content: NA
calcium_content: NA
texture: soft
rind: NA
color: golden yellow
flavor: NA
aroma: NA
veg

In [22]:
from pydantic import BaseModel
from typing import List, Dict, Optional

from langchain import hub
from langchain_ollama import ChatOllama


# Prompt template fetched from the LangChain hub; generate() invokes it with
# "question" and "context" inputs.
prompt = hub.pull("rlm/rag-prompt")


class State(BaseModel):
    """Fields carried through one retrieve-then-generate round trip.

    NOTE(review): the helpers in this notebook build plain dicts with these keys
    and read them with state["..."]; no State instance is created in the visible
    cells, so the class currently acts as a description of that dict's shape.
    """
    # The user's question.
    question: str
    # Retrieved documents, or None before retrieval has run.
    # NOTE(review): retrieve() stores the objects returned by similarity_search
    # here and generate() reads their .page_content — confirm List[Dict] is the
    # intended element type.
    context: Optional[List[Dict]]
    # The generated answer text (callers start it as '').
    answer: str
    # Tenant name passed to similarity_search.
    tenant: str


# Chat model that generate() calls to write the final answer.
# temperature=0 — presumably chosen to keep answers as repeatable as possible.
llm = ChatOllama(
    model="llama3.2:3b",
    temperature=0,
)


def retrieve(db, state: State):
    """Look up the documents most similar to the question held in `state`.

    Args:
        db: Vector store exposing `similarity_search(query, tenant=...)`.
        state: Mapping with at least the "question" and "tenant" keys.

    Returns:
        A dict with a single "context" key holding the search results.
    """
    question = state["question"]
    tenant_name = state['tenant']
    matches = db.similarity_search(question, tenant=tenant_name)
    return {"context": matches}


def generate(state: State):
    """Answer the question in `state` using its retrieved documents.

    Args:
        state: Mapping with "question" and "context" keys. "context" is an
            iterable of objects exposing `.page_content`, or None/empty when
            nothing has been retrieved.

    Returns:
        A dict with a single "answer" key holding the model's reply text.
    """
    # "context" is None until retrieve() has run (callers in this notebook
    # initialise it that way); fall back to no documents instead of failing
    # with a TypeError while iterating None.
    retrieved_docs = state["context"] or []
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    messages = prompt.invoke(
        {"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


def retrieve_and_generate(user_prompt):
    """Run the full RAG round trip for one question and return the answer text.

    Builds a fresh state for `user_prompt` under TENANT_NAME, fills in the
    retrieved context from the notebook-level `db`, then asks the LLM.
    """
    state = {'question': user_prompt, 'context': None, 'answer': '', 'tenant': TENANT_NAME}
    state['context'] = retrieve(db, state)['context']
    return generate(state)['answer']

def print_rag(user_prompt, rag_answer):
    """Print the question and its answer, each prefixed by a label and a tab."""
    labelled_lines = (
        ("Question: \t{}", user_prompt),
        ("Answer: \t{}", rag_answer),
    )
    for template, value in labelled_lines:
        print(template.format(value))



In [23]:
# Step through the pipeline by hand (the same steps retrieve_and_generate runs)
# so the retrieved documents can be inspected before the answer is generated.
state = {'question': "soft cheese", 'context': None, 'answer': '', 'tenant': TENANT_NAME}

context = retrieve(db, state)
# Print the first 100 characters of each result
for i, doc in enumerate(context['context']):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:100] + "...")
state['context'] = context['context']

print(generate(state)['answer'])


Document 1:
cheese: Bath Soft Cheese
url: https://www.cheese.com/bath-soft/
milk: cow
country: England
region: S...

Document 2:
cheese: Bath Soft Cheese Truffled
url: https://www.cheese.com/bath-soft-cheese-truffled/
milk: cow
c...

Document 3:
cheese: Brefu Bach
url: https://www.cheese.com/brefu-bach/
milk: sheep
country: Wales
region: NA
fam...

Document 4:
cheese: Bix
url: https://www.cheese.com/bix/
milk: cow
country: United Kingdom
region: NA
family: NA...
The Bath Soft Cheese is a type of soft cheese originating from South West England, made from cow's milk. It has a creamy texture and a flavor profile that includes citrusy, lemony, and mushroomy notes. The Bath Soft Cheese Co. produces this cheese.


In [24]:
# Ask about one property (hardness) of a single named cheese.
ques = "Is Provolone a hard or soft cheese?"
ans = retrieve_and_generate(ques)
print_rag(ques, ans)

Question: 	Is Provolone a hard or soft cheese?
Answer: 	Provolone is a semi-hard cheese. It has a firm texture and can be described as having a grainy texture in some variations, such as Provolone del Monaco. Its texture is elastic and stringy in other variations like Provolone Mandarino Gran Riserva.


In [25]:
# Ask for a comparison across cheeses; in the recorded run the model declines
# because the retrieved context gives no hardness ranking.
ques = "Which cheeses are the hardest?"
ans = retrieve_and_generate(ques)
print_rag(ques, ans)

Question: 	Which cheeses are the hardest?
Answer: 	I don't know which cheeses are the hardest. The provided context only mentions the texture of some cheeses as "hard" or "semi-hard", but it doesn't provide a ranking or comparison of their hardness levels. St Tola Hard Cheese is listed as a hard cheese, but its relative hardness compared to others is not specified.


In [26]:
# Ask for the origin of a single named cheese.
ques = "Where is Provolone from?"
ans = retrieve_and_generate(ques)
print_rag(ques, ans)

Question: 	Where is Provolone from?
Answer: 	Provolone cheese originates from Italy, specifically from the Po Valley region and other regions such as Naples, Valpadana, and Veneto. The exact origin of Provolone can vary depending on the specific type or production area. Provolone is a semi-hard, artisanal cheese made from cow's milk.


In [27]:
# Ask about sodium, which is not among the columns shown by df.head(); in the
# recorded run the model answers that it does not know.
ques = "I need a cheese with low sodium because I have high blood pressure"
ans = retrieve_and_generate(ques)
print_rag(ques, ans)

Question: 	I need a cheese with low sodium because I have high blood pressure
Answer: 	I don't know the sodium content of the cheese you're looking for. The provided context only includes information about Sartori Reserve Black Pepper BellaVitano, Bra Duro DOP, Stella Black Pepper Romano, and San Simón DOP, but not their sodium levels. If you need to check sodium content, I recommend checking a reliable nutrition source or the manufacturer's website for more detailed information.


In [28]:
# Debug aid: dump the vector store's instance attributes.
print(vars(db))

{'_client': <weaviate.client.WeaviateClient object at 0x164dce4b0>, '_index_name': 'cheeses', '_embedding': OllamaEmbeddings(model='nomic-embed-text', base_url=None, client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None), '_text_key': 'text', '_query_attrs': ['text', 'source', 'row'], 'relevance_score_fn': <function _default_score_normalizer at 0x165e57380>, '_collection': <weaviate.collections.collection.sync.Collection object at 0x165d32960>, '_multi_tenancy_enabled': True}


## Previously stored documents

In [None]:
# Attach to the index by name without passing any documents, so nothing is
# re-ingested; text_key matches the '_text_key' shown in the store dump above.
# NOTE(review): that dump reports multi-tenancy as enabled, so searches through
# vec_db presumably need tenant=TENANT_NAME — confirm.
vec_db = WeaviateVectorStore(
    client=weaviate_client,
    embedding=embeddings,
    index_name=INDEX_NAME,
    text_key='text',
)