In [None]:
import os
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# 1. Embedding model: the sentence-transformers MiniLM checkpoint, loaded
#    through LangChain's HuggingFace wrapper.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Product data: the CSV is parsed as semicolon-separated, ISO-8859-1 text.
product_csv_options = {"sep": ";", "encoding": "ISO-8859-1"}
df = pd.read_csv("data/product.csv", **product_csv_options)

# 3. Convert each row to a LangChain Document
def _product_page_content(row):
    """Render one product row as newline-separated ``Label: value`` text.

    Args:
        row: Mapping-like row (for example the pandas Series yielded by
            ``DataFrame.iterrows``) exposing the product columns listed below.

    Returns:
        The string used as the Document's ``page_content``: one
        ``Label: value`` pair per line, without any indentation, with
        leading/trailing whitespace stripped from the whole text.
    """
    # (label, column) pairs in the order they appear in the text. The "Form",
    # "Price" and "Stock" labels are fed from the FormId, Ppa and MaxThreshold
    # columns respectively.
    fields = [
        ("ProductId", "ProductId"),
        ("Product Name", "ProductName"),
        ("DCI", "DCI"),
        ("Dosage", "Dosage"),
        ("Form", "FormId"),
        ("Brand", "Brand"),
        ("Laboratory", "Laboratory"),
        ("Pharmacological Class", "PharmacologicalClass"),
        ("Therapeutic Class", "TherapeuticClass"),
        ("Refundable", "Refundable"),
        ("Price", "Ppa"),
        ("Stock", "MaxThreshold"),
    ]
    # Joining the lines explicitly keeps source-code indentation out of the
    # text. A triple-quoted f-string written inside the loop body would embed
    # four leading spaces on every line after the first, because .strip() only
    # trims the two ends of the string.
    return "\n".join(f"{label}: {row[column]}" for label, column in fields).strip()


# NOTE(review): cells that are empty in the CSV are read by pandas as NaN and
# are rendered as the literal text "nan" -- confirm that this is acceptable.
# NOTE: an index previously saved on disk was embedded from the indented
# variant of this text; delete the index folder to re-embed with the cleaned
# text.
raw_documents = []
# Stringified DataFrame index label of each row, aligned with raw_documents.
ids = []

for i, row in df.iterrows():
    raw_documents.append(
        Document(
            page_content=_product_page_content(row),
            # ProductId is not stored in the metadata (it was disabled in the
            # original cell); it is still present in page_content.
            metadata={
                "Ppa": row["Ppa"],
                "Stock": row["MaxThreshold"],
                "DCI": row["DCI"],
                "ProductName": row["ProductName"],
            },
        )
    )
    ids.append(str(i))

# 4. Split documents for embedding
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(raw_documents)

# 5. Path to save/load FAISS index
index_path = "vectorial_dbs/faiss_index_"

# 6. Load or build FAISS vector store
# With the default index name, save_local() writes "index.faiss" and
# "index.pkl" into the folder. Checking for those files (rather than for the
# folder alone) means an empty or partially written folder triggers a rebuild
# instead of a load failure.
index_files = [
    os.path.join(index_path, file_name) for file_name in ("index.faiss", "index.pkl")
]
if all(os.path.isfile(file_path) for file_path in index_files):
    # SECURITY: load_local() unpickles "index.pkl"; unpickling can execute
    # arbitrary code, so allow_dangerous_deserialization=True is only
    # appropriate for index folders produced by this notebook. Never point
    # index_path at files from an untrusted source.
    # NOTE: an existing index is reused as-is and is not compared with
    # `documents`; delete the folder to force a rebuild after the data changes.
    vector = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    print("✅ FAISS index loaded from disk.")
else:
    vector = FAISS.from_documents(documents, embeddings)
    vector.save_local(index_path)
    print("✅ FAISS index created and saved to disk.")



✅ FAISS index created and saved to disk.


In [None]:
import os
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# 1. Embedding model: the sentence-transformers MiniLM checkpoint, loaded
#    through LangChain's HuggingFace wrapper.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Read the order lines and rewrite OrderDate as ISO "YYYY-MM-DD" strings:
#    the column is parsed to datetimes and formatted straight back to text.
df_orders = pd.read_csv("data/orderlines_with_status.csv").assign(
    OrderDate=lambda frame: pd.to_datetime(frame['OrderDate']).dt.strftime('%Y-%m-%d')
)

# 3. Convert each row to a LangChain Document
def _order_page_content(row):
    """Render one order line as newline-separated ``Label: value`` text.

    Args:
        row: Mapping-like row (for example the pandas Series yielded by
            ``DataFrame.iterrows``) exposing the order columns listed below.

    Returns:
        The string used as the Document's ``page_content``: one
        ``Label: value`` pair per line, without any indentation, with
        leading/trailing whitespace stripped from the whole text.
    """
    # (label, column) pairs in the order they appear in the text.
    fields = [
        ("Order ID", "RefOrderId"),
        ("Product", "ProductName"),
        ("Quantity", "Qty"),
        ("Amount", "TotalAmount"),
        ("Order Date", "OrderDate"),
        ("Status", "status"),
    ]
    # Joining the lines explicitly keeps source-code indentation out of the
    # text. A triple-quoted f-string written inside the loop body would embed
    # four leading spaces on every line after the first, because .strip() only
    # trims the two ends of the string.
    return "\n".join(f"{label}: {row[column]}" for label, column in fields).strip()


# NOTE: an index previously saved on disk was embedded from the indented
# variant of this text; delete the index folder to re-embed with the cleaned
# text.
order_documents = []

# The index label is unused; it is named `_row_index` rather than `_` so the
# loop does not overwrite IPython's last-output variable.
for _row_index, row in df_orders.iterrows():
    order_documents.append(
        Document(
            page_content=_order_page_content(row),
            # Order id and quantity are stored as strings; the other values
            # are stored exactly as they appear in the row.
            metadata={
                "order_id": str(row["RefOrderId"]),
                "product_name": row["ProductName"],
                "quantity": str(row["Qty"]),
                "status": row["status"],
                "order_date": row["OrderDate"],
            },
        )
    )

# 4. Optional: Split long documents (not strictly necessary unless content is large)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
order_documents_split = splitter.split_documents(order_documents)

# 5. Define path to save/load FAISS index
index_path = "vectorial_dbs/faiss_index_order"

# 6. Load or build FAISS vector store
# NOTE: `vector`, `splitter` and `index_path` are the same notebook-global
# names that the product-index cell assigns; once this cell has run, `vector`
# refers to the order index.
# With the default index name, save_local() writes "index.faiss" and
# "index.pkl" into the folder. Checking for those files (rather than for the
# folder alone) means an empty or partially written folder triggers a rebuild
# instead of a load failure.
index_files = [
    os.path.join(index_path, file_name) for file_name in ("index.faiss", "index.pkl")
]
if all(os.path.isfile(file_path) for file_path in index_files):
    # SECURITY: load_local() unpickles "index.pkl"; unpickling can execute
    # arbitrary code, so allow_dangerous_deserialization=True is only
    # appropriate for index folders produced by this notebook. Never point
    # index_path at files from an untrusted source.
    # NOTE: an existing index is reused as-is and is not compared with
    # `order_documents_split`; delete the folder to force a rebuild after the
    # data changes.
    vector = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    print("✅ FAISS index loaded from disk.")
else:
    vector = FAISS.from_documents(order_documents_split, embeddings)
    vector.save_local(index_path)
    print("✅ FAISS index created and saved to disk.")


✅ FAISS index created and saved to disk.
