In [10]:
import re
def extract_supplier_vendor_name(text: str):
    """
    Detect a supplier/vendor name in free-form text.

    The text is searched case-insensitively for one of the trigger words
    ``supplier``, ``vendor``, ``from`` or ``shipper``. Whatever follows the
    trigger (after an optional "name" / "is", or an optional
    "supplier" / "vendor" after "from") is taken as the name, up to the first
    stop word (for, with, has, is, on, in, at), a whitespace-preceded comma,
    or the end of the text.

    Handles:
        - "supplier ABC Corporation"
        - "vendor XYZ Ltd"
        - "from Acme Industries"
        - "shipper Global Logistics"

    Args:
        text: Free-form text to scan; may be empty or None.

    Returns:
        The extracted name with trailing commas/periods removed, whitespace
        collapsed and ``str.title()`` applied (so "ABC" becomes "Abc"), or
        None when no trigger word matches or the candidate is shorter than
        2 characters.
    """
    if not text:
        return None

    # Characters allowed inside a name; lazy so the first terminator wins.
    name_group = r"([A-Za-z0-9\s\.,&\-']+?)"
    # Stop words must end on a word boundary. Without the \b, "in" matched
    # the start of "Industries"/"Inc" and "is" the start of "Island",
    # truncating the name after its first word.
    terminator = r"(?:\s+(?:(?:for|with|has|is|on|in|at)\b|,)|$)"

    # Tried in order; the first trigger that yields a usable name wins.
    prefixes = [
        r"supplier\s+(?:name\s+)?(?:is\s+)?",
        r"vendor\s+(?:name\s+)?(?:is\s+)?",
        r"from\s+(?:supplier\s+)?(?:vendor\s+)?",
        r"shipper\s+(?:name\s+)?(?:is\s+)?",
    ]

    for prefix in prefixes:
        m = re.search(prefix + name_group + terminator, text, flags=re.IGNORECASE)
        if m:
            # Extract and clean the name
            name = m.group(1).strip()
            # Remove trailing punctuation
            name = re.sub(r'[,.\s]+$', '', name)
            # Clean up extra whitespace
            name = re.sub(r'\s+', ' ', name)
            # Return if valid length (at least 2 characters); otherwise fall
            # through to the next trigger word.
            if len(name) >= 2:
                return name.title()

    return None


In [12]:
# The text contains none of the trigger words (supplier / vendor / from /
# shipper), so no pattern matches and the function returns None.
print(extract_supplier_vendor_name("QUEST COMPOSITE TECHNOLOGY")) 

None


In [13]:
import re
def extract_supplier_vendor_name(text: str):
    """
    Pull a supplier/vendor name out of free-form text.

    A name is only recognised when it follows one of the trigger words
    ``supplier``, ``vendor``, ``from`` or ``shipper`` (matched
    case-insensitively). It runs until a stop word (for, with, has, on, in,
    at) or the end of the text, and may contain parentheses.

    Examples of supported input:
        - "supplier ABC Corporation"
        - "vendor XYZ Ltd"
        - "from Acme Industries"
        - "shipper Global Logistics"
        - "supplier QUEST COMPOSITE TECHNOLOGY(0026071)"

    Returns the cleaned, title-cased name, or None when nothing usable is
    found (including when ``text`` is empty or None).
    """
    if not text:
        return None

    # One pattern per trigger word, tried in this order.
    keyword_patterns = (
        r"supplier\s+(?:name\s+)?(?:is\s+)?([A-Za-z0-9\s\.,&\-'()]+?)(?:\s+(?:for|with|has|on|in|at)\b|$)",
        r"vendor\s+(?:name\s+)?(?:is\s+)?([A-Za-z0-9\s\.,&\-'()]+?)(?:\s+(?:for|with|has|on|in|at)\b|$)",
        r"from\s+(?:supplier\s+)?(?:vendor\s+)?([A-Za-z0-9\s\.,&\-'()]+?)(?:\s+(?:for|with|has|on|in|at)\b|$)",
        r"shipper\s+(?:name\s+)?(?:is\s+)?([A-Za-z0-9\s\.,&\-'()]+?)(?:\s+(?:for|with|has|on|in|at)\b|$)",
    )

    for pattern in keyword_patterns:
        found = re.search(pattern, text, flags=re.IGNORECASE)
        if found is None:
            continue

        candidate = found.group(1).strip()
        # Drop trailing commas, periods and whitespace; parentheses are kept.
        candidate = re.sub(r'[,.\s]+$', '', candidate)
        # Collapse internal whitespace runs to a single space.
        candidate = re.sub(r'\s+', ' ', candidate)

        # Candidates shorter than 2 characters are rejected; try the next
        # trigger word instead.
        if len(candidate) < 2:
            continue
        return candidate.title()

    return None

In [14]:
# Parentheses are accepted inside a name, but this text still has no trigger
# word (supplier / vendor / from / shipper), so the result is None.
print(extract_supplier_vendor_name("QUEST COMPOSITE TECHNOLOGY(0026071)")) 

None


In [15]:
# With the "supplier" trigger word present, the name (including the
# parenthesised code) is extracted and title-cased.
print(extract_supplier_vendor_name("supplier QUEST COMPOSITE TECHNOLOGY(0026071)"))
# Output: Quest Composite Technology(0026071)

Quest Composite Technology(0026071)


In [2]:
from langchain_openai import AzureOpenAIEmbeddings
from config import settings

def get_azure_embeddings() -> AzureOpenAIEmbeddings:
    """Build the Azure OpenAI embeddings client from the project settings.

    Reads the deployment name, API version, endpoint and API key from
    ``config.settings``. Per the original note, this model is used by the
    vectorstore and RAG.
    """
    client_options = {
        "azure_deployment": settings.AZURE_OPENAI_EMBEDDING_MODEL,
        "openai_api_version": settings.AZURE_OPENAI_API_VERSION,
        "azure_endpoint": settings.AZURE_OPENAI_ENDPOINT,
        "api_key": settings.AZURE_OPENAI_API_KEY,
    }
    return AzureOpenAIEmbeddings(**client_options)

In [4]:
# Embeddings client shared by the smoke-test cells below.
embeddings = get_azure_embeddings()

In [5]:
# Smoke test: embed a single string with the client created above.
test_text = "This is a test sequence for the embeddings model."
vector = embeddings.embed_query(test_text)

In [6]:
# Report the size, a small sample and the Python type of the query embedding.
print("✓ Embedding successful!")
print(f"Vector dimension: {len(vector)}")
print(f"First 5 values: {vector[:5]}")
print(f"Vector type: {type(vector)}")

✓ Embedding successful!
Vector dimension: 1536
First 5 values: [-0.029504351815876323, -0.0025313085636933176, -0.004927523671912916, -0.0015375293312978294, 0.014279349046610776]
Vector type: <class 'list'>


In [7]:
# Smoke test for batch embedding: embed several short documents in one call
# and report the count, dimension, a small sample and the Python type.
test_documents = [
    "Container shipment delayed",
    "Supplier delivery on time",
    "Cargo inspection complete",
]
vectors = embeddings.embed_documents(test_documents)

print("✓ Batch embedding successful!")
print(f"Number of documents embedded: {len(vectors)}")
print(f"Each vector dimension: {len(vectors[0])}")
print(f"First 5 values of the first vector: {vectors[0][:5]}")
print(f"Vector type: {type(vectors[0])}")

✓ Batch embedding successful!
Number of documents embedded: 3
Each vector dimension: 1536
First 5 values of the first vector: [-0.0009257371044195405, -0.036719217294288044, 0.00021014569543001887, 0.00040595848073678625, -0.020180731865602163]
Vector type: <class 'list'>


In [10]:
# services/vectorstore.py
import logging
import time
from pathlib import Path
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import AzureOpenAIEmbeddings

from config import settings


In [11]:
import logging
from pathlib import Path
from langchain_openai import AzureOpenAIEmbeddings
from config import settings
from services.azure_blob import get_shipment_df

# Logger used by the index build/load functions below.
logger = logging.getLogger("shipping_chatbot")
# Directory the FAISS index is saved to and loaded from (relative to the
# current working directory).
VECTORSTORE_DIR = Path("faiss_index")


def _embeddings() -> AzureOpenAIEmbeddings:
    """Instantiate the Azure OpenAI embeddings client from ``settings``.

    Per the original note, this is the same embeddings object used for RAG
    and for the RetrievalQA chain. A new client is created on every call.
    """
    embedding_client = AzureOpenAIEmbeddings(
        azure_deployment=settings.AZURE_OPENAI_EMBEDDING_MODEL,
        openai_api_version=settings.AZURE_OPENAI_API_VERSION,
        azure_endpoint=settings.AZURE_OPENAI_ENDPOINT,
        api_key=settings.AZURE_OPENAI_API_KEY,
    )
    return embedding_client


In [12]:
def _build_index() -> FAISS:
    """
    Build a brand new FAISS index from the shipment dataframe and persist it.

    Each dataframe row is rendered as one text block of "column: value"
    lines (blank values skipped). Rows are split only when they exceed the
    splitter's chunk size, then embedded 50 chunks at a time with a short
    pause between batches. The finished index is saved to
    ``VECTORSTORE_DIR``.

    Returns:
        The freshly built FAISS vector store.

    Raises:
        ValueError: If the shipment dataframe yields no text to index.
    """
    logger.info("Creating new FAISS vector store")
    df = get_shipment_df()

    # Turn each row into a single text block, one "key: value" line per
    # non-blank cell.
    rows_as_text = [
        "\n".join(f"{k}: {v}" for k, v in row.items() if str(v).strip())
        for row in df.fillna("").astype(str).to_dict(orient="records")
    ]

    # chunk_size is deliberately large (2000) so that a row normally stays in
    # one chunk; splitting a row separates keys from their context.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    chunks: List[str] = [
        chunk for txt in rows_as_text for chunk in splitter.split_text(txt)
    ]

    # Fail with a clear message instead of an obscure error from inside
    # FAISS.from_texts when there is nothing to embed.
    if not chunks:
        raise ValueError("No shipment text to index; cannot build FAISS vector store")

    embeddings = _embeddings()

    # NOTE(review): chunks may be up to 2000 characters, so one batch can
    # reach ~100k characters; confirm this stays within the embedding
    # endpoint's per-request limit.
    batch = 50
    # Ceiling division so a trailing partial batch is counted in the total.
    total_batches = (len(chunks) + batch - 1) // batch
    vectorstore = FAISS.from_texts(chunks[:batch], embeddings)

    for i in range(batch, len(chunks), batch):
        vectorstore.add_texts(chunks[i : i + batch])
        # Log only every fifth batch to keep debug output manageable.
        if i % (batch * 5) == 0:
            logger.debug("Indexed batch %d/%d", i // batch + 1, total_batches)

        # Brief pause between embedding requests to ease rate limiting.
        time.sleep(0.2)

    VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)
    vectorstore.save_local(str(VECTORSTORE_DIR))
    logger.info("FAISS index persisted")
    return vectorstore


# ----------------------------------------------------------------------
# Public accessor – lazy‑load on first call
# ----------------------------------------------------------------------
# Process-wide cache; populated by the first get_vectorstore() call.
_vectorstore: FAISS | None = None


def get_vectorstore() -> FAISS:
    """Return the cached FAISS store, loading or building it on first use.

    On the first call the index is loaded from ``VECTORSTORE_DIR`` when that
    directory exists. If loading fails, or the directory is missing, a new
    index is built with ``_build_index()``. Later calls return the cached
    instance.
    """
    global _vectorstore

    if _vectorstore is not None:
        return _vectorstore

    if not VECTORSTORE_DIR.exists():
        _vectorstore = _build_index()
        return _vectorstore

    logger.info("Loading existing FAISS index")
    try:
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # LangChain's pickle-based loading; only point VECTORSTORE_DIR at
        # index files this application wrote itself.
        _vectorstore = FAISS.load_local(
            str(VECTORSTORE_DIR),
            _embeddings(),
            allow_dangerous_deserialization=True,
        )
    except Exception as e:
        # Any load failure falls back to a full rebuild.
        logger.error(f"Failed to load existing index: {e}. Rebuilding...")
        _vectorstore = _build_index()
    return _vectorstore

In [None]:
# Trigger the lazy load: loads the index from VECTORSTORE_DIR if it exists,
# otherwise builds and persists a new one.
get_vectorstore()