In [5]:
from langchain.document_loaders import CSVLoader, UnstructuredMarkdownLoader, PyPDFLoader, UnstructuredFileLoader
from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
import os
from pathlib import Path

In [3]:
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.6)
# embeddings = OpenAIEmbeddings()

In [65]:
llm.invoke("Hello")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BgvzRpwJQ5hirLeo7hry0qQPteysp', 'finish_reason': 'stop', 'logprobs': None}, id='run--48f94ba8-f890-4313-b55d-d6ec1de86a70-0', usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [4]:
role_docs  = {
  "engineering": r"..\resources\data\engineering",
  "finance": r"..\resources\data\finance",
  "general": r"..\resources\data\general",
  "hr": r"..\resources\data\hr",
  "marketing": r"..\resources\data\marketing"
}


In [11]:
def vector_db():
  role_vectorstores = {}
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=200,
      separators=["\n\n", "\n", " ", ""])

  for role, folder_path in role_docs.items():
    folder_path = os.path.normpath(folder_path)

    if not os.path.exists(folder_path):
      print(f"⚠️ Warning: Directory '{folder_path}' does not exist. Skipping role '{role}'.")
      continue
    print(f"Processed with {role} {folder_path}")
    documents = []

    
    for filename in os.listdir(folder_path):
      file_path = os.path.join(folder_path, filename)

      if not os.path.isfile(file_path):
        print(f"No file Present in {folder_path}")
        continue

      try:
        if filename.lower().endswith(".md"):
          md_loader = UnstructuredMarkdownLoader(file_path)
          docs = md_loader.load()

        elif filename.lower().endswith(".pdf"):
          pdf_loader = PyPDFLoader(file_path)
          docs = pdf_loader.load()

        elif filename.lower().endswith(".csv"):
          csv_laoder = CSVLoader(file_path)
          docs = csv_laoder.load()

        else:
          print(f"   ⚠️ Trying unstructured loader for: {filename}")
          file_loader = UnstructuredFileLoader(file_path)
          docs = file_loader.load()

        documents.extend(docs)
        print(f" ✅ Loaded {filename} ({len(docs)} documents)")          

      except Exception as e:
        print(f"Failed to load {filename}: {str(e)}")
        continue

    if not documents:
      print(f"⚠️ No documents found for {role}")
      continue
    chunks = text_splitter.split_documents(documents)

    try:
      vector_db = FAISS.from_documents(chunks, embeddings)
      role_vectorstores[role] = vector_db
      print(f"🧠 Created vector store for {role} with {len(chunks)} chunks")

    except Exception as e:
      print(f"Vector DB is not created for {role}: {str(e)}")
  
  return role_vectorstores
  

In [6]:
vectorstores = vector_db()
    
    # Save vector stores to disk
for role, store in vectorstores.items():
    safe_role = "".join(c for c in role if c.isalnum() or c in "_-").rstrip()
    store.save_local(f"vectorstore_{safe_role}")
    print(f"💾 Saved vector store for {role}")

print("\nCompleted processing all roles!")

  from .autonotebook import tqdm as notebook_tqdm


Processed with engineering ..\resources\data\engineering
 ✅ Loaded engineering_master_doc.md (1 documents)
🧠 Created vector store for engineering with 33 chunks
Processed with finance ..\resources\data\finance
 ✅ Loaded financial_summary.md (1 documents)
 ✅ Loaded quarterly_financial_report.md (1 documents)
🧠 Created vector store for finance with 23 chunks
Processed with general ..\resources\data\general
 ✅ Loaded employee_handbook.md (1 documents)
🧠 Created vector store for general with 16 chunks
Processed with hr ..\resources\data\hr
 ✅ Loaded hr_data.csv (100 documents)
🧠 Created vector store for hr with 100 chunks
Processed with marketing ..\resources\data\marketing
 ✅ Loaded marketing_report_2024 - Copy.md (1 documents)
 ✅ Loaded marketing_report_2024.md (1 documents)
 ✅ Loaded marketing_report_q1_2024.md (1 documents)
 ✅ Loaded marketing_report_q2_2024.md (1 documents)
 ✅ Loaded marketing_report_q3_2024.md (1 documents)
 ✅ Loaded market_report_q4_2024.md (1 documents)
🧠 Created v

In [8]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

retriver = vectorstores["finance"].as_retriever()
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriver, question_answer_chain)

chain.invoke({"input": "Cash Flow Analysis"})

{'input': 'Cash Flow Analysis',
 'context': [Document(id='e622a387-8eef-494c-8b96-97cb32afe574', metadata={'source': '..\\resources\\data\\finance\\financial_summary.md'}, page_content='Other Operational Expenses - A mix of general operational and administrative expenses totaling $15M, with a notable increase in travel and miscellaneous office costs, which grew by 8% year-over-year.\n\nCash Flow Analysis:\n\nFinSolve Technologies’s cash flow from operations remained positive, amounting to $50M, a 20% increase over the prior year. However, the delayed payment cycles from several key vendors resulted in accounts payable delays, which slightly impacted cash liquidity during the second half of 2024. Addressing these delays, particularly in vendor payments, will be crucial to ensuring smoother cash flow management.\n\nCash Flow from Investing Activities: Investments in research and development, new market entry, and acquisitions amounted to $15M, representing 20% of total cash flows. While 

In [70]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# 1. Improved system prompt with clearer instructions
system_prompt = (
    "You are an expert assistant specialized in company documentation. "
    "Answer the user's question using ONLY the provided context. "
    "If the answer cannot be found in the context, say 'I don't know'. "
    "Keep your answer concise (2-3 sentences maximum).\n\n"
    "Context:\n{context}"
)

# 2. Create the prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# 3. IMPORTANT: Initialize your vector stores first!
# This should be done BEFORE this code snippet
# vectorstores = load_vector_stores()  # Use your loading function

# 4. Create retriever - added error handling
try:
    # Use the specific vector store for a role
    retriever = vectorstores["general"].as_retriever(
        search_kwargs={"k": 5}  # Retrieve top 5 documents
    )
except KeyError:
    raise ValueError("Vector store for 'general' role not found")

# 5. Initialize LLM (missing in your code)
# This should be done BEFORE creating chains
# Example: 
# from langchain_community.chat_models import ChatOpenAI
# llm = ChatOpenAI(model="gpt-3.5-turbo")

# 6. Create the chains
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

# 7. Invoke the chain - improved with proper input handling
query = "Your question about general policies here..."
result = chain.invoke({"input": query})

# 8. Access results properly
print("Answer:", result["answer"])
print("\nSources used:")
for doc in result["context"]:
    print(f"- {doc.metadata['source']}")

Answer: I don't know.

Sources used:
- ..\resources\data\general\employee_handbook.md
- ..\resources\data\general\employee_handbook.md
- ..\resources\data\general\employee_handbook.md
- ..\resources\data\general\employee_handbook.md
- ..\resources\data\general\employee_handbook.md


In [12]:
import os
def load_vector_store(role: str):
    path = f"vectorstore_{role}"
    print(path)
    if not os.path.exists(path):
        print(f"❌ Path not found: {path}")
        raise FileNotFoundError(f"No vectorstore for role '{role}'")
    return FAISS.load_local(path, embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"), allow_dangerous_deserialization=True)

In [13]:
load_vector_store("hr")

vectorstore_hr


  from .autonotebook import tqdm as notebook_tqdm


<langchain_community.vectorstores.faiss.FAISS at 0x18bb3325b10>