In [27]:
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document

In [4]:
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fail fast when the key is absent or empty.
openai_key = os.environ.get("OPENAI_API_KEY")
if not openai_key:
    raise ValueError("OPENAI_API_KEY not found in .env file.")

In [5]:
from pathlib import Path

In [6]:
# Loader class to use for each supported (lower-cased) file extension.
LOADER_BY_SUFFIX = {".pdf": PyPDFLoader, ".txt": TextLoader}

docs = []

data_path = Path("data")
# Path.iterdir() yields entries in arbitrary order; sorting keeps the document
# order (and therefore any "first chunk" preview) stable from run to run.
for file in sorted(data_path.iterdir()):
    # Compare the extension case-insensitively so e.g. "REPORT.PDF" is loaded too.
    loader_cls = LOADER_BY_SUFFIX.get(file.suffix.lower())
    if loader_cls is None or not file.is_file():
        # Unsupported extension, or a directory whose name merely ends in .pdf/.txt.
        continue
    loader = loader_cls(str(file))
    docs.extend(loader.load())

In [7]:
# Reports how many Document objects the loaders returned. This runs before the
# text splitter, so the number counts loader output, not split chunks.
print(f"Loaded {len(docs)} document chunks.")

Loaded 2 document chunks.


In [8]:
# Separators listed from coarsest (paragraph break) down to the empty string.
split_separators = ["\n\n", "\n", ".", "!", "?", " ", ""]

# chunk_overlap=200 against chunk_size=800 lets neighbouring chunks share text.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm against the installed LangChain version.
text_splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    chunk_size=800,
    chunk_overlap=200,
)

In [9]:
# Run the configured splitter over every loaded document; the result is what
# gets embedded and indexed later in the notebook.
chunks = text_splitter.split_documents(docs)

In [10]:
print(f"Split into {len(chunks)} chunks.")
# Only preview when there is something to show: indexing chunks[0] on an empty
# list would raise IndexError and hide the real problem (nothing was loaded).
if chunks:
    # Show at most the first 300 characters of the first chunk.
    print(f"Sample chunk preview:\n\n{chunks[0].page_content[:300]}...")
else:
    print("No chunks to preview.")

Split into 3 chunks.
Sample chunk preview:

Subject: Project Phoenix Development Update  
From: operations@company.com  
To: project-phoenix@company.com  
Date: June 18, 2025

Hi team,

As discussed in our last meeting, Project Phoenix has officially moved into the **development phase**.  
The model architecture has been approved and assigned...


In [11]:
# Embedding client used to vectorise the chunks. No arguments are passed here;
# presumably the API key is picked up from the environment populated by
# load_dotenv() — confirm.
# NOTE(review): the source line echoed in this cell's output looks like the tail
# of a deprecation warning, possibly about the `langchain.embeddings` import
# path — verify and migrate the import if so.
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [13]:
# Fail early with an actionable message instead of letting an empty corpus
# reach the embedding/index build.
if not chunks:
    raise ValueError(
        "No chunks to index; load and split documents before building the vector store."
    )

# Embed every chunk and build the FAISS index from the resulting vectors.
vector_store = FAISS.from_documents(chunks, embeddings)
print("FAISS vector store created successfully.")

FAISS vector store created successfully.


In [28]:
# Raw prompt text with three placeholders: {role}, {context} and {question}.
# NOTE(review): the template asks the model to respect the role, but it carries
# no description of what any role may or may not see, so the model has nothing
# concrete to enforce.
prompt_template = """
You are a helpful assistant for enterprise teams.
Respond to the following question using ONLY the information from the context provided below.

Only show information that someone in the role of a "{role}" is allowed to access.

Context:
{context}

Question:
{question}

Answer:
"""

In [29]:
# Wrap the raw template text so its three placeholders can be filled by keyword.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question", "role"],
)

In [30]:
# Chat model that produces the final answer.
# NOTE(review): temperature=0.2 is presumably chosen to keep answers close to
# the supplied context; the value and the model name are hard-coded here.
llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo")

In [31]:
def ask_question_with_role(role: str, question: str, k: int = 4) -> str:
    """Answer ``question`` from the indexed documents, phrased for ``role``.

    The ``k`` chunks most similar to the question are fetched from the
    notebook-level ``vector_store``, joined with blank lines into one context
    string, inserted into ``prompt`` together with the role and the question,
    and the formatted prompt is sent to ``llm``.

    NOTE(review): ``role`` is only interpolated into the prompt text. Retrieval
    is not filtered by role, so every retrieved chunk reaches the model
    whatever role is given. Treat the role instruction as a request to the
    model, not as an access-control check.

    Args:
        role: Role label substituted into the prompt (e.g. "Manager").
        question: Natural-language question; must contain non-whitespace text.
        k: Number of chunks to retrieve. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The ``content`` attribute of the model's response.

    Raises:
        ValueError: If ``question`` is blank or ``k`` is less than 1.
    """
    if not question or not question.strip():
        raise ValueError("question must not be empty.")
    if k < 1:
        raise ValueError("k must be at least 1.")

    # Named `retrieved` rather than `docs` so it does not shadow the
    # notebook-level `docs` list of loaded documents.
    retrieved = vector_store.similarity_search(question, k=k)
    context = "\n\n".join(doc.page_content for doc in retrieved)
    final_prompt = prompt.format(context=context, question=question, role=role)
    response = llm.invoke(final_prompt)

    return response.content

In [32]:
# Example query issued with the "Manager" role; the returned answer string is
# displayed as the cell output.
ask_question_with_role("Manager", "What is the current status of Project Phoenix?")

'The current status of Project Phoenix is that the development phase has officially begun, with the model architecture approved and assigned to the engineering team. Backend development has started, API integration deadlines have been shared with the tech lead, and compliance documentation is still under internal review. The team is reminded to complete sprint planning by June 20, with the next milestone being to complete the MVP by July 10, 2025.'

In [33]:
# Same helper with the "Intern" role.
# NOTE(review): the recorded answer names a specific person. The helper does not
# filter retrieval by role, so confirm this is information an intern may see.
ask_question_with_role("Intern", "Who is handling backend development?")

'Backend Developer: Aditi Jain'

In [34]:
# Same helper with the "HR" role.
# NOTE(review): the recorded answer was not withheld or altered for this role;
# confirm whether compliance status is meant to be visible to HR.
ask_question_with_role("HR", "Is the compliance documentation completed?")

'Compliance documentation is still under internal review.'