In [1]:
# Uncomment the lines below to install the required packages
# !pip install python-docx
# !pip install langchain
# !pip install sentence-transformers
# !pip install pymongo
# !pip install tf-keras
# !pip install tiktoken
# !pip install openai

### Step1: Setup Prerequisites

In [2]:
import os
from pymongo import MongoClient

In [3]:
# The connection string is read from the environment so that no credentials
# are written into the notebook itself.
MONGODB_URI = os.environ.get("MONGODB_URI")

# Client object shared by every later cell that talks to MongoDB.
mongodb_client = MongoClient(MONGODB_URI)

# Round-trip a ping to check the deployment is reachable; either outcome is
# printed rather than raised so the notebook keeps running.
try:
    mongodb_client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


### Step2: Load the Dataset

In [4]:
from docx import Document

In [5]:
# Extract text from .docx
def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a .docx file as a single string.

    Args:
        file_path: Path of the .docx file; passed unchanged to ``Document``.

    Returns:
        str: The text of every paragraph in ``doc.paragraphs`` with leading and
        trailing whitespace stripped, joined by newlines. Paragraphs that are
        empty after stripping are left out.
    """
    stripped = (para.text.strip() for para in Document(file_path).paragraphs)
    return "\n".join(line for line in stripped if line)

### Step3: Chunk up the data

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
# Shared text splitter used by `get_chunk`.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50,
    # Separators are tried in order, coarsest first: paragraph breaks, line
    # breaks, sentence endings, then single spaces. The empty string is the
    # catch-all and must stay last: the splitter stops as soon as it reaches
    # "", so any separator listed after it is never used. The "#", "##" and
    # "###" entries that previously trailed "" were unreachable for that reason
    # and have been removed; the resulting chunks are unchanged.
    separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
)

In [8]:
# Chunk text using LangChain's RecursiveCharacterTextSplitter
def get_chunk(text, chunk_size=None, chunk_overlap=None):
    """Split ``text`` into overlapping chunks.

    Previously ``chunk_size`` and ``chunk_overlap`` were accepted but silently
    ignored (the shared ``splitter`` was always used). They are now honoured
    when supplied; calling ``get_chunk(text)`` behaves exactly as before.

    Args:
        text: The text to split.
        chunk_size: Optional chunk size override. ``None`` keeps the value the
            shared ``splitter`` was configured with.
        chunk_overlap: Optional overlap override. ``None`` keeps the value the
            shared ``splitter`` was configured with.

    Returns:
        The list of chunk strings returned by the splitter's ``split_text``.

    Raises:
        Whatever ``RecursiveCharacterTextSplitter`` raises for an invalid
        size/overlap combination (e.g. an overlap larger than the chunk size).
    """
    # Fast path: no overrides, reuse the shared splitter as-is.
    if chunk_size is None and chunk_overlap is None:
        return splitter.split_text(text)

    # NOTE(review): the fallbacks read the shared splitter's private
    # attributes so an override of one setting keeps the other settings (and
    # the separator list) consistent with the shared configuration. Confirm
    # these attribute names against the installed LangChain version.
    custom_splitter = RecursiveCharacterTextSplitter(
        chunk_size=splitter._chunk_size if chunk_size is None else chunk_size,
        chunk_overlap=splitter._chunk_overlap if chunk_overlap is None else chunk_overlap,
        separators=splitter._separators,
    )
    return custom_splitter.split_text(text)

### Step4: Generate Embeddings

In [9]:
from sentence_transformers import SentenceTransformer




In [10]:
# Alternative Sentence Transformers checkpoints that could be swapped in:
#   all-MiniLM-L6-v2 (384d) - faster inference, good quality
#   BAAI/bge-small-en-v1.5 (384d) - better semantic understanding
#   sentence-transformers/all-mpnet-base-v2 (768d) - higher quality
# Whichever model is loaded here, the length of its output vectors has to
# agree with the `numDimensions` value of the vector search index.
embedding_model = SentenceTransformer("thenlper/gte-small")

In [11]:
# Generate embeddings using Sentence Transformers
def generate_embeddings(chunks):
    """Encode text chunks with the module-level ``embedding_model``.

    Args:
        chunks: The chunk strings to embed; handed to ``encode`` unchanged.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``chunks``.
    """
    return embedding_model.encode(chunks)

### Step5: Ingest data into MongoDB

In [12]:
# Database and collection that hold the chunked documents and their embeddings.
DB_NAME = "rag_chatbot"
COLLECTION_NAME = "knowledge_base"

In [13]:
# Handle to the `COLLECTION_NAME` collection inside `DB_NAME`; the ingest and
# search cells below all operate on this object.
collection = mongodb_client[DB_NAME][COLLECTION_NAME]

In [14]:

# Bulk delete all existing records from the collection defined above.
# WARNING: the empty filter matches every document, so this wipes the whole
# collection. Presumably this is here so that re-running the notebook does not
# insert duplicate chunks - confirm before pointing at a shared database.
collection.delete_many({})

DeleteResult({'n': 65, 'electionId': ObjectId('7fffffff000000000000026f'), 'opTime': {'ts': Timestamp(1752512775, 14), 't': 623}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1752512775, 14), 'signature': {'hash': b'[\x1b\x19\xc7\n\xb0\x11\x8eU\xaa\xe3\xd8\xe2\xbe\x011\xe4\xb8\x8c\xca', 'keyId': 7461045087271649375}}, 'operationTime': Timestamp(1752512775, 14)}, acknowledged=True)

In [15]:
def store_chunks_in_mongo(chunks, embeddings, source_doc):
    """Insert one document per (chunk, embedding) pair into ``collection``.

    Args:
        chunks: Chunk strings, in document order.
        embeddings: One embedding per chunk, in the same order. Each item must
            provide ``tolist()`` so it can be stored as a plain list.
        source_doc: Label stored in the ``source`` field of every record (the
            caller in this notebook passes the file's base name).

    Side effects:
        Performs a single ``insert_many`` on ``collection`` (skipped when there
        is nothing to insert) and prints a one-line summary.
    """
    # zip() stops at the shorter input, so a chunk without an embedding (or
    # vice versa) is not stored. The summary below therefore reports the number
    # of documents actually inserted rather than len(chunks).
    records = [
        {
            "source": source_doc,
            "chunk_id": chunk_id,
            "text": chunk,
            "embedding": embedding.tolist(),
            "chunk_length": len(chunk),
        }
        for chunk_id, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

    # Batch insert for better performance; guarded because there may be
    # nothing to insert.
    inserted_count = 0
    if records:
        result = collection.insert_many(records)
        inserted_count = len(result.inserted_ids)
    print(f"✅ Ingested {inserted_count} chunks from {source_doc} in MongoDB.")

In [16]:
def process_document(file_path):
    """Run one .docx file through extraction, chunking, embedding and storage.

    Args:
        file_path: Path of the .docx file to ingest. Its base name is what gets
            recorded as the source of each stored chunk.

    Returns:
        tuple: ``(chunks, embeddings)`` exactly as produced by ``get_chunk``
        and ``generate_embeddings``.
    """
    print(f"\n🔹 Processing: {file_path}")
    source_name = os.path.basename(file_path)

    chunks = get_chunk(extract_text_from_docx(file_path))
    embeddings = generate_embeddings(chunks)
    store_chunks_in_mongo(chunks, embeddings, source_name)

    return chunks, embeddings

In [17]:
# Knowledge-base documents to ingest. The paths are relative, so they resolve
# against the directory the notebook kernel is running in.
file_paths = [
    r'Knowledge_Docs/AI Bootcamp Journey & Learning Path.docx',
    r'Knowledge_Docs/Intern FAQ - AI Bootcamp.docx',
    r'Knowledge_Docs/Training For AI Engineer Interns.docx'
]

In [18]:
# Ingest every listed document: extract text, chunk, embed and store in MongoDB.
# The (chunks, embeddings) return value of each call is intentionally discarded.
for file_path in file_paths:
    process_document(file_path)


🔹 Processing: Knowledge_Docs/AI Bootcamp Journey & Learning Path.docx
✅ Ingested 13 chunks from AI Bootcamp Journey & Learning Path.docx in MongoDB.

🔹 Processing: Knowledge_Docs/Intern FAQ - AI Bootcamp.docx
✅ Ingested 41 chunks from Intern FAQ - AI Bootcamp.docx in MongoDB.

🔹 Processing: Knowledge_Docs/Training For AI Engineer Interns.docx
✅ Ingested 11 chunks from Training For AI Engineer Interns.docx in MongoDB.


In [19]:
# Sanity check: show one stored record to confirm the ingested schema.
# NOTE(review): this prints the full `embedding` list, which makes the output
# very long - consider excluding it with a projection.
print(collection.find_one())

{'_id': ObjectId('6875390c7a054d82fa3951a5'), 'source': 'AI Bootcamp Journey & Learning Path.docx', 'chunk_id': 0, 'text': 'Bootcamp Journey\nUse this document as a high-level overview of your journey.\nThis document will reference both these aspects:\nTechnical Skills Development\nCore ML/AI Concepts\nGen AI & Data Engineering\nMLOps & Deployment\nProject-Based Learning\nAgile Scrum Methodology\nTeam Collaborations\nReal-world Applications\nProject Timeline\nHere is a high-level time line of your 11-week journey.\nWeek 1 - 11 Agenda for AI PM Bootcamp\nWeek 1: Learning and Onboarding Study all the AI knowledge:\nTraining for AI Engineers\nTraining of AI Designer\nEngineers: Working on Job Tracker or PM FAQ Chatbot', 'embedding': [-0.051575373858213425, 0.0025243167765438557, 0.055302925407886505, -0.07263786345720291, 0.00776576716452837, 0.06329257041215897, -0.004742173012346029, 0.04291272163391113, -0.032826587557792664, -0.049939848482608795, 0.038683027029037476, -0.037293735891

### Step6: Create a vector search index

In [20]:
from utils.utils import create_index, check_index_ready

In [21]:
# Name of the vector search index; used both when creating the index and when
# querying it in `vector_search`.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

In [22]:
# Definition of the vector search index on the collection.
index_definition= {
    "name": ATLAS_VECTOR_SEARCH_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                # Indexes the `embedding` field written by `store_chunks_in_mongo`.
                "type": "vector",
                "path": "embedding",
                # Presumably must equal the length of the vectors produced by
                # `embedding_model` - TODO confirm if the model is changed.
                "numDimensions": 384,
                "similarity": "cosine"
            },
            {
                # Declares `source` as a filter field. NOTE(review): the
                # `vector_search` function below does not pass a filter yet.
                "type": "filter",
                "path": "source"
            }
        ]
    },
}

In [23]:
# Create the vector search index described by `index_definition` on `collection`
# using the `create_index` helper from the project's `utils` module.
create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, index_definition)

Creating the vector_index index


In [24]:
# Use the `check_index_ready` helper from the `utils` module to verify that the
# index exists and has reached READY status before running any searches.
check_index_ready(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME)

vector_index index status: READY
vector_index index definition: {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}, {'type': 'filter', 'path': 'source'}]}


### Step7: Perform Vector Search on our data

In [25]:
def vector_search(user_query: str, limit: int = 7, min_score: float = 0.7):
    """
    Retrieve the stored chunks most similar to ``user_query``.

    The query is embedded with the module-level ``embedding_model`` and run
    through a ``$vectorSearch`` aggregation on ``collection``.

    Args:
        user_query (str): The text to search for.
        limit (int): Maximum number of results to return.
        min_score (float): Minimum ``vectorSearchScore`` a result must have to
            be returned. This parameter was previously accepted but never
            applied; it is now enforced, so the result list can be empty.

    Returns:
        list[dict]: Up to ``limit`` documents with the keys ``text``,
        ``source``, ``chunk_id`` and ``score``.
    """
    query_embedding = embedding_model.encode([user_query])[0].tolist()

    pipeline = [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "queryVector": query_embedding,
                "path": "embedding",
                # numCandidates may not be smaller than limit; keep the
                # original 50 as the floor and grow with larger limits.
                "numCandidates": max(50, limit),
                # $vectorSearch already caps the result count, so no separate
                # $limit stage is needed afterwards.
                "limit": limit,
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "source": 1,
                "chunk_id": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        },
        {
            # Drop weak matches so callers can detect "nothing relevant found".
            "$match": {"score": {"$gte": min_score}}
        },
    ]

    return list(collection.aggregate(pipeline))

In [26]:
# Spot-check retrieval: each hit carries `source`, `chunk_id`, `text` and `score`.
vector_search("AI engineering skills required")

[{'source': 'Intern FAQ - AI Bootcamp.docx',
  'chunk_id': 20,
  'text': '✅ Demonstrated strong engineering fundamentals\n🎖 Recognition:\n🏅 LinkedIn badge\n🏅Team recognition post on LinkedIn\n🏅 AI Engineer/ AI Designer Certification\n🏅 Access to free job referrals to tech companies\n🏅 Letter of recommendation from team lead\n🏅 LinkedIn endorsement from the team lead\n🏅 Priority access to  future bootcamp cohorts\n🥉 Tier 3: AI Rising Star\nTook initiative. Gained real-world experience.\nCriteria:\n✅ Participated in the entire project without quitting\n✅ Making efforts to communicate with the team lead and team members\n✅ Showed enthusiasm to grow and learn\n🎖 Recognition:',
  'score': 0.938530683517456},
 {'source': 'Intern FAQ - AI Bootcamp.docx',
  'chunk_id': 8,
  'text': 'Week 4: All: Join Pitch Day where Product Mangers and Designers will demonstrate their Ai product ideas and High-Fidelity designs after user research and gathering Voice of the Customer feedback.\nEngineers will th

In [27]:
# Spot-check retrieval with a query that appears verbatim in a stored chunk.
vector_search("Bootcamp Journey")

[{'source': 'AI Bootcamp Journey & Learning Path.docx',
  'chunk_id': 0,
  'text': 'Bootcamp Journey\nUse this document as a high-level overview of your journey.\nThis document will reference both these aspects:\nTechnical Skills Development\nCore ML/AI Concepts\nGen AI & Data Engineering\nMLOps & Deployment\nProject-Based Learning\nAgile Scrum Methodology\nTeam Collaborations\nReal-world Applications\nProject Timeline\nHere is a high-level time line of your 11-week journey.\nWeek 1 - 11 Agenda for AI PM Bootcamp\nWeek 1: Learning and Onboarding Study all the AI knowledge:\nTraining for AI Engineers\nTraining of AI Designer\nEngineers: Working on Job Tracker or PM FAQ Chatbot',
  'score': 0.9556460976600647},
 {'source': 'Intern FAQ - AI Bootcamp.docx',
  'chunk_id': 0,
  'text': 'Intern FAQ - AI Bootcamp\nHere are the next steps (Intern Onboarding).\nIF YOU DIDN’T watch the welcome and onboarding video by Dr. Nancy Li, make sure to watch it now. This is mandatory to know how to be suc

In [28]:
# Spot-check retrieval for a short topical query.
vector_search("Cohort Schedule")

[{'source': 'Intern FAQ - AI Bootcamp.docx',
  'chunk_id': 16,
  'text': "Cohort #5 = Jun-23-2025  - Sep-12-2025\nCohort #6 = Sept-15-2025  - Nov-28-2025\n🧩 Team Matching Process\n🔹 Group Team Match (Week 4)\nDuring this phase, students will:\nFill out a team match form detailing their background, skills, and interests\nAttend a Zoom team match call where PMs will pitch product ideas\nEngineers will then fill out a Google form:\nSharing their tech stack years of experience\nRanking their top 3 product idea choices\nWant to be a Lead Engineer or not.\nTeams will be formed based on mutual interest, project scope, and complementary skills\n📹 Example: Cohort 3's  team match video.",
  'score': 0.9504494667053223},
 {'source': 'Intern FAQ - AI Bootcamp.docx',
  'chunk_id': 33,
  'text': '—------------------------------------------------------------------------------------------------------------------\nAdditional Intern Questions:\nWhat happens in situations when my OPT/CPT dates are 1-mont

### Step8: Build the RAG application

In [31]:
from openai import AzureOpenAI

In [32]:
# Azure OpenAI settings are read from the environment so no secrets are stored
# in the notebook. `os.environ.get` yields None for any variable that is unset.
ENDPOINT=os.environ.get("OPENAI_ENDPOINT")
API_KEY=os.environ.get("OPENAI_KEY")
MODEL_NAME=os.environ.get("MODEL_NAME")
API_VERSION=os.environ.get("API_VERSION")

In [33]:
# Azure OpenAI client used by `generate_answer` for chat completions.
client = AzureOpenAI(
    api_version=API_VERSION,
    azure_endpoint=ENDPOINT,
    api_key=API_KEY,
)

In [34]:
def create_prompt(user_query: str) -> str:
    """
    Build the LLM prompt for a query from the chunks retrieved for it.

    Args:
        user_query (str): The user's query string.

    Returns:
        str: An instruction to answer only from the context, followed by the
        retrieved chunk texts (separated by blank lines) and the question.
    """
    # Fetch the most relevant chunks with `vector_search` (Step 7).
    retrieved_docs = vector_search(user_query)
    # Only the `text` field of each hit goes into the prompt.
    context = "\n\n".join(doc.get('text') for doc in retrieved_docs)
    return f"Answer the question based only on the following context. If the context is empty, say I DON'T KNOW\n\nContext:\n{context}\n\nQuestion:{user_query}"

In [35]:
#  Define a function to answer user queries
def generate_answer(user_query: str) -> None:
    """
    Answer a query with the chat model and print the reply.

    The prompt (question plus retrieved context) comes from ``create_prompt``;
    the text of the first returned choice is printed, nothing is returned.

    Args:
        user_query (str): The user's query string.
    """
    messages = [
        {"role": "system", "content": "You are a helpful Q&A assistant."},
        {"role": "user", "content": create_prompt(user_query)},
    ]

    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=4096,
        temperature=1.0,
        top_p=1.0,
    )

    answer = completion.choices[0].message.content
    print(answer)

In [36]:
# End-to-end check: retrieve context for the query and print the model's answer.
generate_answer('2 Team matching sessions')

The two team matching sessions are as follows:

1. Week 2: Product Managers will pitch to Designers to join their AI product idea.
2. Week 4: Product Managers and Designers will create High-Fidelity designs to pitch their AI product idea to Engineers/Data Scientists.


In [37]:
# End-to-end check with a second sample query.
generate_answer('Team Structure Overview')

Each team is formed from participants in the PM Accelerator Program and typically includes:

- Product Managers – Responsible for business case development, voice of customer interviews, and market research
- Developers – Execute the technical build
- (Optional) Data Scientists & Designers – Added as needed, based on project complexity

Typical team size: 8–10 members. Larger-scope projects may be assigned bigger teams. Teams with multiple PMs and developers tend to launch more ambitious, successful products.


In [40]:
# End-to-end check using the same query as the earlier retrieval spot-check.
generate_answer('Cohort Schedule')

Cohort #5 runs from June 23, 2025, to September 12, 2025. Following that, Cohort #6 is scheduled from September 15, 2025, to November 28, 2025.
