In [24]:
# !pip install python-docx langchain
# !pip install sentence-transformers
# !pip install pymongo
# !pip install tf-keras

**Importing Packages**

In [56]:
from pymongo import MongoClient
from pymongo.errors import OperationFailure
from pymongo.collection import Collection

from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from docx import Document

import os
import requests
from typing import Dict
import time

**MongoDB Connection**

In [57]:
MONGODB_URI = (
    "mongodb+srv://ssmaheswar2001:11%40May%402001@discordchatbot.6llgqxu.mongodb.net/?retryWrites=true&w=majority&appName=DiscordChatbot"
)

# Create a new client and connect to the server
mongodb_client = MongoClient(MONGODB_URI)

# Send a ping to confirm a successful connection
try:
    mongodb_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

DB_NAME = "rag_chatbot"
COLLECTION_NAME = "knowledge_base"

collection = mongodb_client[DB_NAME][COLLECTION_NAME]

# Bulk delete all existing records from the collection defined above
collection.delete_many({})

Pinged your deployment. You successfully connected to MongoDB!


DeleteResult({'n': 228, 'electionId': ObjectId('7fffffff000000000000026f'), 'opTime': {'ts': Timestamp(1752356044, 46), 't': 623}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1752356044, 47), 'signature': {'hash': b'\x16\xbeJ\xa7u\xd1SbR\x94?\xdb`\xadO\xb1\xbb\xcfW\x91', 'keyId': 7461045087271649375}}, 'operationTime': Timestamp(1752356044, 46)}, acknowledged=True)

In [58]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dimension

In [59]:
# 1. Extract text from .docx
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = []
    for para in doc.paragraphs:
        if para.text.strip():
            text.append(para.text.strip())
    return "\n".join(text)

In [60]:
# path = r'/AI Bootcamp Journey & Learning Path.docx'
# path = r'/content/drive/MyDrive/Research/RAG_Chatbot/Knowledge_Docs/AI Bootcamp Journey & Learning Path.docx'
# extract_text_from_docx(path)

In [61]:
# 2. Chunk text using LangChain's RecursiveCharacterTextSplitter
def chunk_text(text, chunk_size=200, chunk_overlap=30):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

In [62]:
# 3. Generate embeddings using Sentence Transformers
def generate_embeddings(chunks):
    embeddings = model.encode(chunks)
    return embeddings

In [63]:
# Store in MongoDB
def store_chunks_in_mongo(chunks, embeddings, source_doc):
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        record = {
            "source": source_doc,
            "chunk_id": i,
            "text": chunk,
            "embedding": embedding.tolist()  # numpy -> list
        }
        collection.insert_one(record)
    print(f"✅ Stored {len(chunks)} chunks from {source_doc} in MongoDB.")

In [64]:
# 4. Process a single document
def process_document(file_path):
    print(f"\n🔹 Processing: {file_path}")
    text = extract_text_from_docx(file_path)
    chunks = chunk_text(text)
    embeddings = generate_embeddings(chunks)

    store_chunks_in_mongo(chunks, embeddings, os.path.basename(file_path) )

    # Preview first 3 chunks
    for i in range(min(3, len(chunks))):
        print(f"\n--- Chunk {i+1} ---\n{chunks[i][:300]}...\nEmbedding Sample: {embeddings[i][:5]}\n")

    return chunks, embeddings

In [65]:
# 5. List of your uploaded documents
file_paths = [
    r'/content/drive/MyDrive/Research/RAG_Chatbot/Knowledge_Docs/AI Bootcamp Journey & Learning Path.docx',
    r'/content/drive/MyDrive/Research/RAG_Chatbot/Knowledge_Docs/Intern FAQ - AI Bootcamp.docx',
    r'/content/drive/MyDrive/Research/RAG_Chatbot/Knowledge_Docs/Training For AI Engineer Interns.docx'
]

In [66]:
# 6. Run the process for each file
for file_path in file_paths:
    process_document(file_path)


🔹 Processing: /content/drive/MyDrive/Research/RAG_Chatbot/Knowledge_Docs/AI Bootcamp Journey & Learning Path.docx
✅ Stored 45 chunks from AI Bootcamp Journey & Learning Path.docx in MongoDB.

--- Chunk 1 ---
Bootcamp Journey
Use this document as a high-level overview of your journey.
This document will reference both these aspects:
Technical Skills Development
Core ML/AI Concepts
Gen AI & Data Engineering...
Embedding Sample: [ 0.00062082 -0.10144586  0.02150271  0.03365792  0.02824496]


--- Chunk 2 ---
Gen AI & Data Engineering
MLOps & Deployment
Project-Based Learning
Agile Scrum Methodology
Team Collaborations
Real-world Applications
Project Timeline...
Embedding Sample: [-0.01071191 -0.04011107 -0.03363782 -0.01385395  0.06305641]


--- Chunk 3 ---
Project Timeline
Here is a high-level time line of your 11-week journey.
Week 1 - 11 Agenda for AI PM Bootcamp
Week 1: Learning and Onboarding Study all the AI knowledge:
Training for AI Engineers...
Embedding Sample: [-0.02976413 -0.0

In [67]:
print(collection.find_one())

{'_id': ObjectId('6872d4d8a5ac9bc63b8dfd57'), 'source': 'AI Bootcamp Journey & Learning Path.docx', 'chunk_id': 0, 'text': 'Bootcamp Journey\nUse this document as a high-level overview of your journey.\nThis document will reference both these aspects:\nTechnical Skills Development\nCore ML/AI Concepts\nGen AI & Data Engineering', 'embedding': [0.0006208208505995572, -0.10144586116075516, 0.02150271087884903, 0.033657923340797424, 0.02824496105313301, -0.003074043430387974, 0.07726439088582993, 0.01505653653293848, -0.11698219180107117, -0.04487369954586029, -0.08303511142730713, -0.0037865121848881245, 0.07342841476202011, -0.010206049308180809, -0.02167656645178795, 0.07771863788366318, 0.016874490305781364, -0.05782297998666763, -0.0060520595870912075, -0.08342014253139496, 0.06491820514202118, 0.01680588722229004, -0.027954338118433952, 0.003069828264415264, -0.035850197076797485, 0.07642248272895813, 0.06691712141036987, -0.006445317063480616, 0.01712341047823429, 0.040400780737400

**Vector Search Index**

In [68]:
SLEEP_TIMER = 5

In [69]:
def create_index(collection: Collection, index_name: str, model: Dict) -> None:
    """
    Create a search index

    Args:
        collection (Collection): Collection to create search index against
        index_name (str): Index name
        model (Dict): Index definition
    """
    try:
        print(f"Creating the {index_name} index")
        collection.create_search_index(model=model)
    except OperationFailure:
        print(f"{index_name} index already exists, recreating...")
        try:
            print(f"Dropping {index_name} index")
            collection.drop_search_index(name=index_name)

            # Poll for index deletion to complete
            while True:
                indexes = list(collection.list_search_indexes())
                index_exists = any(idx.get("name") == index_name for idx in indexes)
                if not index_exists:
                    print(f"{index_name} index deletion complete")
                    break
                print(f"Waiting for {index_name} index deletion to complete...")
                time.sleep(SLEEP_TIMER)

            print(f"Creating new {index_name} index")
            collection.create_search_index(model=model)
            print(f"Successfully recreated the {index_name} index")
        except Exception as e:
            raise Exception(f"Error during index recreation: {str(e)}")


def check_index_ready(collection: Collection, index_name: str) -> None:
    """
    Poll for index status until it's ready

    Args:
        collection (Collection): Collection to check index status against
        index_name (str): Name of the index to check
    """
    while True:
        indexes = list(collection.list_search_indexes())
        matching_indexes = [idx for idx in indexes if idx.get("name") == index_name]

        if not matching_indexes:
            print(f"{index_name} index not found")
            time.sleep(SLEEP_TIMER)
            continue

        index = matching_indexes[0]
        status = index["status"]
        if status == "READY":
            print(f"{index_name} index status: READY")
            print(f"{index_name} index definition: {index['latestDefinition']}")
            break

        print(f"{index_name} index status: {status}")
        time.sleep(SLEEP_TIMER)

In [70]:
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

In [71]:
model = {
    "name": ATLAS_VECTOR_SEARCH_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 384,
                "similarity": "cosine",
            }
        ]
    },
}

In [72]:
# Use the `create_index` function from the `utils` module to create a vector search index with the above definition for the `collection` collection
create_index(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME, model)

Creating the vector_index index


In [73]:
# Use the `check_index_ready` function from the `utils` module to verify that the index was created and is in READY status before proceeding
check_index_ready(collection, ATLAS_VECTOR_SEARCH_INDEX_NAME)

vector_index index status: READY
vector_index index definition: {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 384, 'similarity': 'cosine'}]}


**Performing vector search on our Data**

In [74]:
# Define a function to retrieve relevant documents for a user query using vector search
def vector_search(user_query: str):
    """
    Retrieve relevant documents for a user query using vector search.

    Args:
    user_query (str): The user's query string.

    Returns:
    list: A list of matching documents.
    """

    query_embedding = model.encode(user_query).tolist()

    pipeline = [
                    {
                        "$vectorSearch": {
                            "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                            "queryVector": query_embedding,
                            "path": "embedding",
                            "numCandidates": 150,
                            "limit": 10,
                        }
                    },
                    {"$project": {
                            "_id": 0,
                            "text":1,
                            "score": {"$meta": "vectorSearchScore"}
                            }
                    }
                ]

    # Execute the aggregation `pipeline` and store the results in `results`
    results = collection.aggregate(pipeline)
    return list(results)

In [75]:
vector_search("AI engineering skills required")

AttributeError: 'dict' object has no attribute 'encode'