In [None]:
!pip install pypdf langchain sentence-transformers faiss-cpu


In [None]:
from google.colab import drive
# Attach Google Drive at /content/drive; the project paths used by the
# following cells all live under this mount point.
drive.mount(mountpoint='/content/drive')

In [None]:
import os

# Root folder of the project inside the mounted Google Drive.
# A dedicated folder under MyDrive (e.g. 'my_rag_chatbot_project') is recommended.
base_drive_path = "/content/drive/MyDrive/my_rag_chatbot_project"

# Build the project skeleton one sub-folder at a time; exist_ok=True means
# re-running this cell does not fail when the folders are already there.
for subdir_name in ("data", "chunks", "vectordb", "notebooks", "src"):
    os.makedirs(os.path.join(base_drive_path, subdir_name), exist_ok=True)

print(f"Project directories created in: {base_drive_path}")

In [None]:
!pip install pypdf langchain langchain-community sentence-transformers faiss-cpu transformers torch -q
# -q for quiet installation

In [None]:
# notebooks/1_document_processing_and_embedding.ipynb (Colab Version)
#
# Pipeline implemented by this cell:
#   load a PDF from Google Drive -> join its pages into one string ->
#   split into overlapping character-based chunks -> save the chunks as JSON ->
#   embed the chunks -> build a FAISS index -> save the index to Drive.
# If the PDF is missing, an error message is printed and nothing else runs.

import json
import os
from google.colab import drive
# Make sure you've mounted drive and created directories first as per steps 2 & 3 above.
# from google.colab import files # If you plan to upload file via code

# All third-party imports are grouped here so a missing package is reported
# up front instead of halfway through the pipeline.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define your base project directory in Google Drive
base_drive_path = "/content/drive/MyDrive/my_rag_chatbot_project"

# --- Step 2: Load the document ---
# Define the path to your PDF document in Google Drive
pdf_path = os.path.join(base_drive_path, "data", "chatdata.pdf")

if not os.path.exists(pdf_path):
    print(f"Error: PDF not found at {pdf_path}. Please upload it to your Google Drive.")
else:
    print(f"Loading document from: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    print(f"Loaded {len(documents)} pages.")

    # Pages are joined into a single string, so the chunks created below carry
    # no per-page metadata.
    full_document_content = "\n".join([doc.page_content for doc in documents])
    print(f"Content length before chunking: {len(full_document_content)} characters.")

    # --- Step 3: Clean and format the text (if needed) ---
    cleaned_content = full_document_content # Placeholder for actual cleaning if required later

    # --- Step 4: Chunk the text ---
    # chunk_size and chunk_overlap are measured in characters (length_function=len):
    # chunks of at most 1200 characters, with 200 characters shared between neighbours.
    # NOTE(review): this step was originally described as "100-300 word segments
    # using sentence-aware splitting", but no sentence-level separators are
    # configured here -- confirm whether custom `separators` are wanted.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )

    chunks = text_splitter.create_documents([cleaned_content])
    print(f"Generated {len(chunks)} chunks.")

    # Stop early with a clear message when nothing could be extracted: building
    # a FAISS index from an empty chunk list fails later with an opaque error.
    # Presumably this happens for scanned / image-only PDFs -- verify the input.
    if not chunks:
        raise ValueError(
            f"No text chunks were produced from {pdf_path}; "
            "the PDF appears to contain no extractable text."
        )

    # Optional: Save chunks to a file in Google Drive for review
    chunk_data = [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]
    chunks_save_path = os.path.join(base_drive_path, "chunks", "processed_chunks.json")
    # Create the target folder if the setup cell was skipped, so open() cannot
    # fail on a missing directory.
    os.makedirs(os.path.dirname(chunks_save_path), exist_ok=True)
    with open(chunks_save_path, "w", encoding="utf-8") as f:
        json.dump(chunk_data, f, ensure_ascii=False, indent=4)
    print(f"Chunks saved to {chunks_save_path}")

    # --- Step 5: Generate embeddings using a pre-trained model ---
    embedding_model_name = "all-MiniLM-L6-v2"
    print(f"Loading embedding model: {embedding_model_name}")
    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model_name)
    print("Embedding model loaded.")

    # --- Step 6: Store the embeddings in a vector database (FAISS) ---
    print("Creating FAISS vector database...")
    vector_db = FAISS.from_documents(chunks, embeddings)
    print("FAISS vector database created.")

    # --- Step 7: Save the vector database to Google Drive ---
    faiss_save_path = os.path.join(base_drive_path, "vectordb", "faiss_index")
    vector_db.save_local(faiss_save_path)
    print(f"FAISS index saved to {faiss_save_path}")

    print("Document processing and embedding complete!")