In [None]:
!pip install -U torch torchvision --no-cache-dir
!pip install numpy google-colab huggingface_hub python-docx fastapi pyngrok uvicorn nest_asyncio sentence-transformers faiss-cpu datasets --no-cache-dir
!pip install -U bitsandbytes transformers


import gc
import json
import os
import re
from contextlib import asynccontextmanager
from typing import List

import torch
import numpy as np
from google.colab import drive
from huggingface_hub import login, snapshot_download
from docx import Document
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from fastapi import FastAPI
from pyngrok import ngrok
import uvicorn
import nest_asyncio
from sentence_transformers import SentenceTransformer
import faiss
from google.colab import userdata
from datasets import load_dataset

# Mount Google Drive; KB_PATH and DRIVE_MODEL_PATH below both live under it.
drive.mount('/content/drive')
# NOTE(review): presumably needed so uvicorn.run() can start inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Configuration
# Folder scanned (non-recursively) for the .docx knowledge-base documents.
KB_PATH = '/content/drive/MyDrive/lifesciences/training_documents/'
# Parent folder; the model snapshot is stored in a sub-folder named after the repo.
DRIVE_MODEL_PATH = '/content/drive/MyDrive/lifesciences/models/'
# Hugging Face Hub repository of the LLM that is downloaded and loaded.
MODEL_REPO_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Constants
# Chunk window size, measured in tokens of the LLM tokenizer.
CHUNK_SIZE = 512  # In tokens
# Tokens shared by consecutive chunks; the window advances by CHUNK_SIZE - CHUNK_OVERLAP.
CHUNK_OVERLAP = 50
# NOTE(review): not referenced anywhere in the visible code — retrieval currently
# returns the top results without applying a score cut-off.
SIMILARITY_THRESHOLD = 0.65
# Maximum prompt length in CHARACTERS (the prompt string is sliced to this length).
MAX_CONTEXT_LENGTH = 3000

# ----------------------------
# Initialization (Run once)
# ----------------------------
class RAGSystem:
    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.embedding_model = None
        self.index = None
        self.chunks = []

    def initialize(self):
        """One-time initialization of all components"""
        print("Entering initialize() method")
        try:
            self._auth_huggingface()
            model_path = self._download_model()
            self._load_llm(model_path)
            self._load_embedding_model()
            self._process_knowledge_base()
        except Exception as e:
            print(f"Error during initialization: {str(e)}")
        print("Exiting initialize() method")

    def _auth_huggingface(self):
        """Log in to the Hugging Face Hub using the Colab secret ``HF_TOKEN``.

        Raises:
            Exception: whatever the secret lookup or ``login`` raised; the
                failure is printed first and then re-raised unchanged.
        """
        print("Entering _auth_huggingface() method")
        try:
            from google.colab import userdata
            hf_token = userdata.get("HF_TOKEN")
            login(token=hf_token)
            print("Hugging Face authentication successful.")
        except Exception as exc:
            print(f"Authentication failed: {str(exc)}")
            raise
        print("Exiting _auth_huggingface() method")

    def _download_model(self):
        """Download the model snapshot into a per-model folder on Drive.

        The target folder is ``DRIVE_MODEL_PATH/<last part of MODEL_REPO_ID>``
        and is created if missing. Weight formats that are not needed
        (``*.msgpack``, ``*.h5``, ``*.ot``) are skipped.

        Returns:
            The local path returned by ``snapshot_download``.

        Raises:
            Exception: any download/filesystem error, re-raised after being
                printed.
        """
        print("Entering _download_model() method")
        try:
            model_folder = os.path.join(DRIVE_MODEL_PATH, MODEL_REPO_ID.split('/')[-1])
            os.makedirs(model_folder, exist_ok=True)

            # `local_dir_use_symlinks` is intentionally no longer passed: recent
            # huggingface_hub releases deprecate it (it triggers a warning and
            # is ignored) and always write real files into `local_dir`.
            model_path = snapshot_download(
                repo_id=MODEL_REPO_ID,
                cache_dir=model_folder,
                revision="main",
                ignore_patterns=["*.msgpack", "*.h5", "*.ot"],
                local_dir=model_folder,
            )
            print(f"Model downloaded to {model_path}")
            return model_path
        except Exception as e:
            print(f"Model download failed: {str(e)}")
            raise
        finally:
            # In `finally` so the exit trace is printed on both the return and
            # the raise path (it was unreachable after them before).
            print("Exiting _download_model() method")

    def _load_llm(self, model_path):
        """Load the tokenizer and the 4-bit quantized causal LM from ``model_path``.

        Sets ``self.tokenizer`` and ``self.model``.

        Raises:
            Exception: any loading error, re-raised after being printed.
        """
        print("Entering _load_llm() method")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            # 4-bit weights with float16 compute; device placement is left to
            # `device_map="auto"`.
            load_options = {
                "device_map": "auto",
                "torch_dtype": torch.float16,
                "trust_remote_code": True,
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
                ),
            }
            self.model = AutoModelForCausalLM.from_pretrained(model_path, **load_options)
            print("LLM loaded successfully")
        except Exception as exc:
            print(f"Model loading failed: {str(exc)}")
            raise
        print("Exiting _load_llm() method")

    def _load_embedding_model(self):
        """Load the ``thenlper/gte-base`` sentence-transformer into ``self.embedding_model``.

        Raises:
            Exception: any loading error, re-raised after being printed.
        """
        print("Entering _load_embedding_model() method")
        try:
            embedder = SentenceTransformer('thenlper/gte-base')
            self.embedding_model = embedder
            print("Embedding model loaded")
        except Exception as exc:
            print(f"Embedding model loading failed: {str(exc)}")
            raise
        print("Exiting _load_embedding_model() method")

    def _process_knowledge_base(self):
        """Process documents and create FAISS index"""
        print("Entering _process_knowledge_base() method")
        try:
            self.chunks = self._chunk_documents()
            embeddings = self._create_embeddings(self.chunks)
            self._create_faiss_index(embeddings)
            print(f"Processed {len(self.chunks)} knowledge chunks")
        except Exception as e:
            print(f"Error processing knowledge base: {str(e)}")
            raise
        print("Exiting _process_knowledge_base() method")

    def _chunk_documents(self) -> List[str]:
        """Improved document chunking with text cleaning"""
        print("Entering _chunk_documents() method")
        try:
            chunks = []
            for doc_path in self._get_docx_files():
                content = self._read_docx(doc_path)
                content = self._clean_text(content)
                chunks.extend(self._token_based_chunking(content))
            return chunks
        except Exception as e:
            print(f"Error during chunking documents: {str(e)}")
            return []
        print("Exiting _chunk_documents() method")

    def _get_docx_files(self):
        """Get all DOCX files from knowledge base path"""
        print("Entering _get_docx_files() method")
        try:
            return [os.path.join(KB_PATH, f) for f in os.listdir(KB_PATH) if f.endswith('.docx')]
        except Exception as e:
            print(f"Error retrieving DOCX files: {str(e)}")
            return []
        print("Exiting _get_docx_files() method")

    def _read_docx(self, file_path: str) -> str:
        """Extract the text of a DOCX file, keeping tables next to their section.

        The document body is walked once in document order. A paragraph whose
        style name starts with ``"Heading"`` opens a new section; every other
        paragraph and every top-level table is appended to the current
        section. Table rows are rendered as tab-separated non-empty cells, one
        row per line.

        Previously the table loop ran inside the paragraph loop, so the text of
        every table was appended again after each paragraph, multiplying the
        extracted text by the number of paragraphs.

        Args:
            file_path: Path of the ``.docx`` file to read.

        Returns:
            The sections joined by blank lines, or ``""`` if the file could
            not be read (the error is printed).
        """
        print(f"Entering _read_docx() method for {file_path}")
        try:
            # Local imports: only this method needs the python-docx proxy
            # classes, and they come from the already-required `docx` package.
            from docx.table import Table
            from docx.text.paragraph import Paragraph

            doc = Document(file_path)
            sections = []       # finished sections, in document order
            current_parts = []  # pieces of the section currently being built

            def flush_section():
                section_text = "\n".join(current_parts).strip()
                if section_text:
                    sections.append(section_text)

            # Iterating the body's children (instead of doc.paragraphs and
            # doc.tables separately) preserves where each table sits.
            for child in doc.element.body.iterchildren():
                tag = child.tag if isinstance(child.tag, str) else ""

                if tag.endswith('}p'):
                    para = Paragraph(child, doc)
                    text = para.text.strip()
                    if para.style and para.style.name.startswith("Heading"):
                        flush_section()
                        current_parts = [text]
                    else:
                        current_parts.append(text)

                elif tag.endswith('}tbl'):
                    row_lines = []
                    for row in Table(child, doc).rows:
                        row_text = "\t".join(
                            cell.text.strip() for cell in row.cells if cell.text.strip()
                        )
                        if row_text:
                            row_lines.append(row_text)
                    if row_lines:
                        current_parts.append("\n".join(row_lines))

            flush_section()
            return "\n\n".join(sections).strip()

        except Exception as e:
            print(f"Error reading {file_path}: {str(e)}")
            return ""
        finally:
            # In `finally` so the exit trace is printed on every path.
            print(f"Exiting _read_docx() method for {file_path}")

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        print("Entering _clean_text() method")
        try:
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\u200b', '', text)
            return text.strip()
        except Exception as e:
            print(f"Error cleaning text: {str(e)}")
            return text
        print("Exiting _clean_text() method")

    def _token_based_chunking(self, text: str) -> List[str]:
        """Token-based chunking with overlap, ensuring chunks don't exceed max length"""
        print("Entering _token_based_chunking() method")
        try:
            # Tokenize the text into tokens
            tokens = self.tokenizer.encode(text, add_special_tokens=False)
            num_tokens = len(tokens)

            # Get the model's max token length (adjust this value based on your model)
            max_length = self.tokenizer.model_max_length

            # If tokens exceed max_length, we need to chunk them
            if num_tokens > max_length:
                print(f"Input text has {num_tokens} tokens, which exceeds model's max length of {max_length}. Chunking will occur.")

            # Initialize chunks list
            chunks = []

            # Create chunks with overlap, ensuring each chunk is smaller than max_length
            for start in range(0, num_tokens, CHUNK_SIZE - CHUNK_OVERLAP):
                end = min(start + CHUNK_SIZE, num_tokens)

                # Make sure we don't exceed max token length
                if end - start > max_length:
                    print(f"Truncating chunk: start={start}, end={end}, chunk_size={end - start} exceeds max length!")
                    end = start + max_length

                chunk_tokens = tokens[start:end]
                chunk = self.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
                chunks.append(chunk)

                # Optional: Log chunk size for debugging
                #print(f"Chunk {len(chunks)}: {len(chunk_tokens)} tokens (start: {start}, end: {end})")

            # Final number of chunks
            print(f"Total number of chunks: {len(chunks)}")

            return chunks

        except Exception as e:
            print(f"Error during token-based chunking: {str(e)}")
            return []

        finally:
            print("Exiting _token_based_chunking() method")


    def _create_embeddings(self, chunks: List[str]):
        """Create embeddings with batching and memory management."""
        print("Entering _create_embeddings() method")
        try:
            embeddings = []
            batch_size = 64

            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i+batch_size]
                emb = self.embedding_model.encode(batch, show_progress_bar=False)
                embeddings.append(emb)
                del batch
                gc.collect()

            if embeddings:
                return np.vstack(embeddings)
            else:
                print("No embeddings were generated.")
                return None
        except Exception as e:
            print(f"Error creating embeddings: {str(e)}")
            return None
        print("Exiting _create_embeddings() method")

    def _create_faiss_index(self, embeddings: np.ndarray):
        """Create optimized FAISS index"""
        print("Entering _create_faiss_index() method")
        try:
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)
            faiss.normalize_L2(embeddings)
            self.index.add(embeddings)
            print(f"FAISS index created with {self.index.ntotal} vectors")
        except Exception as e:
            print(f"Error creating FAISS index: {str(e)}")
        print("Exiting _create_faiss_index() method")

    def _find_similar_documents(self, query: str) -> str:
          """Find the most relevant chunks from the knowledge base based on query similarity"""
          print("Entering _find_similar_documents() method")
          try:
              # Encode the query to get its embedding
              query_embedding = self.embedding_model.encode([query])[0]

              # Normalize the query embedding (FAISS requires normalized vectors)
              faiss.normalize_L2(query_embedding)

              # Perform the FAISS search for the most similar documents
              distances, indices = self.index.search(np.array([query_embedding]), k=5)

              # Check if the search returned valid indices and distances
              if distances is None or indices is None:
                  print("Error: FAISS search returned None for distances or indices.")
                  return "Sorry, I couldn't find any relevant documents."

              if len(indices) == 0 or len(indices[0]) == 0:
                  print("No similar documents found.")
                  return "Sorry, no similar documents were found."

              # Log the top 5 results for debugging purposes
              print(f"Top 5 similar documents (indices): {indices[0]}")
              print(f"Top 5 distances: {distances[0]}")

              # Fetch the context for the top results (ensure indices are valid)
              context = "\n\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])

              # If no valid context is found, return a fallback message
              if not context:
                  print("Error: No valid context found for the top indices.")
                  return "Sorry, no relevant context could be retrieved."

              return context

          except Exception as e:
              # Catch any unexpected errors and log them for debugging
              print(f"Error during document retrieval: {str(e)}")
              return "Sorry, there was an error while processing your request."

          finally:
              # Clean up any unnecessary variables to free memory
              del query_embedding
              gc.collect()

          print("Exiting _find_similar_documents() method")



    def generate_response(self, query: str, context: str) -> str:
      """Generate a response by feeding the query and context into the language model"""
      print("Entering generate_response() method")
      try:
          full_input = f"Query: {query}\nContext: {context}"
          if len(full_input) > MAX_CONTEXT_LENGTH:
              full_input = full_input[:MAX_CONTEXT_LENGTH]

          # Tokenize the input
          inputs = self.tokenizer(full_input, return_tensors="pt", truncation=True, padding=True)

          # Ensure the input tensor is on the same device as the model (GPU)
          device = self.model.device  # Get model's device
          inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same device as model

          # Generate output
          outputs = self.model.generate(**inputs, max_length=1024)
          response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
          return response
      except Exception as e:
          print(f"Error generating response: {str(e)}")
          return "Sorry, I couldn't generate a response."
      print("Exiting generate_response() method")



async def startup_event():
    """Initialize the RAG system and expose port 8000 through an ngrok tunnel.

    Reads the ngrok auth token from the Colab secret ``ngrok_auth_token`` and
    prints the public tunnel once it is open. Uses the module-level
    ``rag_system`` instance.
    """
    rag_system.initialize()
    ngrok.set_auth_token(userdata.get("ngrok_auth_token"))
    public_url = ngrok.connect(8000)
    print(f"API available at: {public_url}")


@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Lifespan handler: run the start-up work before the app serves requests.

    Replaces the deprecated ``@app.on_event("startup")`` registration.
    """
    await startup_event()
    yield


app = FastAPI(lifespan=lifespan)

@app.post("/query")
async def handle_query(query: str):
    # Answer one question: retrieve matching knowledge chunks for `query`,
    # then let the LLM generate a response from them. Failures are reported
    # as {"error": ...} in the JSON body rather than raised.
    # (Kept as comments on purpose: a docstring would become the endpoint
    # description in the generated OpenAPI schema.)
    try:
        long_enough = len(query) >= 3
        if not long_enough:
            return {"error": "Query too short"}

        retrieved_context = rag_system._find_similar_documents(query)
        return {"response": rag_system.generate_response(query, retrieved_context)}

    except Exception as exc:
        return {"error": str(exc)}

def main():
    """Start uvicorn serving ``app`` on all interfaces, port 8000."""
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)

if __name__ == "__main__":
    # ----------------------------
    # Application entry point
    # ----------------------------
    # The request handlers look up `rag_system` as a module-level global, so
    # the instance has to exist before the server is started.
    rag_system = RAGSystem()
    main()




        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")
INFO:     Started server process [6256]
INFO:     Waiting for application startup.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Entering initialize() method
Entering _auth_huggingface() method
Hugging Face authentication successful.
Exiting _auth_huggingface() method
Entering _download_model() method


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Model downloaded to /content/drive/MyDrive/lifesciences/models/DeepSeek-R1-Distill-Qwen-1.5B
Entering _load_llm() method
LLM loaded successfully
Exiting _load_llm() method
Entering _load_embedding_model() method
Embedding model loaded
Exiting _load_embedding_model() method
Entering _process_knowledge_base() method
Entering _chunk_documents() method
Entering _get_docx_files() method
Entering _read_docx() method for /content/drive/MyDrive/lifesciences/training_documents/ProtonGlow_Test_URS (1).docx
Entering _clean_text() method
Entering _token_based_chunking() method


Token indices sequence length is longer than the specified maximum sequence length for this model (2487177 > 16384). Running this sequence through the model will result in indexing errors


Input text has 2487177 tokens, which exceeds model's max length of 16384. Chunking will occur.
Total number of chunks: 5384
Exiting _token_based_chunking() method
Entering _create_embeddings() method
Entering _create_faiss_index() method
FAISS index created with 5384 vectors
Exiting _create_faiss_index() method
Processed 5384 knowledge chunks
Exiting _process_knowledge_base() method
Exiting initialize() method


INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


API available at: NgrokTunnel: "https://554c-34-124-175-170.ngrok-free.app" -> "http://localhost:8000"




Entering _find_similar_documents() method
Error during document retrieval: tuple index out of range


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Entering generate_response() method
INFO:     95.223.75.30:0 - "POST /query?query=%22The%20SCADA%20system%20should%20support%20quality%20control%20%22 HTTP/1.1" 200 OK
