In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import chromadb
from langchain_community.document_loaders import TextLoader
import os
import torch


## Step 1 : Setting up the knowledge base

In [6]:
# initialise chromaDB
client= chromadb.PersistentClient(path='./research_db')
collection = client.get_or_create_collection(name='ml_publication',
                                             metadata={'hnsw:space':'cosine'})

##Step 2: Loading the Publications 📖

In [10]:
def load_research_publications(documents_path):
    """Load research publications from .txt files and return as list of strings"""


    documents = []

    # Load each .txt file in the documents folder
    for file in os.listdir(documents_path):
        if file.endswith(".txt"):
            file_path = os.path.join(documents_path, file)
            try:
                loader = TextLoader(file_path)
                loaded_docs = loader.load()
                documents.extend(loaded_docs)
                print(f"Successfully loaded: {file}")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")

    print(f"\nTotal documents loaded: {len(documents)}")

    # Extract content as strings and return
    publications = []
    for doc in documents:
        publications.append(doc.page_content)

    return publications

##Step 3: Chunking publications

In [11]:
def chunk_research_paper(paper_content, title):
    """Break a research paper into searchable chunks"""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,          # ~200 words per chunk
        chunk_overlap=200,        # Overlap to preserve context
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    chunks = text_splitter.split_text(paper_content)

    # Add metadata to each chunk
    chunk_data = []
    for i, chunk in enumerate(chunks):
        chunk_data.append({
            "content": chunk,
            "title": title,
            "chunk_id": f"{title}_{i}",
        })

    return chunk_data

#Step 4 : Creating Embeddings

In [12]:

def embed_documents(documents: list[str]) -> list[list[float]]:
    """
    Embed documents using a model.
    """
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available() else "cpu"
    )
    model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )

    embeddings = model.embed_documents(documents)
    return embeddings

## Step 5: Storing in Vector Database 💾

In [13]:
def insert_publications(collection:chromadb.Collection, publications : list[str]):
  """
  Insert documents into a ChromaDB collection.

    Args:
        collection (chromadb.Collection): The collection to insert documents into
        publications (list[str]): The documents to insert

    Returns:
        None
  """
  next_id = collection.count()
  for publication in publications:
    chunked_publication = chunk_research_paper(publication)
    embeddings = embed_documents(chunked_publication)
    ids = list(range(next_id, next_id + len(chunked_publication)))
    ids = [f"document_{id}" for id in ids]
    collection.add(
        embeddings=embeddings,
        ids=ids,
        documents=chunked_publication
    )

##Step 6: Intelligent Retrieval 🎯

In [17]:
def search_research_db(query, collection, embedding_model, top_k=5):
   # Convert question to vector using the embedding model
    query_vector = embedding_model.embed_query(query)

    # Search for similar content
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Format results
    relevant_chunks = []
    for i, doc in enumerate(results["documents"][0]):
        relevant_chunks.append({
            "content": doc,
            "title": results["metadatas"][0][i]["title"],
            "similarity": 1 - results["distances"][0][i]  # Convert distance to similarity
        })

    return relevant_chunks

## Step 7: Generating Research-Backed Answers 🤖

In [19]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from google.colab import userdata
import os
from langchain_huggingface import HuggingFaceEmbeddings
import torch

def answer_research_question(query, collection, embedding_model, llm):
    """Generate an answer based on retrieved research"""

    # Get relevant research chunks
    relevant_chunks = search_research_db(query, collection, embedding_model, top_k=3)

    # Build context from research
    context = "\n\n".join([
        f"From {chunk['title']}:\n{chunk['content']}"
        for chunk in relevant_chunks
    ])

    # Create research-focused prompt
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""
Based on the following research findings, answer the researcher's question:

Research Context:
{context}

Researcher's Question: {question}

Answer: Provide a comprehensive answer based on the research findings above.
"""
    )

    # Generate answer
    prompt = prompt_template.format(context=context, question=query)
    response = llm.invoke(prompt)
    return response.content, relevant_chunks

# Retrieve the API key from Colab's Secrets Manager
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Set the environment variable
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# Initialize LLM
llm = ChatGroq(model="llama3-8b-8192")

# Initialize embedding model
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

# Load publications (assuming this function is defined elsewhere and works)
publications = load_research_publications('./research_db') # This line might need adjustment based on your actual data loading

# Call the answer_research_question function
#Note: You need to make sure 'collection' is initialized and populated with documents
#and 'publications' is loaded before running this.
answer, sources = answer_research_question(
    "What are effective techniques for handling class imbalance?",
    collection,
    embedding_model, # Pass the embedding model object
    llm
)

print("AI Answer:", answer)
print("\nBased on sources:")
for source in sources:
    print(f"- {source['title']}")


Total documents loaded: 0
AI Answer: I apologize, but there is no research finding provided above. Therefore, I cannot answer the researcher's question.

Based on sources:
