<a href="https://colab.research.google.com/github/rohini-sp/ai-research-assistant/blob/main/ai_research_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers langchain langchain-community chromadb faiss-cpu sentence-transformers pymupdf streamlit pydantic ragas



In [None]:
import fitz
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from google.colab import files

In [None]:
# Shared models used by the later cells:
#   embed_model - sentence-embedding model used to encode chunks and queries
#   summarizer  - Hugging Face summarization pipeline used when displaying results
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [None]:
# Step 1: Extracting text from uploaded PDFs

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the document at ``pdf_path``.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for each page, in page order,
        joined with a single newline.
    """
    # Open through a context manager so the document (and its file handle)
    # is closed even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

# Ask the user for files through the Colab upload widget.
uploaded = files.upload()

# Pair each uploaded file name with the text extracted from it: (filename, text)
documents = [(name, extract_text_from_pdf(name)) for name in uploaded]

Saving 2307.06435v10.pdf to 2307.06435v10 (2).pdf
Saving IJCET_16_01_069.pdf to IJCET_16_01_069 (2).pdf


In [None]:
# Step 2: Creating FAISS Index for semantic search

doc_texts = []  # Non-blank text chunks, in the order they are added to the index
file_mapping = []  # (filename, section_index) for the chunk at the same position in doc_texts

for file_name, text in documents:
    # Sections are delimited by blank lines; section_index is the position in this raw split.
    for i, section in enumerate(text.split('\n\n')):
        if not section.strip():
            # Blank chunks carry no content but would still be embedded and
            # could come back as search hits, so leave them out of the index.
            continue
        doc_texts.append(section)
        file_mapping.append((file_name, i))

if not doc_texts:
    # Fail with a clear message instead of an opaque error when sizing the index below.
    raise ValueError("No text could be extracted from the uploaded documents; nothing to index.")

# Convert text to embeddings and index them with exact L2 search
embeddings = embed_model.encode(doc_texts, convert_to_numpy=True)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [None]:
# Step 3: Search function with file names

def search_similar_docs(query, top_k=3):
    """Find the indexed chunks closest to ``query``, grouped by source file.

    Args:
        query: Text to embed with ``embed_model`` and look up in ``index``.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        A dict mapping each file name to the list of matching chunk texts
        from that file, in the order the index returned them. Empty when
        the index holds no vectors or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with label -1, and -1 would silently index the last
    # entry of file_mapping/doc_texts and show up as a bogus hit.
    top_k = min(top_k, index.ntotal)
    if top_k <= 0:
        return {}

    query_embedding = embed_model.encode([query])
    _, indices = index.search(query_embedding, top_k)

    results = {}
    for idx in indices[0]:
        if idx < 0:
            # Defensive: skip any padding label the index still returned.
            continue
        file_name, _section_idx = file_mapping[idx]
        results.setdefault(file_name, []).append(doc_texts[idx])

    return results

In [None]:
# Step 4: Displaying Results
def display_results(query=None, top_k=3):
    """Search the index and print a summary of each matching section per document.

    Args:
        query: Search text. When ``None`` (the default) the user is prompted
            with ``input()``, which keeps the original interactive behaviour;
            passing a string lets the cell run without a prompt.
        top_k: Number of chunks requested from ``search_similar_docs`` (default 3).

    Returns:
        None. Results are printed to stdout.
    """
    if query is None:
        query = input("Enter your search query: ")

    if not query.strip():
        # Nothing meaningful to embed; avoid returning arbitrary neighbours.
        print("No query entered; nothing to search.")
        return

    results = search_similar_docs(query, top_k=top_k)
    if not results:
        print("No matching sections found.")
        return

    for file_name, sections in results.items():
        print(f"\n📄 **Document:** {file_name}")
        # Only the first 1000 characters of each section are summarized.
        summaries = [
            summarizer(section[:1000], max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
            for section in sections
        ]

        print("🔹 **Relevant Sections:**")
        for position, summary in enumerate(summaries, start=1):
            print(f"  {position}. {summary}")

# Run search: prompts for a query on stdin, then prints the summarized matches per document
display_results()

Enter your search query: llms and their use cases

📄 **Document:** 2307.06435v10 (2).pdf
🔹 **Relevant Sections:**
  1. The organization of this paper is as follows. Section 2 discusses the background of LLMs. Section 3 focuses on LLMs overview, training pipelines and strategies, fine-tuning, andutilization in different domains. Section 4 highlights the config- grotesqueuration and parameters that play a crucial role in the function-forming of these models.
  2. Lack of insight into their operation limits their effectiveness and trustworthiness. Efforts are being made to make LLMs more explainable to promote user trust. Understanding the logicbehind LLMs’ responses is essential for fostering trust and ensuring they align with human values and legal standards.
  3. LLMs are the cognitive controllers of autonomous agents. LLMs have been incorporated in webagents, coding agents, tool agents, and conversational agents. They are capable of planning, decision-making, and performing actions to

In [None]:
# NOTE: this cell is a bare triple-quoted string, so nothing inside it executes and
# this alternative display_results is never defined. The disabled variant keeps the
# query, the retrieved chunks and one generated answer in module-level globals.
'''
user_query = ""
retrieved_chunks = []
generated_answer = ""

def display_results():
    global user_query, retrieved_chunks, generated_answer

    user_query = input("Enter your search query: ")
    results = search_similar_docs(user_query)

    retrieved_chunks = []  # combine all retrieved text chunks
    for file_name, sections in results.items():
        print(f"\n Document: {file_name}")
        for section in sections:
            retrieved_chunks.append(section)

    # Combine top chunks and generate a single answer from them
    combined_text = " ".join(retrieved_chunks[:2])
    generated_answer = summarizer(combined_text[:1000], max_length=150, min_length=50, do_sample=False)[0]["summary_text"]

    print("\n Final Answer:")
    print(generated_answer)

    print("\n Summarized Sections:")
    for i, section in enumerate(retrieved_chunks[:3]):
        summary = summarizer(section[:1000], max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
        print(f"  {i+1}. {summary}")'''


In [None]:
# NOTE: this cell is a bare triple-quoted string, so evaluate_response is never defined.
# The disabled code reads user_query, retrieved_chunks and generated_answer, which are
# only assigned inside the other disabled (string) cell, not by the active display_results.
# NOTE(review): HuggingFaceEmbeddings is imported below but embed_model is what gets
# passed as `embeddings` - confirm which one ragas expects before re-enabling.
'''# Step 5: Evaluation
def evaluate_response():
    from datasets import Dataset
    from langchain.llms import HuggingFacePipeline
    from langchain.embeddings import HuggingFaceEmbeddings
    from ragas import evaluate
    from ragas.metrics import faithfulness

    print("\nEvaluating answer using RAGAS...")

    # Hugging Face pipeline wrapped for LangChain
    hf_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_length=512)
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

     # Prepare dataset
    rag_data = [{
        "question": user_query,
        "contexts": retrieved_chunks[:2],
        "answer": generated_answer,
     }]
    dataset = Dataset.from_list(rag_data)

     # Run evaluation
    results = evaluate(dataset, metrics=[faithfulness], llm=llm,embeddings=embed_model)

    print("\n RAGAS Evaluation Results:")
    for metric, score in results.items():
        print(f"{metric}: {round(score, 3)}")'''