# RAG interface CORIA

## Imports

In [1]:
import os
import requests
import json
import faiss
import numpy as np
from nltk.tokenize import sent_tokenize
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
import re
import logging
from urllib.parse import quote_plus
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,pipeline, AutoTokenizer, AutoModel
from torch.cuda.amp import autocast
import gradio as gr
from huggingface_hub import login
import xml.etree.ElementTree as ET


## Fill in the "" with your access token

In [2]:
# Authenticate with the Hugging Face Hub. Replace the empty string with a
# personal access token before running this cell.
# NOTE(review): avoid saving or committing the notebook with a real token here.
login("")

In [3]:
# ✅ Free up GPU memory
# Ask PyTorch to release GPU memory it no longer needs before the models
# further down are loaded.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [4]:
# ✅ Force the model to run on GPU 1
# NOTE(review): CUDA_VISIBLE_DEVICES is normally only honoured when it is set
# before the process initialises CUDA. torch is already imported and
# torch.cuda has been called above, so confirm this assignment takes effect.
# When it does, the single visible GPU is addressed as "cuda:0" in-process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [5]:
# ✅ Configure Logging (Logs to logs_temp.txt)
# Root logger: DEBUG and above, appended (filemode="a") to logs_temp.txt using
# a "timestamp - level - message" layout. chat() appends to the same file
# directly, so both kinds of entries end up interleaved there.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s", filename="logs_temp.txt", filemode="a")

## Get papers

In [10]:
def retrieve_relevant_papers(query, texts, top_k=3):
    """Return the indexed texts that are closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; the returned row ids are mapped back
    onto ``texts``.

    Args:
        query: Free-text query string.
        texts: Texts in the same order in which they were added to ``index``.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list with up to ``top_k`` entries of ``texts``, nearest first.
        Empty when ``texts`` is empty.
    """
    if not texts:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(np.array(query_embedding), top_k)

    # FAISS reports missing neighbours as -1; without the lower bound that
    # value would silently select texts[-1].
    return [texts[i] for i in neighbour_ids[0] if 0 <= i < len(texts)]


In [24]:
def search_papers_semantic_scholar(query, max_results=5):
    """Search the Semantic Scholar Graph API and keep papers with an abstract.

    Args:
        query: Free-text search string.
        max_results: Value sent as the API ``limit`` parameter.

    Returns:
        A list of paper dicts (``title``, ``abstract``, ``url`` as returned
        by the API) whose abstract is non-empty. An empty list is returned
        when the request fails, the status is not 200, or the body is not
        valid JSON.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        "query": query,
        "limit": max_results,
        "fields": "title,abstract,url"
    }

    try:
        # The timeout keeps a stalled connection from blocking the caller
        # (the Gradio handler) indefinitely.
        response = requests.get(url, params=params, timeout=30)
    except requests.exceptions.RequestException as req_err:
        logging.error(f"⚠️ Request Exception: {req_err}")
        return []

    if response.status_code != 200:
        logging.error(f"❌ HTTP Error {response.status_code}: {response.reason}")
        return []

    try:
        data = response.json()
    except ValueError as json_err:
        logging.error(f"⚠️ Unexpected Error: {json_err}")
        return []

    # "data" may be absent or null when nothing matched.
    papers = data.get("data") or []
    # Filter papers that have a non-empty abstract
    return [p for p in papers if p.get("abstract")]

In [23]:
def search_papers_semantic_scholar(query, min_papers=5, max_results=20):
    """Search the Semantic Scholar API for papers matching ``query``.

    Uses the module-level ``S2_URL`` and ``HEADERS``.

    Args:
        query: Free-text search string.
        min_papers: Stop collecting once this many papers with an abstract
            have been gathered.
        max_results: Value sent as the API ``limit`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``doi`` and
        ``url``. Only entries with an abstract are included. On any request
        or parsing error the papers collected so far are returned.
    """
    params = {
        "query": query,
        "limit": max_results,
        "fields": "title,abstract,externalIds,url"
    }

    papers = []
    logging.debug(f"🔍 Searching Semantic Scholar API with query: {params['query']}")

    try:
        # The timeout keeps a stalled connection from blocking the caller.
        response = requests.get(S2_URL, params=params, headers=HEADERS, timeout=30)
        logging.debug(f"📝 HTTP Response Code: {response.status_code}")

        if response.status_code != 200:
            logging.error(f"❌ HTTP Error {response.status_code}: {response.reason}")
            return papers

        data = response.json()
        for entry in data.get("data") or []:
            # Fields can be present but null, so fall back with `or` instead
            # of relying on the dict.get default.
            title = entry.get("title") or "No title available"
            abstract = entry.get("abstract") or "No abstract available"
            external_ids = entry.get("externalIds") or {}
            doi = external_ids.get("DOI") or "No DOI available"
            url = entry.get("url") or "No URL available"

            if abstract != "No abstract available":
                papers.append({
                    "title": title,
                    "abstract": abstract,
                    "doi": doi,
                    "url": url
                })

            if len(papers) >= min_papers:
                break

        logging.debug(f"✅ Found {len(papers)} papers with abstracts.")
        return papers

    except requests.exceptions.RequestException as req_err:
        logging.error(f"⚠️ Request Exception: {req_err}")
        return papers
    except Exception as e:
        logging.error(f"⚠️ Unexpected Error: {e}", exc_info=True)
        return papers


In [7]:
def search_papers(query, min_papers=5, max_results=20, max_attempts=5):
    """Fetch papers from the Scopus API, aiming for ``min_papers`` with abstracts.

    Uses the module-level ``SCOPUS_URL`` and ``HEADERS``. Entries with a DOI
    are enriched through ``get_full_paper_details_from_crossref``.

    Args:
        query: Free-text search string. It is passed to ``requests`` as-is;
            ``requests`` URL-encodes query parameters itself, so encoding it
            here as well would double-encode it.
        min_papers: Number of papers with abstracts to aim for.
        max_results: Initial ``count`` sent to the API; raised by 10 on each
            retry.
        max_attempts: Upper bound on the number of requests, so a query with
            too few usable results cannot loop forever.

    Returns:
        A list of dicts with the keys ``scopus_id``, ``title``, ``abstract``
        and ``doi``. May contain fewer than ``min_papers`` entries.
    """
    papers = []
    seen_entries = set()  # entries already handled in an earlier pass
    attempts = 0

    while len(papers) < min_papers and attempts < max_attempts:
        attempts += 1
        params = {"query": query, "count": max_results, "start": 0}
        logging.debug(f"🔍 Searching Scopus API with params: {params}")

        try:
            response = requests.get(SCOPUS_URL, params=params, headers=HEADERS, timeout=30)
            logging.debug(f"📝 HTTP Response Code: {response.status_code}")

            if response.status_code != 200:
                logging.error(f"❌ HTTP Error {response.status_code}: {response.reason}")
                break  # Stop retrying on API failure

            response_data = response.json()
            raw_papers = response_data.get("search-results", {}).get("entry", [])

            for entry in raw_papers:
                # NOTE(review): the "identifier"/"scopus_id" lookup is kept
                # as found; confirm it matches the real response schema.
                scopus_id = entry.get("identifier", {}).get("scopus_id", "Unknown")
                title = entry.get("dc:title", "No title available")
                # Chain the fallbacks with `or`; a truthy default on the first
                # lookup would make the second lookup unreachable.
                abstract = (
                    entry.get("dc:description")
                    or entry.get("prism:description")
                    or "No abstract available"
                )
                doi = entry.get("prism:doi", "No DOI available")

                # Every pass starts at offset 0, so earlier entries come back
                # again; skip them to avoid duplicates and repeat lookups.
                entry_key = (str(scopus_id), str(doi), str(title))
                if entry_key in seen_entries:
                    continue
                seen_entries.add(entry_key)

                # ✅ Fetch full metadata if DOI exists
                if doi != "No DOI available":
                    logging.debug(f"🔗 Fetching additional metadata for DOI: {doi}")
                    full_paper_details = get_full_paper_details_from_crossref(doi)
                    if full_paper_details:
                        # Only take CrossRef values that carry real content,
                        # so its placeholders never replace Scopus data.
                        crossref_abstract = full_paper_details.get('abstract')
                        if crossref_abstract and crossref_abstract != "No abstract available":
                            abstract = crossref_abstract
                        crossref_title = full_paper_details.get('title')
                        if crossref_title and crossref_title != "No title available":
                            title = crossref_title

                # ✅ Only add papers with abstracts
                if abstract and abstract != "No abstract available":
                    papers.append({"scopus_id": scopus_id, "title": title, "abstract": abstract, "doi": doi})
                    if len(papers) >= min_papers:
                        break  # Stop early if we get enough papers

            logging.debug(f"✅ Found {len(papers)} papers with abstracts.")
            if len(papers) >= min_papers:
                return papers  # ✅ Return once we have enough papers

            if len(raw_papers) < max_results:
                # The API returned fewer entries than requested from the same
                # offset, so asking for more cannot produce anything new.
                break

        except requests.exceptions.RequestException as req_err:
            logging.error(f"⚠️ Request Exception: {req_err}")
            break  # Stop retrying if there's a network issue
        except Exception as e:
            logging.error(f"⚠️ Unexpected Error: {e}", exc_info=True)
            break

        # ✅ If not enough papers, increase max_results and retry
        max_results += 10
        logging.warning(f"⚠️ Not enough papers found. Increasing max_results to {max_results} and retrying...")

    return papers  # Return whatever we found (even if < min_papers)


In [6]:
def fetch_abstract(scopus_id):
    """Fetch the abstract for a Scopus ID from the abstracts API.

    The request URL is built from the module-level ``ABSTRACT_URL`` template
    and sent with the module-level ``HEADERS``.

    Args:
        scopus_id: Identifier substituted into ``ABSTRACT_URL``.

    Returns:
        The abstract text, or ``"No abstract available"`` when the request
        fails, the status is not 200, the payload cannot be parsed, or no
        abstract is present.
    """
    abstract_api_url = ABSTRACT_URL.format(scopus_id)
    logging.debug(f"🔗 Fetching abstract from ABSTRACT_URL: {abstract_api_url}")

    try:
        # The timeout keeps a stalled connection from blocking the caller.
        response = requests.get(abstract_api_url, headers=HEADERS, timeout=30)
        logging.debug(f"📝 Abstract API HTTP Response Code: {response.status_code}")

        if response.status_code != 200:
            logging.error(f"❌ HTTP Error fetching abstract: {response.status_code}: {response.reason}")
            return "No abstract available"

        content_type = response.headers.get("Content-Type", "")
        if "xml" in content_type.lower():
            root = ET.fromstring(response.text)
            # A prefixed path needs a prefix -> URI map; without one
            # ElementTree rejects the expression and this branch could never
            # return an abstract.
            # NOTE(review): assumes the standard Dublin Core element
            # namespace; confirm against a real response.
            abstract = root.findtext(
                ".//dc:description",
                default="No abstract available",
                namespaces={"dc": "http://purl.org/dc/elements/1.1/"},
            )
        else:
            data = response.json()
            abstract = data.get("abstract", "No abstract available")
        # An empty element or a null JSON field also means "no abstract".
        return abstract or "No abstract available"
    except Exception as e:
        logging.error(f"⚠️ Exception when fetching abstract: {e}")
        return "No abstract available"



In [9]:
def get_full_paper_details_from_crossref(doi):
    """Fetch a paper's title and abstract from CrossRef by DOI.

    The request URL is the module-level ``CROSSREF_URL`` followed by ``doi``.

    Args:
        doi: DOI string appended to ``CROSSREF_URL``.

    Returns:
        ``{"title": ..., "abstract": ...}`` on success, where missing values
        are the placeholders ``"No title available"`` and
        ``"No abstract available"``. ``None`` when the request fails or the
        status is not 200.
    """
    url = f"{CROSSREF_URL}{doi}"
    logging.debug(f"📡 Querying CrossRef for DOI: {doi}")

    try:
        # The timeout keeps a stalled connection from blocking the caller.
        response = requests.get(url, timeout=30)
        logging.debug(f"📝 CrossRef HTTP Response Code: {response.status_code}")

        if response.status_code != 200:
            logging.warning(f"⚠️ CrossRef failed for DOI {doi}: HTTP {response.status_code}")
            return None

        paper_details = response.json().get("message", {})

        # "title" is indexed as a list; an empty or null value must fall back
        # to the placeholder instead of raising and discarding the abstract.
        titles = paper_details.get("title") or ["No title available"]
        title = titles[0]
        abstract = clean_abstract(paper_details.get("abstract", "No abstract available"))
        logging.debug(f"✅ CrossRef Data Retrieved: {title[:50]}...")
        return {"title": title, "abstract": abstract}

    except requests.exceptions.RequestException as req_err:
        logging.error(f"⚠️ Request Exception: {req_err}")
    except Exception as e:
        logging.error(f"⚠️ Unexpected Error fetching CrossRef data: {e}", exc_info=True)

    return None


In [8]:
def clean_abstract(abstract):
    """Strip XML/HTML markup from an abstract and normalise whitespace.

    Args:
        abstract: Raw abstract text, possibly containing tags such as
            ``<jats:p>``; may be ``None`` or empty.

    Returns:
        The abstract with every tag replaced by a space and runs of
        whitespace collapsed, or ``"No abstract available"`` when the input
        is empty, is that placeholder already, or contains only markup.
    """
    if not abstract or abstract.strip().lower() == "no abstract available":
        return "No abstract available"

    # ✅ Replace tags with spaces instead of just removing them.
    # A tag must start with a letter (optionally after "/") or with "!"/"?"
    # and may not contain another angle bracket, so comparisons such as
    # "p < 0.05 and q > 0.1" are kept instead of being deleted as a "tag".
    clean_text = re.sub(r"<(?:/?[A-Za-z]|[!?])[^<>]*>", " ", abstract)
    clean_text = re.sub(r"\s+", " ", clean_text).strip()  # Normalize spaces

    return clean_text if clean_text else "No abstract available"

In [11]:


def index_papers(papers):
    """Embed paper abstracts and add them to the module-level FAISS ``index``.

    Args:
        papers: Iterable of dicts that each have an ``"abstract"`` key.

    Returns:
        The list of abstracts, in the order they were added to the index.
        When ``papers`` is empty nothing is embedded or added and an empty
        list is returned.
    """
    texts = [paper["abstract"] for paper in papers]
    if not texts:
        # Nothing to embed; avoid handing FAISS an array with no usable shape.
        return texts

    embeddings = model.encode(texts, convert_to_numpy=True, batch_size=2)

    # The index is mutated in place, so no `global` declaration is required.
    index.add(np.asarray(embeddings, dtype=np.float32))
    return texts


## LLM functions

In [None]:
# ✅ Load SentenceTransformer on GPU 1
# Use the second visible GPU when there is one. If only one GPU is visible
# (for example because CUDA_VISIBLE_DEVICES narrowed the list) it is addressed
# as "cuda:0", and "cuda:1" would be an invalid device. Fall back to the CPU
# when CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# ✅ Initialize FAISS index
# Flat L2 index over 384-dimensional vectors.
# NOTE(review): 384 must equal the embedding size produced by `model`.
index = faiss.IndexFlatL2(384)

In [12]:
# Checkpoint used for answer generation.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# ✅ Quantization settings: 4-bit NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# ✅ Quantized model; device placement is delegated to device_map="auto".
llama_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# ✅ Matching tokenizer for the same checkpoint.
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def generate_llm_response(query, context="", max_new_tokens=512):
    """Generate an answer to ``query`` grounded in retrieved paper context.

    Uses the module-level ``llama_model`` and ``llama_tokenizer``.

    Args:
        query: The user's question.
        context: Text of the retrieved papers placed in the system section of
            the prompt.
        max_new_tokens: Maximum number of tokens to generate. The prompt is
            limited so that prompt plus generation fit the model's context
            window.

    Returns:
        The decoded model output (prompt followed by the answer) with special
        tokens removed.
    """
    def build_prompt(paper_context):
        formatted_context = f"Here are some research papers retrieved on this topic:\n{paper_context}\nUsing information from them, answer:\n"
        return f"[INST] <<SYS>>{formatted_context}<</SYS>> {query} [/INST]"

    # NOTE(review): 4096 is only a fallback for configs that do not expose
    # max_position_embeddings; confirm it suits the loaded checkpoint.
    context_window = getattr(llama_model.config, "max_position_embeddings", 4096)
    prompt_budget = max(1, context_window - max_new_tokens)

    prompt = build_prompt(context)
    overflow = len(llama_tokenizer(prompt)["input_ids"]) - prompt_budget
    if overflow > 0 and context:
        # Shorten the paper context rather than the prompt tail: truncating
        # the whole prompt from the right would cut off the question and the
        # closing "[/INST]" marker that callers split on.
        context_ids = llama_tokenizer(context, add_special_tokens=False)["input_ids"]
        kept_ids = context_ids[: max(0, len(context_ids) - overflow)]
        prompt = build_prompt(llama_tokenizer.decode(kept_ids, skip_special_tokens=True))

    # truncation stays on as a safety net because re-tokenising the shortened
    # context is not guaranteed to hit the budget exactly. Inputs follow the
    # model's own device since device_map="auto" decides where it is placed.
    inputs = llama_tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=prompt_budget
    ).to(llama_model.device)

    with autocast():
        output = llama_model.generate(**inputs, max_new_tokens=max_new_tokens)

    return llama_tokenizer.decode(output[0], skip_special_tokens=True)



In [14]:
# Encoder used by extract_important_sentences. No device argument is given
# here, so device selection is left to the library.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_important_sentences(text, query, top_n=3):
    """Pick the ``top_n`` sentences of ``text`` most similar to ``query``.

    ``text`` is cut at every '.' and blank fragments are dropped. The
    remaining fragments and the query are embedded with ``sentence_model``
    and ranked by cosine similarity, best first.

    Returns:
        A list of at most ``top_n`` fragments, or an empty list when ``text``
        yields no non-blank fragment.
    """
    candidates = []
    for fragment in text.split('.'):
        trimmed = fragment.strip()
        if trimmed:
            candidates.append(trimmed)

    if not candidates:
        return []

    # Embed the candidate fragments, then the query.
    candidate_vectors = sentence_model.encode(candidates, convert_to_numpy=True)
    query_vector = sentence_model.encode([query], convert_to_numpy=True)

    # One similarity score per candidate.
    scores = cosine_similarity(query_vector, candidate_vectors)[0]

    # Stable descending sort of the candidate positions by score.
    ranking = sorted(range(len(candidates)), key=lambda pos: scores[pos], reverse=True)

    return [candidates[pos] for pos in ranking[:top_n]]

In [15]:
def find_closest_sentence_for_point(point, abstracts):
    """Find the abstract sentence closest to ``point``.

    Every abstract is split into sentences with ``sent_tokenize``. ``point``
    and the sentences are embedded with the module-level ``model`` and the
    sentence with the smallest L2 distance to ``point`` is returned.

    Args:
        point: Sentence to match.
        abstracts: Iterable of abstract strings.

    Returns:
        The closest sentence, or an empty string when the abstracts contain
        no sentence at all.
    """
    all_sentences = [
        sentence for abstract in abstracts for sentence in sent_tokenize(abstract)
    ]
    if not all_sentences:
        return ""

    point_embedding = np.asarray(model.encode([point], convert_to_numpy=True))
    sentence_embeddings = np.asarray(model.encode(all_sentences, convert_to_numpy=True))

    # Compare against the sentence embeddings themselves. The global FAISS
    # index holds whole abstracts, so its row ids do not correspond to
    # positions in all_sentences.
    distances = np.linalg.norm(sentence_embeddings - point_embedding, axis=1)
    return all_sentences[int(np.argmin(distances))]



In [16]:
# Adjusted function to find the closest sentence from the abstracts
_PARAPHRASE_MODEL = None


def _get_paraphrase_model():
    """Load the paraphrase encoder on first use and reuse it afterwards."""
    global _PARAPHRASE_MODEL
    if _PARAPHRASE_MODEL is None:
        _PARAPHRASE_MODEL = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    return _PARAPHRASE_MODEL


def find_closest_sentence_for_point(important_sentence, abstracts):
    """Find the abstract sentence most similar to ``important_sentence``.

    Abstracts are cut at every '.', blank fragments are dropped, and the
    fragment with the highest cosine similarity to ``important_sentence`` is
    returned.

    Args:
        important_sentence: Sentence to match.
        abstracts: Iterable of abstract strings.

    Returns:
        The best-matching fragment, or an empty string when the abstracts
        contain no non-blank fragment.
    """
    # Flatten all abstracts into one list of non-empty sentences.
    all_sentences = [
        fragment.strip()
        for abstract in abstracts
        for fragment in abstract.split('.')
        if fragment.strip()
    ]
    if not all_sentences:
        # Nothing to compare against; skip loading and encoding entirely.
        return ""

    # Reuse one encoder instance: constructing it on every call would reload
    # the model for each sentence being matched.
    encoder = _get_paraphrase_model()
    important_embedding = encoder.encode([important_sentence])
    all_embeddings = encoder.encode(all_sentences)

    # Shape (1, len(all_sentences)); argmax over the flattened array is the
    # position of the most similar sentence.
    similarities = cosine_similarity(important_embedding, all_embeddings)
    return all_sentences[int(similarities.argmax())]

## Chat

In [17]:
def generate_answer(query, paper_context):
    """Return only the answer portion of the LLM output for ``query``.

    The raw output of ``generate_llm_response`` echoes the prompt; everything
    up to and including the last "[/INST]" marker is discarded and the rest
    is stripped of surrounding whitespace. Without a marker the whole output
    is returned, stripped.
    """
    raw_output = generate_llm_response(query, paper_context)
    _, _, answer = raw_output.rpartition("[/INST]")
    return answer.strip()


In [35]:

# Load a Natural Language Inference (NLI) model
# NOTE(review): nothing else in this file appears to reference nli_model;
# check_agreement derives its labels from an embedding-similarity threshold
# instead. Confirm whether this pipeline is still needed.
nli_model = pipeline("text-classification", model="facebook/bart-large-mnli")



Device set to use cuda:0


In [20]:
def chat(query):
    """Answer ``query`` from retrieved papers and report supporting sentences.

    Steps: fetch papers with ``get_sources``, generate an answer with
    ``generate_answer``, match the answer's key sentences against the
    abstracts with ``extract_important_sentences_and_matches``, append a
    record to ``logs_temp.txt`` and return the combined text.

    Args:
        query: The user's question.

    Returns:
        The answer followed by the paper links and the sentence-match table,
        or the message from ``get_sources`` when no papers were found.
    """
    # Step 1: Get sources (papers)
    paper_context, paper_links, papers = get_sources(query)

    # get_sources signals "nothing found" with an empty paper list. Checking
    # the list, rather than one specific message string, keeps this guard
    # working whatever wording get_sources uses.
    if not papers:
        return paper_context

    # Step 2: Generate the AI response
    response_after_inst = generate_answer(query, paper_context)

    # Step 3: Extract important sentences and find their closest matches
    df = extract_important_sentences_and_matches(response_after_inst, papers, query)
    table_text = df.to_string(index=False)  # rendered once, used twice below

    # Log the query, response, and related data to logs_temp.txt
    with open("logs_temp.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"Query: {query}\n")
        log_file.write(f"Response:\n{response_after_inst}\n")
        log_file.write(f"Relevant Papers:\n{paper_links}\n")
        log_file.write(f"Top Important Sentences and Closest Matches:\n{table_text}\n\n")

    # Step 4: Prepare the output
    full_output = f"{response_after_inst}\n\nRelevant Papers:\n{paper_links}\n"
    full_output += "\nTop Important Sentences and Closest Matches:\n"
    full_output += table_text

    return full_output


In [22]:
# Semantic Scholar Graph API paper-search endpoint.
S2_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
# Default request headers shared by the API helpers in this file.
HEADERS = {"Accept": "application/json"}

In [25]:
# Load model for sentence similarity
# NOTE(review): this rebinds the global name `model`, which is also assigned a
# SentenceTransformer elsewhere in this file. retrieve_relevant_papers and
# index_papers call model.encode(...), so once this cell has run they receive
# this AutoModel instead. Confirm the intended binding or use a distinct name.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")



In [26]:
def compute_similarity(sentence1, sentence2):
    """Return the cosine similarity between two sentence embeddings.

    Both sentences are tokenised together with the module-level ``tokenizer``
    and encoded with the module-level ``model``. Each sentence embedding is
    the mean of its token embeddings.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The cosine similarity as a Python float.
    """
    inputs = tokenizer([sentence1, sentence2], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state

    # The shorter sentence is padded to the length of the longer one. Weight
    # tokens by the attention mask so padding positions do not contribute to
    # the mean and distort the similarity.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    return torch.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()



In [27]:
def check_agreement(sentence1, sentence2, threshold=0.7):
    """Label a sentence pair from its embedding similarity.

    Returns "ENTAILMENT" when ``compute_similarity`` is at least
    ``threshold`` and "CONTRADICTION" otherwise.
    """
    similarity = compute_similarity(sentence1, sentence2)
    if similarity >= threshold:
        return "ENTAILMENT"
    return "CONTRADICTION"

In [28]:
def extract_important_sentences_and_matches(response, papers, query):
    """Tabulate the response's key sentences with their closest abstract match.

    The three sentences of ``response`` most relevant to ``query`` are each
    paired with their closest sentence from the papers' abstracts and with
    the label ``check_agreement`` gives that pair.

    Returns:
        A DataFrame with the columns "Important Sentence", "Closest Match"
        and "Agreement".
    """
    key_sentences = extract_important_sentences(response, query, top_n=3)
    abstract_texts = [paper['abstract'] for paper in papers]

    closest_matches = []
    for sentence in key_sentences:
        closest_matches.append(find_closest_sentence_for_point(sentence, abstract_texts))

    agreement_labels = []
    for sentence, match in zip(key_sentences, closest_matches):
        agreement_labels.append(check_agreement(sentence, match))

    columns = {
        "Important Sentence": key_sentences,
        "Closest Match": closest_matches,
        "Agreement": agreement_labels,
    }
    return pd.DataFrame(columns)



In [29]:
def get_sources(query):
    """Fetch papers for ``query`` and format them for prompting and display.

    Returns:
        A ``(context, links, papers)`` tuple: a text block of titles and
        abstracts, a text block of titles with markdown links, and the raw
        paper list. When no papers are found the tuple is
        ``("No papers found.", "", [])``.
    """
    found = search_papers_semantic_scholar(query, max_results=5)
    if not found:
        return "No papers found.", "", []

    context_parts = []
    for paper in found:
        context_parts.append(f"\U0001F4C4 **{paper['title']}**\n{paper['abstract']}\n")

    link_parts = []
    for paper in found:
        link_parts.append(f"\U0001F4C4 **{paper['title']}**\n[Link]({paper['url']})")

    return "\n".join(context_parts), "\n".join(link_parts), found

In [30]:
def process_and_update_chat(user_input, chatbot, highlight_types, keep_sources):
    """Answer ``user_input`` and update the chat history and sources panel.

    Args:
        user_input: The user's message.
        chatbot: Chat history as a list of ``(user, bot)`` tuples; the new
            exchange is appended to it.
        highlight_types: Selected highlight options ("Confirmed" and/or
            "Potential misinformation").
        keep_sources: Whether the sources panel should stay visible.

    Returns:
        A ``(chatbot, source_html, visibility_update)`` tuple: the updated
        history, the HTML listing the source papers (empty when hidden), and
        a ``gr.update`` that sets the panel's visibility.
    """
    from html import escape

    context, _, papers = get_sources(user_input)
    if not papers:
        chatbot.append((user_input.strip(), "No papers found."))
        return chatbot, "", gr.update(visible=False)

    response = generate_answer(user_input, context)
    df = extract_important_sentences_and_matches(response, papers, user_input)
    highlighted = response

    for _, row in df.iterrows():
        agreement = row["Agreement"]
        color = ""
        if "Confirmed" in highlight_types and agreement == "ENTAILMENT":
            color = "#d4f7d4"
        if "Potential misinformation" in highlight_types and agreement == "CONTRADICTION":
            color = "#f7d4d4"
        if not color:
            continue

        sentence = row["Important Sentence"]
        # The matched text comes from external abstracts; escape it so quotes
        # or angle brackets cannot break out of the title attribute.
        tooltip = escape(f"Closest match: {row['Closest Match']}", quote=True)
        highlighted = highlighted.replace(
            sentence,
            f'<span title="{tooltip}" style="background-color: {color};">{sentence}</span>'
        )

    chatbot.append((user_input.strip(), highlighted.strip()))
    show = "Confirmed" in highlight_types or keep_sources
    if show:
        # Titles and URLs come from the search API; escape them before they
        # are placed into markup and into the href attribute.
        source_html = "".join(
            f'<div><strong>{escape(str(p["title"]))}</strong>'
            f'<a href="{escape(str(p["url"]), quote=True)}" target="_blank">Link</a></div>'
            for p in papers
        )
    else:
        source_html = ""
    return chatbot, source_html, gr.update(visible=show)



In [31]:
def update_keep_sources_visibility(highlight_types):
    """Make the target component visible only when no highlight type is selected."""
    nothing_selected = len(highlight_types) == 0
    return gr.update(visible=nothing_selected)

In [32]:
def update_sources_visibility(highlight_types, keep_sources):
    """Show the target component when "Confirmed" is selected or ``keep_sources`` is set."""
    should_show = "Confirmed" in highlight_types or keep_sources
    return gr.update(visible=should_show)

## Gradio interface

In [33]:
# Build the Gradio UI: a chat window, a sources panel, highlight options and
# a message box with a send button, then wire the event handlers.
with gr.Blocks() as demo:
    with gr.Group(visible=True, elem_id="all"):
        gr.Markdown("""# AI Research Interface""", elem_id="desc")
        with gr.Row():
            # Invisible textbox; it is not wired to any event in this block.
            gr.Textbox(label="Hidden Input", visible=False)
        with gr.Column(elem_id="chatbot-container"):
            chatbot = gr.Chatbot(height=300)
            # NOTE(review): reuses elem_id="desc" from the title Markdown
            # above, so the "#desc" CSS rule applies to both components.
            sources_output = gr.HTML(visible=True, elem_id="desc")
            highlight_types = gr.CheckboxGroup([
                "Confirmed", "Potential misinformation"
            ], label="Highlight Types", value=["Confirmed"])
            # Starts hidden; update_keep_sources_visibility shows it once no
            # highlight type is selected.
            keep_sources = gr.Checkbox(label="Keep Sources Visible", value=True, visible=False)
            with gr.Row(elem_id="chatbot-controls"):
                user_input = gr.Textbox(elem_id="chatbot-user-input", label="Your Message", lines=1)
                submit_button = gr.Button("Send", elem_id="submit-button")

    # Changing the highlight selection toggles both the "keep sources"
    # checkbox and the sources panel; the checkbox toggles the panel too.
    highlight_types.change(update_keep_sources_visibility, [highlight_types], [keep_sources])
    highlight_types.change(update_sources_visibility, [highlight_types, keep_sources], [sources_output])
    keep_sources.change(update_sources_visibility, [highlight_types, keep_sources], [sources_output])
    # NOTE(review): sources_output is listed twice in the outputs so that it
    # receives both the HTML string and the visibility update returned by
    # process_and_update_chat.
    submit_button.click(process_and_update_chat, [user_input, chatbot, highlight_types, keep_sources], [chatbot, sources_output, sources_output])

    # NOTE(review): the stylesheet is assigned after the components are
    # created rather than passed to gr.Blocks(css=...); confirm the installed
    # Gradio version picks it up.
    demo.css = """
#chatbot-container .message:nth-child(odd) .user,
#chatbot-container .message:nth-child(even) .bot {
    display: none;
}
button {
    align-self: center;
}
#all {
    border: 5px solid #333;
    border-radius: 10px;
    padding: 20px;
    background-color: white;
    margin: 20px;
}
textarea {
    background-color: #e4e4e7;
}
#desc {
    background-color: white;
    padding: 10px;
    font-size: 24px;
    border-bottom: 5px solid #333;
}
"""
# Serve the app on all network interfaces, port 7878.
demo.launch(server_name="0.0.0.0", server_port=7878)


  chatbot = gr.Chatbot(height=300)


* Running on local URL:  http://0.0.0.0:7878

To create a public link, set `share=True` in `launch()`.


