## Imports

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

from pathlib import Path
from collections import defaultdict
import numpy as np

In [2]:
from huggingface_hub import InferenceClient
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Shared Embedding Model

In [3]:
# Sentence-transformers checkpoint used to embed queries.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# One embeddings instance, handed to every FAISS index that gets loaded.
# NOTE(review): presumably the same model the saved indexes were built with — confirm.
# NOTE(review): the cell output hints at a warning for this call; the
# langchain_community HuggingFaceEmbeddings class is reportedly deprecated in
# favour of the langchain_huggingface package — confirm before upgrading.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(


## Domain -> Index Registry

In [4]:
# Registry mapping each supported domain name to the directory holding its
# saved FAISS index. Every index lives under ../indexes/<domain>, so the
# mapping is derived from the domain names (order is preserved).
DOMAIN_INDEX_PATHS = {
    domain: f"../indexes/{domain}"
    for domain in (
        "artificial_intelligence",
        "medical",
        "climate",
        "cyber_security",
        "business",
        "psychology",
        "automobile",
    )
}


## Load Multiple Vector DBs

In [5]:
def load_vector_dbs(domains):
    """Load the saved FAISS index for each requested domain.

    Args:
        domains: Iterable of domain names; each must be a key of
            ``DOMAIN_INDEX_PATHS``.

    Returns:
        dict mapping each domain name to the object returned by
        ``FAISS.load_local`` for that domain's index directory.

    Raises:
        KeyError: If any requested domain is not registered. All names are
            validated before the first index is loaded, so a typo fails
            immediately instead of after earlier indexes were read from disk.
    """
    # Materialise once so generators survive the validate-then-load passes.
    domains = list(domains)

    unknown = [name for name in domains if name not in DOMAIN_INDEX_PATHS]
    if unknown:
        raise KeyError(
            f"Unknown domain(s) {unknown}; "
            f"expected one of {sorted(DOMAIN_INDEX_PATHS)}"
        )

    vector_dbs = {}
    for domain in domains:
        # SECURITY: allow_dangerous_deserialization=True lets the loader
        # unpickle the stored docstore, which can execute arbitrary code.
        # Only point this at index directories you created yourself.
        vector_dbs[domain] = FAISS.load_local(
            DOMAIN_INDEX_PATHS[domain],
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    return vector_dbs

## Broad Retrieval

In [6]:
def retrieve_candidate_chunks(query, vector_dbs, k=40):
    """Run one similarity search per domain index and pool the hits.

    Args:
        query: Search text passed unchanged to every index.
        vector_dbs: Mapping of domain name -> object exposing
            ``similarity_search(query, k=...)``.
        k: Number of results requested from *each* index (not in total).

    Returns:
        A flat list of the returned documents, grouped in the iteration
        order of ``vector_dbs``. Results are not re-ranked across domains.

    Side effects:
        Each returned document is mutated in place: ``metadata['domain']``
        is set (or overwritten) with the name of the index it came from.
    """
    candidates = []

    for domain_name in vector_dbs:
        hits = vector_dbs[domain_name].similarity_search(query, k=k)
        # Tag every hit with its originating index so later stages can
        # report which domains a paper was found in.
        for hit in hits:
            hit.metadata['domain'] = domain_name
        candidates += hits

    return candidates

## Aggregate Chunks -> Papers

In [7]:
def aggregate_by_paper(docs):
    """Group retrieved chunks by the paper they were taken from.

    Args:
        docs: Iterable of documents exposing a ``metadata`` dict.

    Returns:
        A ``defaultdict(list)`` keyed by ``metadata['source']``; chunks with
        no ``source`` entry are collected under ``'unknown'``. Keys appear in
        first-seen order and each list keeps the input order of its chunks.
    """
    chunks_by_source = defaultdict(list)

    for chunk in docs:
        chunks_by_source[chunk.metadata.get('source', 'unknown')].append(chunk)

    return chunks_by_source

## Score Papers

In [8]:
def score_papers(papers, max_pages=5):
    """Rank papers by how many retrieved chunks each one contributed.

    Args:
        papers: Mapping of paper identifier -> list of documents, each
            exposing a ``metadata`` dict.
        max_pages: Maximum number of (lowest) page numbers to report per
            paper. ``None`` reports every page.

    Returns:
        A list of dicts sorted by descending ``score``; ties keep the
        iteration order of ``papers``. Each dict has:
            ``paper``   - the paper identifier,
            ``score``   - number of chunks retrieved for the paper,
            ``domains`` - sorted list of distinct ``metadata['domain']`` values,
            ``pages``   - sorted distinct ``metadata['page']`` values,
                          truncated to ``max_pages``.
    """
    ranked = []

    for paper, docs in papers.items():
        # Sorted so the output is reproducible: plain list(set) order varies
        # between runs because of string hash randomisation. A missing
        # 'domain' is skipped instead of raising, mirroring how 'page' is
        # handled below.
        domains = sorted({d.metadata.get('domain') for d in docs} - {None})
        pages = sorted({d.metadata.get('page') for d in docs} - {None})

        ranked.append({
            'paper': paper,
            'score': len(docs),
            'domains': domains,
            'pages': pages[:max_pages],
        })

    # list.sort is stable, so equally scored papers keep their input order.
    ranked.sort(key=lambda entry: entry['score'], reverse=True)
    return ranked

## LLM Explanation

In [9]:
# Hosted chat model used to write the recommendation explanations.
LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# The access token is read from the HF_TOKEN environment variable (None when
# unset) so that no credential is hard-coded in the notebook.
client = InferenceClient(model=LLM_MODEL_ID, token=os.environ.get("HF_TOKEN"))

In [10]:
def explain_recommendation(query, paper_name, snippets, max_snippets=3,
                           max_tokens=150, temperature=0.3):
    """Ask the chat model why a paper is relevant to the user's query.

    Args:
        query: The user's research question.
        paper_name: Identifier of the recommended paper, shown to the model.
        snippets: Sequence of excerpt strings taken from the paper.
        max_snippets: Only the first ``max_snippets`` excerpts are included
            in the prompt.
        max_tokens: Generation limit forwarded to ``client.chat_completion``.
        temperature: Sampling temperature forwarded to
            ``client.chat_completion``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    context = "\n".join(snippets[:max_snippets])

    # The prompt is assembled line by line rather than with an indented
    # triple-quoted f-string, which would send the source indentation to the
    # model. textwrap.dedent is not an option here: it runs after
    # interpolation, and multi-line excerpts have no common indent.
    prompt = "\n".join([
        f"User is researching: {query}",
        "",
        f"Paper: {paper_name}",
        "",
        "Relevant excerpts:",
        context,
        "",
        "Explain briefly why this paper is useful.",
    ])

    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return response.choices[0].message.content

## End-to-End Test Call

In [11]:
# Example query, restricted to a single domain index.
query = "recent research trends in artificial intelligence"
domains = ["artificial_intelligence"]

# Pipeline: load indexes -> retrieve chunks -> group by source paper -> rank.
vector_dbs = load_vector_dbs(domains)
docs = retrieve_candidate_chunks(query, vector_dbs, k=50)  # k applies per domain index
papers = aggregate_by_paper(docs)
ranked = score_papers(papers)

# Display the five highest-scoring papers.
ranked[:5]


[{'paper': 'AI computer science 5.pdf',
  'score': 22,
  'domains': ['artificial_intelligence'],
  'pages': [7, 19, 35, 43, 44]},
 {'paper': 'AI computer science 1.pdf',
  'score': 9,
  'domains': ['artificial_intelligence'],
  'pages': [0, 2, 3, 4]},
 {'paper': 'AI computer science 4.pdf',
  'score': 8,
  'domains': ['artificial_intelligence'],
  'pages': [0, 1, 2, 4, 5]},
 {'paper': '3727353.3727478.pdf',
  'score': 8,
  'domains': ['artificial_intelligence'],
  'pages': [0, 1, 2, 4]},
 {'paper': 'AI computer science 2.pdf',
  'score': 2,
  'domains': ['artificial_intelligence'],
  'pages': [25, 32]}]

In [12]:
# Take the best-ranked paper and collect the text of every chunk retrieved for it.
top_paper = ranked[0]  # raises IndexError if retrieval returned nothing
snippets = [d.page_content for d in papers[top_paper["paper"]]]

# Ask the LLM to justify the recommendation for the top paper.
explain_recommendation(query, top_paper["paper"], snippets)


'This paper is useful because it provides an overview of recent research trends in artificial intelligence, including the reintegration of robotics and machine learning, the emergence of intelligent agents, and the availability of large data sets. It highlights significant advancements and shifts in the field, offering insights into the current state of AI research.'