In [9]:
import requests
import os

def download_arxiv_paper(arxiv_id, download_dir="papers"):
    """
    Downloads a single paper from arXiv in PDF format.
    """
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    os.makedirs(download_dir, exist_ok=True)
    pdf_path = os.path.join(download_dir, f"{arxiv_id}.pdf")
    
    response = requests.get(url)
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    
    return pdf_path

# Example usage
papers_to_download = ["2103.00020", "2106.10336"]  # Example arXiv IDs
for pid in papers_to_download:
    downloaded_pdf_path = download_arxiv_paper(pid)
    print(f"Downloaded: {downloaded_pdf_path}")


Downloaded: papers\2103.00020.pdf
Downloaded: papers\2106.10336.pdf


In [10]:
import PyPDF2

def pdf_to_text(pdf_path):
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# Example usage
pdf_texts = {}
for pid in papers_to_download:
    pdf_path = os.path.join("papers", f"{pid}.pdf")
    text = pdf_to_text(pdf_path)
    pdf_texts[pid] = text


In [11]:
import re

def clean_text(text):
    # Basic cleaning: remove extra whitespace, newlines, etc.
    # Customize cleaning steps as needed.
    cleaned = re.sub(r"\s+", " ", text).strip()
    return cleaned

def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += (chunk_size - overlap)  # Overlap for context continuity
    return chunks

all_chunks = []
for pid, text in pdf_texts.items():
    c_text = clean_text(text)
    chunks = chunk_text(c_text)
    for idx, chunk in enumerate(chunks):
        all_chunks.append({
            "paper_id": pid,
            "chunk_index": idx,
            "chunk_text": chunk
        })

print(f"Total chunks: {len(all_chunks)}")


Total chunks: 95


In [12]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Example embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Convert chunk texts to embeddings
chunk_texts = [chunk["chunk_text"] for chunk in all_chunks]
embeddings = model.encode(chunk_texts, show_progress_bar=True)

# Convert to float32 for FAISS
embeddings = np.array(embeddings, dtype='float32')

# Build a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance index
index.add(embeddings)
print(f"FAISS index size: {index.ntotal}")

# Keep track of metadata in the same order
metadata_store = [ (all_chunks[i]["paper_id"], all_chunks[i]["chunk_index"]) 
                   for i in range(len(all_chunks)) ]

Batches: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]

FAISS index size: 95





In [13]:
def get_top_k_chunks(query, k=3):
    query_embedding = model.encode([query], show_progress_bar=False)
    query_embedding = np.array(query_embedding, dtype='float32')

    # Search FAISS index
    distances, indices = index.search(query_embedding, k)
    
    retrieved_chunks = []
    for dist, idx in zip(distances[0], indices[0]):
        paper_id, chunk_index = metadata_store[idx]
        chunk_text = all_chunks[idx]["chunk_text"]
        retrieved_chunks.append({
            "paper_id": paper_id,
            "chunk_index": chunk_index,
            "chunk_text": chunk_text,
            "distance": dist
        })
    return retrieved_chunks

# Example usage
user_query = "What are the latest advancements in Transformer models?"
top_chunks = get_top_k_chunks(user_query, k=3)
for chunk in top_chunks:
    print(f"Paper {chunk['paper_id']} - Chunk {chunk['chunk_index']}:\n{chunk['chunk_text'][:200]}...\n")


Paper 2103.00020 - Chunk 59:
the ViT-L/14 model ﬁne-tuned on 336-by-336 pixel input images. EfﬁcietNet We use the nine models (B0-B8) from the original EfﬁcientNet paper (Tan & Le, 2019), as well as the noisy-student variants (B0...

Paper 2103.00020 - Chunk 8:
of the model. For the text encoder, we only scale the width of the model to be proportional to the calculated increase in width of the ResNet and do not scale the depth at all, as we found CLIP’s perf...

Paper 2103.00020 - Chunk 2:
and pre-training approaches, VirTex (Desai & Johnson, 2020), ICMLM (Bulent Sariyildiz et al., 2020), and Con- VIRT (Zhang et al., 2020) have recently demonstrated the potential of transformer-based la...



In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from ollama import chat
def load_llama_model(model_name="meta-llama/Llama-2-7b-chat-hf"):
    # For large models, you might need device_map="auto" and torch_dtype=torch.float16
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # automatically spread across GPUs
        torch_dtype=torch.float16
    )
    generate_pipeline = pipeline(
        "text-generation", 
        model=model, 
        tokenizer=tokenizer, 
        max_length=2048, 
        temperature=0.7, 
        do_sample=True,  # or False, depending on your preference
        top_k=50,
        repetition_penalty=1.2
    )
    return generate_pipeline

def generate_answer_with_context_llama(query, top_chunks, generate_pipeline):
    # Construct a prompt that includes the retrieved context
    context_text = "\n\n".join(
        f"[Source: paper={c['paper_id']}, chunk={c['chunk_index']}] {c['chunk_text']}"
        for c in top_chunks
    )

    # You can structure your prompt however you like:
    system_prompt = (
        "You are a helpful AI assistant. Use only the text from the sources below to answer:\n"
        f"{context_text}\n\n"
        "Now answer the user question. "
        "If the answer cannot be found, say you are not sure. "
        "Include references to [Source: paper=xxx, chunk=yyy] wherever relevant.\n"
        f"User's question: {query}\n\n"
    )

    # Call the pipeline
    # output = generate_pipeline(system_prompt, max_new_tokens=512)[0]["generated_text"]
    response: ChatResponse = chat(model='llama3.2', max_tokens=512, messages=[
    {
        'role': 'user',
        'content': system_prompt,
    },
    ])
    generated_text = response['message']['content']
    print("Generated Text:", generated_text)
    # The pipeline will return the entire prompt plus generation. 
    # You might want to strip out the prompt or parse the relevant part.
    # generated_answer = output[len(system_prompt):].strip()
    

    return generated_text


In [21]:
dir(chat)

['__call__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__func__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__self__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [15]:
answer = generate_answer_with_context_llama(user_query, top_chunks)
print(answer)

TypeError: generate_answer_with_context_llama() missing 1 required positional argument: 'generate_pipeline'

In [17]:
import os
from ollama import  chat
from ollama import ChatResponse