# **Setting Up the environment keys**

In [1]:
from google.colab import userdata
import os

HF_TOKEN = userdata.get("HUGGINGFACEHUB_API_TOKEN")
WEAVIATE_URL = userdata.get("WEAVIATE_URL")
WEAVIATE_API_KEY = userdata.get("WEAVIATE_API_KEY")
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")

# make HF token visible to libraries that expect env var
if HF_TOKEN:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN


# **Installing and importing required libraries**

In [None]:
!pip -q install "weaviate-client<4" \
                "langchain>=0.2" "langchain-community>=0.2" "langchain-text-splitters>=0.2" \
                sentence-transformers \
                unstructured "unstructured[pdf]" pypdf \
                langchain-google-genai \
                langchain-huggingface


In [3]:
# LangChain bits
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Weaviate as WeaviateStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# Data loading (you already shared)
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader  # if you also use .txt later
from langchain.document_loaders import PyPDFDirectoryLoader

# LLM (Gemini) – optional if you want HF LLM instead
from langchain_google_genai import ChatGoogleGenerativeAI

# Weaviate client v3
import weaviate


# **Initializing Weaviate client**

In [20]:
INDEX_NAME = "DocChunk"    # Weaviate "class" name
TEXT_KEY   = "text"        # property that will hold the raw chunk text

client = weaviate.Client(
    url="https://"+WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY),
)



# **Creating a class in our cluster**

In [21]:
# Create class if it doesn't exist
existing = [c["class"] for c in client.schema.get().get("classes", [])]
if INDEX_NAME not in existing:
    class_obj = {
        "class": INDEX_NAME,
        "description": "RAG document chunks",
        "vectorizer": "none",               # because we bring our own vectors
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": {"distance": "cosine"},
        "properties": [
            {"name": TEXT_KEY,   "dataType": ["text"]},
            {"name": "source",   "dataType": ["text"]},
            {"name": "chunk_id", "dataType": ["int"]},
        ],
    }
    client.schema.create_class(class_obj)
    print(f"Created Weaviate class: {INDEX_NAME}")
else:
    print(f"Weaviate class already exists: {INDEX_NAME}")

Created Weaviate class: DocChunk


# **If you want to delete the class ( If any thing goes wrong )**

In [19]:
client.schema.delete_class(INDEX_NAME)

# **Using hugging face model for embeddings**
because weaviate has limit of 1536 dimension and gemini has 3072 dimensions

In [None]:
EMBED_MODEL = "Alibaba-NLP/gte-large-en-v1.5"

embedding = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs={"trust_remote_code": True},  # needed for some sentence-transformers wrappers
    encode_kwargs={"batch_size": 32}  # process 32 chunks at a time
)

# sanity check: print dimension (must be <= 1536 for Weaviate)
dim = len(embedding.embed_query("dimension test"))
print("Embedding dimension:", dim)


# **Loading and preparing our data**

In [7]:
!mkdir pdfs

In [8]:
!gdown 1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE -O pdfs/yolov7paper.pdf
!gdown 1vILwiv6nS2wI3chxNabMgry3qnV67TxM -O pdfs/rachelgreecv.pdf

Downloading...
From: https://drive.google.com/uc?id=1hPQlXrX8FbaYaLypxTmeVOFNitbBMlEE
To: /content/pdfs/yolov7paper.pdf
100% 2.27M/2.27M [00:00<00:00, 18.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vILwiv6nS2wI3chxNabMgry3qnV67TxM
To: /content/pdfs/rachelgreecv.pdf
100% 271k/271k [00:00<00:00, 5.57MB/s]


In [9]:
loader = PyPDFDirectoryLoader("pdfs")
data = loader.load()

In [None]:
data

In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_chunks = text_splitter.split_documents(data)

In [None]:
text_chunks

In [16]:
len(text_chunks)

97

# **Weaviate requires that all property names in the schema (and the metadata you upsert) follow GraphQL naming rules:**

1. Must start with a letter (A–Z or a–z) or underscore _.

2. Followed by letters, numbers, or underscores.

3. Length ≤ 231 chars.

4. No dots (.), dashes (-), or special characters.

In [22]:
for doc in text_chunks:
    clean_metadata = {}
    for k, v in doc.metadata.items():
        # Replace '.' with '_' and enforce GraphQL-safe naming
        safe_key = k.replace(".", "_")
        clean_metadata[safe_key] = v
    doc.metadata = clean_metadata


# **Now storing our cleaned data in the form of embeddings in weaviate**

In [23]:
# Vectorstore wrapper around Weaviate
vectorstore = WeaviateStore.from_documents(
    documents=text_chunks,
    embedding=embedding,
    client=client,
    index_name=INDEX_NAME,
    text_key=TEXT_KEY,
)

print("✅ Chunks embedded and stored in Weaviate.")


✅ Chunks embedded and stored in Weaviate.


# **Building RAG**

In [31]:
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3, "fetch_k": 24})
'''
k => get top-k most similar chunks
Problem: they might all be very similar to each other → you lose diversity.


search_type = "mmmr" => MMR tries to balance:
Relevance (is the chunk close to the query?)
Diversity (are the chunks different from each other?)

This avoids redundancy (e.g., getting 3 nearly identical chunks from one page).


fetch_k => how many initial candidates to pull from the vector store before applying the MMR selection.

'''

'\nk => get top-k most similar chunks\nProblem: they might all be very similar to each other → you lose diversity.\n\n\nsearch_type = "mmmr" => MMR tries to balance:\nRelevance (is the chunk close to the query?)\nDiversity (are the chunks different from each other?)\n\nThis avoids redundancy (e.g., getting 3 nearly identical chunks from one page).\n\n\nfetch_k => how many initial candidates to pull from the vector store before applying the MMR selection.\n\n'

In [32]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",           # or "gemini-1.5-pro" if you want stronger reasoning
    google_api_key=GEMINI_API_KEY
)

In [33]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [34]:
query = "Give me a concise summary of the key ideas in these PDFs."
query_2 = "Tell me about Riyan" # for testing
resp = qa.invoke({"query": query})

print("Answer:\n", resp["result"])
print("\nSources:")
for d in resp["source_documents"]:
    print("-", d.metadata.get("source"), "…")

Answer:
 The provided text discusses several papers focusing on improving Convolutional Neural Networks (CNNs).  Key themes include:

* **Efficient scaling of CNNs:** One paper explores a compound scaling strategy that efficiently improves accuracy by scaling both width and depth of the network, outperforming strategies that scale only width or depth.

* **Reparamaterization of optimizers and architectures:** Another paper investigates re-parameterizing optimizers instead of modifying network architectures for improved performance.

* **Novel architectural blocks:**  Several papers introduce new building blocks for CNNs, such as asymmetric convolution blocks and diverse branch blocks, aimed at enhancing performance.  One paper specifically focuses on replacing conventional convolution blocks with RepConv blocks within a specific architecture (ELAN).

The papers present ablation studies and experimental results demonstrating the effectiveness of their proposed methods.  Specific perform