## Load and Inspect Dataset

In [3]:
import json
import pandas as pd

# Locations of the source JSON and of the CSV copy derived from it
json_path = '/home/kosal/AI/Dynamic_K_RAG/Dataset/Computer_Science_Theory_QA.json'
csv_path  = '/home/kosal/AI/Dynamic_K_RAG/Dataset/Computer_Science_Theory_QA.csv'

# Parse the raw JSON document
with open(json_path, mode='r', encoding='utf8') as f:
    data = json.load(f)

# Build a frame from the top-level "intents" entry and write it to CSV,
# replacing any existing file at that path
pd.DataFrame(data['intents']).to_csv(csv_path, index=False, encoding='utf8')

# `df` is the frame as parsed back from the CSV, not the in-memory one;
# the next cell converts its 'patterns' / 'responses' columns back to lists.
df = pd.read_csv(csv_path, encoding='utf8')


## Basic Cleaning and Convert to JSONL

In [4]:
import ast

# Safely convert a stringified list back into a Python list
def safe_eval_list(val):
    """Return ``val`` as a list, parsing it from its string form if needed.

    Args:
        val: Either an actual list (returned unchanged) or a value expected
            to hold the textual form of a Python list, e.g. ``"['a', 'b']"``.

    Returns:
        list: The parsed list. An empty list is returned when ``val`` cannot
        be parsed as a Python literal (this includes non-string values such
        as NaN) or when it parses to something that is not a list or tuple.
    """
    if isinstance(val, list):
        return val
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed or
        # unsupported input; anything else (e.g. KeyboardInterrupt) propagates.
        return []
    # A bare string or number literal is not a list: returning it would make
    # callers that iterate the result walk over characters or fail outright.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Turn the two list-valued columns back into real Python lists
for list_col in ('patterns', 'responses'):
    df[list_col] = df[list_col].apply(safe_eval_list)

# Write one JSONL record per question pattern
output_jsonl = "/home/kosal/AI/Dynamic_K_RAG/Dataset/cs_qa.jsonl"
qa_pairs = 0

with open(output_jsonl, "w", encoding="utf8") as f:
    for _, row in df.iterrows():
        responses = row['responses']
        for pattern in row['patterns']:
            # Only the first response is used; a row without any gets "".
            if responses:
                answer = responses[0]
            else:
                answer = ""
            record = {
                "question": pattern.strip(),
                "answer": answer.strip(),
                "tag": row['tag']
            }
            line = json.dumps(record, ensure_ascii=False)
            f.write(line + "\n")
            qa_pairs += 1

print(f"Total QA pairs written: {qa_pairs}")


Total QA pairs written: 347


## Split Into Train / Val / Test Sets

In [5]:
from sklearn.model_selection import train_test_split

import os

# Load all JSONL lines (blank lines, if any, are skipped)
with open(output_jsonl, encoding="utf8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# 80 / 10 / 10 split with a fixed seed.
# NOTE(review): the split is over individual QA pairs, and the cell above
# writes the same answer for every pattern of a row, so paraphrases of one
# answer can land in different splits — confirm this is intended.
train, temp = train_test_split(data, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)
assert len(train) + len(val) + len(test) == len(data)

# Write the splits next to the combined JSONL. Later cells read
# cs_qa_train.jsonl from that Dataset directory, so a path relative to the
# kernel's working directory could leave them reading a stale or missing file.
split_dir = os.path.dirname(output_jsonl)
for name, split in zip(["train", "val", "test"], [train, val, test]):
    out_file = os.path.join(split_dir, f"cs_qa_{name}.jsonl")
    with open(out_file, "w", encoding="utf8") as f:
        for rec in split:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"{name}: {len(split)} samples saved to {out_file}")


train: 277 samples saved to cs_qa_train.jsonl
val: 35 samples saved to cs_qa_val.jsonl
test: 35 samples saved to cs_qa_test.jsonl


# Facts Corpus & Vector Index Construction

### Build the Facts Corpus

In [6]:
import json

# Collect the distinct answers of the training split as the facts corpus.
# A dict is used instead of a set so that facts keep first-seen order: set
# iteration order for strings can differ between interpreter runs, which
# would make the fact_id assignments below non-reproducible.
# NOTE(review): only train-split answers enter the corpus — confirm that
# val/test answers are meant to be excluded from retrieval.
unique_facts = {}
with open("/home/kosal/AI/Dynamic_K_RAG/Dataset/cs_qa_train.jsonl", encoding="utf8") as f:
    for line in f:
        qa = json.loads(line)
        unique_facts.setdefault(qa["answer"].strip(), None)

facts = list(unique_facts)
print(f"Total unique facts: {len(facts)}")

# fact_id is the position of the fact in `facts`
facts_path = "/home/kosal/AI/Dynamic_K_RAG/Dataset/facts.jsonl"
with open(facts_path, "w", encoding="utf8") as f:
    for idx, fact in enumerate(facts):
        f.write(json.dumps({"fact_id": idx, "fact": fact}, ensure_ascii=False) + "\n")
print(f"Saved facts to {facts_path}")


Total unique facts: 163
Saved facts to /home/kosal/AI/Dynamic_K_RAG/Dataset/facts.jsonl


### Vector Embedding

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Re-read the facts file so that `facts` and `fact_ids` follow file order
with open(facts_path, encoding="utf8") as f:
    fact_records = [json.loads(line) for line in f]

facts = [rec["fact"] for rec in fact_records]
fact_ids = [rec["fact_id"] for rec in fact_records]

# Sentence-embedding model used for the facts here and for queries later
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every fact in batches of 64, one embedding row per fact
embeddings = model.encode(
    facts,
    batch_size=64,
    show_progress_bar=True,
)
print(f"Embeddings shape: {embeddings.shape}")


Batches: 100%|██████████| 3/3 [00:00<00:00, 15.15it/s]

Embeddings shape: (163, 384)





### Build the FAISS Vector Index

In [8]:
import faiss

# Embedding width taken from the matrix itself
dim = embeddings.shape[1]

# HNSW index over the raw vectors; efConstruction is set before adding
index = faiss.IndexHNSWFlat(dim, 32)
index.hnsw.efConstruction = 200

# The embeddings are cast to float32 before being added
fact_vectors = np.array(embeddings).astype("float32")
index.add(fact_vectors)
print(f"Indexed {index.ntotal} facts.")

# Persist the index so later cells can reload it from disk
index_file = "/home/kosal/AI/Dynamic_K_RAG/Dataset/facts_hnsw.index"
faiss.write_index(index, index_file)
print("FAISS index saved.")


Indexed 163 facts.
FAISS index saved.


In [9]:
import pickle

# Map each position in `facts` (the order the vectors were added to the
# index) to its fact text.
fact_id_map = dict(enumerate(facts))

with open("/home/kosal/AI/Dynamic_K_RAG/Dataset/fact_id_map.pkl", "wb") as f:
    pickle.dump(fact_id_map, f)
print("Fact ID map saved.")

Fact ID map saved.


### Test Retrieval End-to-End

In [10]:
import faiss

import pickle

# Number of neighbours to retrieve for the smoke test
k = 1

# Reload both persisted artefacts: the index and the id -> fact text map
index = faiss.read_index("/home/kosal/AI/Dynamic_K_RAG/Dataset/facts_hnsw.index")
with open("/home/kosal/AI/Dynamic_K_RAG/Dataset/fact_id_map.pkl", "rb") as f:
    fact_id_map = pickle.load(f)

test_query = "Explain data abstraction in computer science."

# Encode the single query and cast to float32 for the search
q_vec = model.encode([test_query])
query_matrix = np.array(q_vec).astype("float32")

# D holds distances and I holds index positions, one row per query
D, I = index.search(query_matrix, k)

print(f"Top-{k} retrieved facts for query:")
for idx in I[0]:
    fact_text = fact_id_map[idx]
    print("-", fact_text)


Top-1 retrieved facts for query:
- Data abstraction is a technique used in computer programming to separate the implementation details of a data type from its interface, allowing the implementation to be changed without affecting the code that uses it. This is often achieved through the use of abstract data types (ADTs), which are defined by the operations they support rather than their specific implementation, or through the use of interfaces and classes in object-oriented programming languages. Data abstraction helps to reduce the complexity of software systems by allowing code to be written in a modular and flexible way and by hiding the underlying details of data types from the user.


# Label Creation for Dynamic Top-K


## Load Resources

In [11]:
import json
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer

# Artefacts produced by the earlier sections of this notebook
qa_path = "/home/kosal/AI/Dynamic_K_RAG/Dataset/cs_qa_train.jsonl"
index_path = "/home/kosal/AI/Dynamic_K_RAG/Dataset/facts_hnsw.index"
fact_map_path = "/home/kosal/AI/Dynamic_K_RAG/Dataset/fact_id_map.pkl"

# Training QA pairs, one JSON object per line
qa_data = []
with open(qa_path, encoding="utf8") as f:
    for raw_line in f:
        qa_data.append(json.loads(raw_line))

# Same embedding model name that was used to encode the facts
model = SentenceTransformer("all-MiniLM-L6-v2")

# FAISS index and the position -> fact text lookup
index = faiss.read_index(index_path)
with open(fact_map_path, "rb") as f:
    fact_id_map = pickle.load(f)

## Exact Match

In [12]:
def exact_match(pred, gold):
    """Return True when ``pred`` and ``gold`` are equal after normalisation.

    Normalisation drops every character found in ``string.punctuation``,
    lowercases the remaining characters one by one, and strips leading and
    trailing whitespace. Interior whitespace is left untouched.
    """
    import string

    def normalize(s):
        kept = []
        for ch in s:
            if ch in string.punctuation:
                continue
            kept.append(ch.lower())
        return ''.join(kept).strip()

    left = normalize(pred)
    right = normalize(gold)
    return left == right

# Dummy Answer 
def generate_answer(context, question):
    """Placeholder answer generator: returns ``context`` unchanged.

    ``question`` is accepted but not used yet.
    """
    # TODO: Replace with your real LLM generation code!
    # Until then the retrieved context itself stands in for the answer.
    answer = context
    return answer


# Main loop: for every training question, find the smallest K (1..max_K)
# whose retrieved context produces an answer matching the gold answer.
max_K = 10
labels = []

for ex in qa_data:
    question = ex["question"]
    gold_answer = ex["answer"]

    # Embed the question
    q_vec = model.encode([question])

    # Search once at max_K and take prefixes below: the query does not change
    # with k, so repeating the search for every k is redundant work.
    D, I = index.search(np.array(q_vec).astype("float32"), max_K)
    # FAISS pads the result with -1 when it finds fewer than max_K
    # neighbours; those entries have no fact and are skipped.
    ranked_facts = [fact_id_map[idx] for idx in I[0] if idx != -1]

    # found_k stays at max_K when no prefix matches, so a label of max_K
    # means either "matched at max_K" or "never matched".
    found_k = max_K
    for k in range(1, len(ranked_facts) + 1):
        context = "\n".join(ranked_facts[:k])
        candidate = generate_answer(context, question)
        if exact_match(candidate, gold_answer):
            found_k = k
            break

    labels.append({
        "question": question,
        "question_embedding": q_vec[0].tolist(),
        "gold_answer": gold_answer,
        "best_k": found_k
    })

print(f"Processed {len(labels)} QA examples.")

# One JSON object per labelled question
labels_path = "/home/kosal/AI/Dynamic_K_RAG/Dataset/dynamic_k_labels.jsonl"
with open(labels_path, "w", encoding="utf8") as f:
    for record in labels:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
print(f"Saved label data to {labels_path}")


Processed 277 QA examples.
Saved label data to /home/kosal/AI/Dynamic_K_RAG/Dataset/dynamic_k_labels.jsonl
