In [9]:
!pip install faiss-cpu huggingface_hub flask flask-cors pyngrok



In [10]:
%%writefile ingest.py

import argparse
import os
import pickle

import faiss
import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer


def load_data(csv_path: str):
    """
    Read the CSV at ``csv_path`` and return two index-aligned lists.

    - docs: the 'Answer' column as strings, with missing values replaced by ''
      (this is the text that gets chunked)
    - metas: one dict per row with the keys 'doc_id', 'question', 'source'
      and 'url', read from the 'Document_ID', 'Question', 'Document_Source'
      and 'Document_URL' columns
    """
    frame = pd.read_csv(csv_path)

    answers = frame['Answer'].fillna('').astype(str)
    docs = answers.tolist()

    # metadata key -> CSV column it is read from (adjust column names as needed)
    key_to_column = {
        'doc_id': 'Document_ID',
        'question': 'Question',
        'source': 'Document_Source',
        'url': 'Document_URL',
    }

    metas = []
    for _, record in frame.iterrows():
        metas.append({key: record[column] for key, column in key_to_column.items()})

    return docs, metas


def chunk_text(text: str, size: int, overlap: int):
    """
    Simple word-based chunking with overlap.

    The text is split on whitespace and cut into windows of at most ``size``
    words; consecutive windows share ``overlap`` words. Empty or
    whitespace-only text yields an empty list.

    Raises:
        ValueError: if ``size`` is not positive, or ``overlap`` is negative
            or not smaller than ``size`` (the window could not advance).
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    tokens = text.split()
    step = size - overlap  # how far the window advances each time; > 0 after the checks above
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(' '.join(tokens[start:start + size]))
        # Stop once a window reaches the end, so no trailing chunk is made
        # purely of words already covered by the overlap.
        if start + size >= len(tokens):
            break
    return chunks


def embed_chunks(
    chunks: list[str],
    tokenizer: AutoTokenizer,
    model: AutoModel,
    device: torch.device
) -> np.ndarray:
    """
    Tokenize + forward-pass each chunk, mean-pool the last hidden states.
    Returns an (N × D) array, one row per chunk, in input order.

    Raises:
        ValueError: if ``chunks`` is empty (there is nothing to stack).
    """
    if not chunks:
        raise ValueError("embed_chunks received no chunks to embed")

    # truncation=True only has an effect when a maximum length is known. Some
    # tokenizers declare none (the run log then reports "Default to no
    # truncation"), so take the limit from the model config when it has one.
    # If the config has no limit, max_length stays None and the tokenizer
    # behaves exactly as it did without the argument.
    max_length = getattr(getattr(model, 'config', None), 'max_position_embeddings', None)
    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if max_length is not None and tokenizer_limit is not None:
        max_length = min(max_length, tokenizer_limit)

    model.eval()
    all_embeds = []
    with torch.no_grad():
        for text in chunks:
            encoded = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
                padding='longest'
            ).to(device)

            outputs = model(**encoded)
            last_hidden = outputs.last_hidden_state    # (1, seq_len, D)
            mask = encoded['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
            # mean-pool only over non-padded tokens
            summed = (last_hidden * mask).sum(dim=1)      # (1, D)
            counts = mask.sum(dim=1)                      # (1, 1)
            pooled = summed / counts                      # (1, D)
            all_embeds.append(pooled.squeeze(0).cpu().numpy())

    return np.vstack(all_embeds)  # shape (N, D)


def build_and_save_index(
    embeddings: np.ndarray,
    metas: list[dict],
    out_dir: str
):
    """
    Normalize embeddings for inner-product similarity,
    build a FlatIP index, save index + metadata.

    Writes ``faiss.index`` and ``metadata.pkl`` into ``out_dir`` (created if
    missing). Row ``i`` of ``embeddings`` must correspond to ``metas[i]``.
    The caller's array is not modified.

    Raises:
        ValueError: if ``embeddings`` is not 2-D or its row count differs
            from ``len(metas)``.
    """
    # FAISS operates on C-contiguous float32 data and normalize_L2 works in
    # place; take a private copy so the caller's array is left untouched.
    vectors = np.array(embeddings, dtype=np.float32, order='C')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D (N, D) array, got shape {vectors.shape}"
        )
    if vectors.shape[0] != len(metas):
        raise ValueError(
            f"got {vectors.shape[0]} embeddings but {len(metas)} metadata entries"
        )

    # normalize to unit length for IP = cosine
    faiss.normalize_L2(vectors)

    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    os.makedirs(out_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(out_dir, 'faiss.index'))

    with open(os.path.join(out_dir, 'metadata.pkl'), 'wb') as f:
        pickle.dump(metas, f)


def main(args):
    """
    Run the ingestion pipeline end to end.

    Reads ``args.csv_path``, splits every document into word chunks using
    ``args.chunk_size`` / ``args.overlap``, embeds the chunks with the model
    named by ``args.model`` and writes the index and metadata to
    ``args.out_dir``.
    """
    # 1) read the documents and their per-row metadata
    docs, base_metas = load_data(args.csv_path)

    # 2) chunk each document; every chunk gets a copy of its document's
    #    metadata plus its position within that document
    all_chunks, chunk_metas = [], []
    for doc_text, meta in zip(docs, base_metas):
        pieces = chunk_text(doc_text, args.chunk_size, args.overlap)
        all_chunks.extend(pieces)
        chunk_metas.extend(
            {**meta, 'chunk_id': position} for position in range(len(pieces))
        )

    # 3) load the embedding model (GPU when available) and embed all chunks
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModel.from_pretrained(args.model).to(device)
    embeddings = embed_chunks(all_chunks, tokenizer, model, device)

    # 4) build the FAISS index and persist it together with the metadata
    build_and_save_index(embeddings, chunk_metas, args.out_dir)
    print(f"Indexed {len(all_chunks)} chunks. Index + metadata saved to '{args.out_dir}'.")


if __name__ == "__main__":
    # Command-line entry point: parse the options, validate them, run main().
    parser = argparse.ArgumentParser(
        description="RAG ingestion: CSV → chunks → PUBMEDBERT → FAISS"
    )
    parser.add_argument(
        "csv_path",
        help="path to your consolidated CSV (e.g. /content/all_data.csv)"
    )
    parser.add_argument(
        "--out_dir", "-o",
        default="index_data",
        help="where to write faiss.index + metadata.pkl"
    )
    parser.add_argument(
        "--chunk_size", "-c", type=int, default=200,
        help="max words per chunk"
    )
    parser.add_argument(
        "--overlap", "-l", type=int, default=50,
        help="words overlap between consecutive chunks"
    )
    parser.add_argument(
        "--model", "-m",
        default="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        help="HuggingFace model ID for PubMedBERT"
    )
    args = parser.parse_args()

    # chunk_text advances through each document by (chunk_size - overlap)
    # words. Reject combinations where that step would be zero or negative
    # here, with a usage message, instead of failing deep inside the pipeline.
    if args.chunk_size < 1:
        parser.error("--chunk_size must be at least 1")
    if not 0 <= args.overlap < args.chunk_size:
        parser.error("--overlap must be >= 0 and smaller than --chunk_size")

    main(args)


Overwriting ingest.py


In [11]:
!python ingest.py /content/all_data.csv --out_dir /content/outputs

2025-05-16 01:30:49.297831: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747359049.331197    4606 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747359049.342170    4606 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-16 01:30:49.374870: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. De

In [24]:
# %%writefile retrieve_generate.py

import argparse
import os
import pickle

import faiss
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

from huggingface_hub import login

def load_data(csv_path: str):
    """
    Read the CSV at ``csv_path`` and return ``(docs, metas)``.

    ``docs`` is the 'Answer' column as strings with missing values replaced
    by ''. ``metas`` holds one dict per row with the keys 'doc_id', 'source'
    and 'url', read from 'Document_ID', 'Document_Source' and 'Document_URL'.
    """
    frame = pd.read_csv(csv_path)

    answers = frame['Answer'].fillna('').astype(str)
    docs = answers.tolist()

    # metadata key -> CSV column it is read from
    key_to_column = {
        'doc_id': 'Document_ID',
        'source': 'Document_Source',
        'url': 'Document_URL',
    }

    metas = []
    for _, record in frame.iterrows():
        metas.append({key: record[column] for key, column in key_to_column.items()})

    return docs, metas

def chunk_text(text: str, size: int, overlap: int):
    """
    Split ``text`` into whitespace-delimited word chunks with overlap.

    Each chunk holds at most ``size`` words and consecutive chunks share
    ``overlap`` words. Empty or whitespace-only text yields an empty list.
    For valid arguments the output is unchanged, so chunk positions stay
    comparable with an index built using the same ``size`` / ``overlap``.

    Raises:
        ValueError: if ``size`` is not positive, or ``overlap`` is negative
            or not smaller than ``size`` (the window could not advance).
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    tokens = text.split()
    step = size - overlap  # how far the window advances each time; > 0 after the checks above
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(' '.join(tokens[start:start + size]))
        # Stop once a window reaches the end, so no trailing chunk is made
        # purely of words already covered by the overlap.
        if start + size >= len(tokens):
            break
    return chunks

def embed_query(text: str, tokenizer: AutoTokenizer, model: AutoModel, device: torch.device) -> np.ndarray:
    """
    Embed a single query and L2-normalize it (for IndexFlatIP / cosine sim).

    The query is tokenized, passed through ``model``, and the last hidden
    states are mean-pooled over the attention mask. Returns the pooled vector
    as a NumPy array with a leading batch axis of 1, normalized in place by
    ``faiss.normalize_L2``.
    """
    # truncation=True only has an effect when a maximum length is known. Some
    # tokenizers declare none (the notebook output then reports "Default to no
    # truncation"), so take the limit from the model config when it has one.
    # If the config has no limit, max_length stays None and the tokenizer
    # behaves exactly as it did without the argument.
    max_length = getattr(getattr(model, 'config', None), 'max_position_embeddings', None)
    tokenizer_limit = getattr(tokenizer, 'model_max_length', None)
    if max_length is not None and tokenizer_limit is not None:
        max_length = min(max_length, tokenizer_limit)

    model.eval()
    toks = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
        padding='longest',
    ).to(device)
    with torch.no_grad():
        out = model(**toks).last_hidden_state      # (1, seq_len, D)
        mask = toks['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
        summed = (out * mask).sum(dim=1)            # (1, D)
        counts = mask.sum(dim=1)                    # (1, 1)
        pooled = (summed / counts).cpu().numpy()    # (1, D)
    faiss.normalize_L2(pooled)
    return pooled

def get_final_response(
    query,
    csv_path="/content/all_data.csv",
    index_dir="/content/outputs",
    embed_model="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    llm_model="meta-llama/Llama-2-7b-chat-hf",
    chunk_size=200,
    overlap=50,
    top_k=10,
    max_new_tokens=512,
    hf_token=None
):
    """
    Answer ``query`` with retrieval-augmented generation.

    Loads the FAISS index and chunk metadata from ``index_dir``, re-chunks the
    CSV at ``csv_path`` to recover the chunk texts, embeds the query with
    ``embed_model``, retrieves the ``top_k`` nearest chunks and asks Gemini to
    answer from that context.

    Args:
        query: the user's question (non-empty string).
        csv_path: CSV the index was built from.
        index_dir: directory containing ``faiss.index`` and ``metadata.pkl``.
        embed_model: HuggingFace model ID used to embed the query.
        llm_model: not used by this implementation; kept so existing callers
            keep working.
        chunk_size, overlap: chunking parameters; they must match the values
            used when the index was built, because chunk texts are looked up
            by position.
        top_k: number of chunks to retrieve.
        max_new_tokens: not used by this implementation; kept so existing
            callers keep working.
        hf_token: optional HuggingFace token. Falls back to the ``HF_TOKEN``
            environment variable; when neither is set, no login is attempted.

    Returns:
        The generated answer followed by one line per retrieved chunk in the
        form ``"<score> <text>."``.

    Raises:
        ValueError: if ``query`` is empty, or the re-chunked corpus does not
            line up with the index/metadata.
        RuntimeError: if no Gemini API key is present in the environment.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")

    # Credentials are read from the environment so they never live in the
    # notebook or its saved outputs.
    gemini_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not gemini_key:
        raise RuntimeError(
            "No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
            "environment variable."
        )
    hf_token = hf_token or os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    # 1) Load FAISS index + metadata
    idx = faiss.read_index(os.path.join(index_dir, "faiss.index"))
    # NOTE: unpickling can execute arbitrary code; only load a metadata.pkl
    # that was produced by your own ingestion run.
    with open(os.path.join(index_dir, "metadata.pkl"), "rb") as f:
        chunk_metas = pickle.load(f)

    # 2) Reload & re-chunk docs (the index stores vectors only, not the text)
    docs, _ = load_data(csv_path)
    all_chunks = []
    for doc_text in docs:
        all_chunks.extend(chunk_text(doc_text, chunk_size, overlap))

    # Chunk texts are matched to index rows purely by position, so a different
    # CSV or different chunking parameters would silently return wrong text.
    if not (idx.ntotal == len(chunk_metas) == len(all_chunks)):
        raise ValueError(
            f"Index/corpus mismatch: index has {idx.ntotal} vectors, metadata has "
            f"{len(chunk_metas)} entries, re-chunking produced {len(all_chunks)} chunks. "
            "Use the same CSV, chunk_size and overlap as at ingestion time."
        )

    # 3) Prepare embed model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embed_tokenizer = AutoTokenizer.from_pretrained(embed_model)
    embedder = AutoModel.from_pretrained(embed_model, device_map="auto")

    # 4) Embed query & search
    q_vec = embed_query(query, embed_tokenizer, embedder, device)
    distances, indices = idx.search(q_vec, top_k)

    # 5) Collect retrieved chunks
    retrieved = []
    for dist, row in zip(distances[0], indices[0]):
        row = int(row)
        if row < 0:
            # FAISS pads with -1 when fewer than top_k results exist; indexing
            # a Python list with -1 would return the last chunk instead.
            continue
        retrieved.append({
            "score": float(dist),
            "meta":  chunk_metas[row],
            "text":  all_chunks[row],
        })

    # 6) Build prompt
    context = "\n\n".join(
        f"[{i+1}] Source: {r['meta']['source']} | Text: {r['text']}"
        for i, r in enumerate(retrieved)
    )
    system_prompt = (
        "You are a knowledgeable medical assistant. "
        "Use the following retrieved context to answer the user's question in detailed."
    )
    user_prompt = (
        f"{system_prompt}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )

    # 7) Generate the answer with Gemini (imported here because only this
    #    function needs the SDK)
    from google import genai
    from google.genai import types

    client = genai.Client(api_key=gemini_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-04-17",
        config=types.GenerateContentConfig(
            max_output_tokens=8000,
        ),
        contents=user_prompt,
    )

    # response.text can be None when no text was produced; fall back to an
    # empty string so the retrieved context is still returned.
    answer = response.text or ""

    # 8) Answer first, then each retrieved chunk with its similarity score
    return answer + '\n' + '\n'.join([f"{r['score']:.4f} {r['text'][:2001]}.\n" for r in retrieved])

In [25]:
# Run the pipeline once on a sample question and print the returned string
# (generated answer followed by the scored retrieved chunks).
print(get_final_response("What causes Laron syndrome ?"))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Based on the provided context, there is no information about what causes Laron syndrome. The context discusses the causes or inheritance of other conditions like Jones syndrome, Bell's palsy, blue rubber bleb nevus syndrome, Behr syndrome, and Wildervanck syndrome, but Laron syndrome is not mentioned.
0.9657 What causes Jones syndrome? The exact, underlying genetic cause of Jones syndrome is not yet known..

0.9640 Is Williams syndrome inherited?.

0.9625 What causes Bell's palsy?.

0.9616 What causes blue rubber bleb nevus syndrome? Currently the cause of blue rubber bleb syndrome is not known..

0.9611 How is oculopharyngeal muscular dystrophy inherited?.

0.9600 Is genetic testing available for occipital horn syndrome?.

0.9590 What causes Behr syndrome? The exact cause of Behr syndrome is not known; however, a genetic cause is suspected based on the families identified, thus far..

0.9585 Researchers are not sure how common Shwachman-Diamond syndrome is. Several hundred cases have 

In [14]:
from pyngrok import ngrok
# Stop any running ngrok process before a new tunnel is opened — presumably
# to clear tunnels left over from an earlier run of the server cell; TODO confirm.
ngrok.kill()

In [15]:
import os
import threading
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok, conf


# =================== SET YOUR TOKENS ===================
# Secrets are read from the environment (e.g. set via os.environ or Colab
# secrets before running this cell); never paste them into the notebook.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN")
if NGROK_AUTHTOKEN:
    conf.get_default().auth_token = NGROK_AUTHTOKEN
HF_TOKEN = os.environ.get("HF_TOKEN")

# =================== SETUP FLASK ===================
app = Flask(__name__)
# NOTE(review): a wildcard origin combined with supports_credentials=True is
# very permissive — confirm the frontend actually needs credentialed requests.
CORS(app, resources={r"/api/*": {"origins": "*"}}, supports_credentials=True)

@app.route("/api/ask", methods=["GET", "POST"])
def ask():
    """
    Answer a question posted as JSON ``{"query": "<text>"}``.

    GET returns a plain-text liveness message. POST returns
    ``{"answer": ...}``; a missing or empty query yields HTTP 400, and any
    error raised while answering yields HTTP 500 with the error text.
    """
    if request.method == "GET":
        return "✅ Flask is running and /api/ask is reachable. Use POST to query."

    # silent=True returns None instead of raising on a missing or invalid JSON
    # body, so malformed requests get the explicit 400 below.
    payload = request.get_json(silent=True)
    query = payload.get("query") if isinstance(payload, dict) else None
    if not isinstance(query, str) or not query.strip():
        return jsonify(
            {"answer": "Error: request body must be JSON with a non-empty 'query' string."}
        ), 400
    print("📥 Received query:", query)

    try:
        # Only forward the token when one is configured, so the pipeline's own
        # default applies otherwise.
        token_kwargs = {"hf_token": HF_TOKEN} if HF_TOKEN else {}
        answer = get_final_response(query, **token_kwargs)
        return jsonify({"answer": answer})
    except Exception as e:
        print("❌ Error:", str(e))
        return jsonify({"answer": f"Error: {str(e)}"}), 500

# =================== START SERVER ===================
public_url = ngrok.connect(5000)
print("\n🌐 Public URL (paste into frontend):", public_url)

# Run Flask in a thread so it doesn't block
threading.Thread(target=app.run, kwargs={"port": 5000}).start()







🌐 Public URL (paste into frontend): NgrokTunnel: "https://3f06-34-125-67-162.ngrok-free.app" -> "http://localhost:5000"


In [26]:
import requests
# Send a sample question straight to the Flask endpoint on localhost and
# print the decoded JSON reply.
r = requests.post("http://127.0.0.1:5000/api/ask", json={"query": "What causes L-arginine:glycine amidinotransferase deficiency?"})
print(r.json())

📥 Received query: What causes L-arginine:glycine amidinotransferase deficiency?


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO:werkzeug:127.0.0.1 - - [16/May/2025 02:35:05] "POST /api/ask HTTP/1.1" 200 -


{'answer': "Based on the provided context:\n\nL-arginine:glycine amidinotransferase deficiency is caused by mutations in the **GATM gene**.\n\nHere's a detailed explanation:\n1.  The **GATM gene** provides instructions for making the enzyme called **arginine:glycine amidinotransferase**.\n2.  This enzyme plays a crucial role in the first step of the two-step process that produces **creatine** from the amino acids glycine, arginine, and methionine.\n3.  Specifically, the enzyme transfers a guanidino group from arginine to glycine, producing guanidinoacetic acid (which is then converted to creatine).\n4.  Creatine is essential for the body to store and use energy properly.\n5.  **Mutations in the GATM gene** lead to a deficiency of the arginine:glycine amidinotransferase enzyme or impair its ability to function correctly in creatine synthesis.\n6.  This impairment results in a **shortage of creatine** in the body.\n7.  The effects of this creatine shortage are most severe in organs and t