In [2]:
import os
import json

chunks = []

folder = r"/Users/mohamad/Documents/GitHub/Personalized-RAG-Chatbot/chunks.json"

with open(folder, "r", encoding="utf-8") as f:
    data = json.load(f)
    if isinstance(data, list):
        chunks.extend(data)
    elif isinstance(data, dict):
        chunks.append(data)
    else:
        print(f"Skipping {folder} as it is not a list or dictionary.")

print(f"โ Loaded {len(chunks)} chunks from {folder}")

texts = [c["content"] for c in chunks]
metadata = [
    {
        "id": c["id"],
        "title": c["title"],
        "source": c["source"],
        "text": c["content"]
    }
    for c in chunks
]

โ Loaded 100 chunks from /Users/mohamad/Documents/GitHub/Personalized-RAG-Chatbot/chunks.json


In [2]:
import openai
import numpy as np
from tqdm import tqdm
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

texts = [c["content"] for c in chunks]
embeddings = []

for text in tqdm(texts, desc="Embedding texts"):
    response = openai.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    emb = np.array(response.data[0].embedding, dtype="float32")
    emb /= np.linalg.norm(emb)
    embeddings.append(emb)

embeddings = np.vstack(embeddings)

Embedding texts: 100%|โโโโโโโโโโ| 100/100 [01:14<00:00,  1.35it/s]


In [3]:
import faiss
import os

dim = embeddings.shape[1]

index = faiss.IndexFlatIP(dim)
index.add(embeddings)

os.makedirs("storage", exist_ok=True)
faiss.write_index(index, "storage/openai_index.faiss")

print(f"โ FAISS index created and saved with {index.ntotal} vectors.")


โ FAISS index created and saved with 100 vectors.


In [3]:
def get_embedding(text: str, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=text,
        model=model
    )
    embedding = response.data[0].embedding
    return np.array(embedding, dtype='float32')

In [5]:
import json
os.makedirs("storage", exist_ok=True)

metadata = [
    {
        "id": i,
        "title": c.get("title", ""),
        "source": c.get("source", ""),
        "content": c["content"]
    }
    for i, c in enumerate(chunks)
]


with open("storage/chunks_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

In [18]:
import json
import faiss
import openai
import numpy as np

index = faiss.read_index("storage/openai_index.faiss")

with open("storage/chunks_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

def search_index(query, k=5, min_score=0.4):

    query_vector = get_embedding(query).reshape(1, -1)
    query_vector /= np.linalg.norm(query_vector)

    distances, indices = index.search(query_vector, k)

    # Build results, but ensure at least one item is returned
    results = []
    top_pairs = list(zip(distances[0], indices[0]))

    for dist, idx in top_pairs:
        if idx < 0:
            continue
        if dist < min_score:
            continue  # skip low scores
        chunk_data = metadata[idx]
        results.append({
            "score": float(dist),
            "chunk": chunk_data["content"],
            "metadata": {
                "id": chunk_data["id"],
                "title": chunk_data.get("title", ""),
                "source": chunk_data.get("source", "")
            }
        })

    # Fallback: if empty, include the best match regardless of score
    if results and top_pairs:
        dist, idx = top_pairs[0]
        if idx >= 0:
            chunk_data = metadata[idx]
            results.append({
                "score": float(dist),
                "chunk": chunk_data["content"],
                "metadata": {
                    "id": chunk_data["id"],
                    "title": chunk_data.get("title", ""),
                    "source": chunk_data.get("source", "")
                }
            })

    return results

query = "ูุง ูู ุงูุนูุงูุงุช ุงูุชู ุชุฏู ุนูู ุงุณุชุฌุงุจุฉ ุฏุนุงุก ุงูุฅูุณุงูุ"
query = "ููู ูููู ููุฅูุณุงู ุฃู ูุนุฑู ุฃู ุฏุนุงุกู ูุฏ ุงุณุชุฌูุจุ"
query = "ูุง ูู ุฑุฃูู ูููุง ุญุฏุซ ุจุงูุฃูุณุ"

results = search_index(query)

print("Top relevant chunks:")
for i, res in enumerate(results, 1):
    print(f"\nResult {i} (score: {res['score']:.4f}):")
    print(res["chunk"])
    print(res["metadata"])


Top relevant chunks:


In [6]:
def reformulate_query(query):
    model="gpt-4o"
    system_prompt = (
        "ุฃูุช ูุณุงุนุฏ ูุชุฎุตุต ูู ุฅุนุงุฏุฉ ุตูุงุบุฉ ุงูุฃุณุฆูุฉ ุจุทุฑููุฉ ููููุฉ ุถูู ูุธุงู ุงุณุชุฑุฌุงุน ุงููุนูููุงุช (RAG)."
        "ุฅุฐุง ูุงู ุงูุณุคุงู ููุชูุจูุง ุจุงูููุฌุฉ ุงููุจูุงููุฉ ุจุฃุญุฑู ุฅูุฌููุฒูุฉุ ุชุฑุฌู ุงูุณุคุงู ุฅูู ุงูุนุฑุจูุฉ ุงููุตุญู ุจุฃูุซุฑ ุทุฑููุฉ ุงุญุชุฑุงููุฉ ููููุฉ ูุน ุงูุญูุงุธ ุนูู ุงููููุฉ ูู ุงูุชุนุจูุฑ."
        "ุงุจุฏุฃ ุฏุงุฎูููุง ุจุฎุทุฉ ูุฎุชุตุฑุฉ ูู ูฃ ุฅูู ูฅ ุฎุทูุงุช ููุงููููุฉ ููุนุงูุฌุฉ ูู ูุฑุญูุฉ ูู ูุฑุงุญู ุงูุณุคุงูุ ููุง ุชุถูู ูุฐู ุงูุฎุทุฉ ูู ุงููุชูุฌุฉ ุงูููุงุฆูุฉ. "
        "ุฃุนุฏ ูุชุงุจุฉ ุงูุณุคุงู ุจููุณ ุงูุตูุบุฉ ุงููุณุชุฎุฏูุฉ ูู ูุจู ุงููุชููู (ูุง ุชุบูุฑ ุงูุถูุงุฆุฑ ุฃู ูุฌูุฉ ุงููุธุฑ)ุ ููุง ุชุถู ุฃู ุชุญุฐู ุฃู ูุนูู ุฌุฏูุฏ."
        "ุฅุฐุง ูุงู ุงูุณุคุงู ูุงุถุญูุง ููุจุงุดุฑูุงุ ุฃุนูุฏ ุนุฑุถู ููุง ูู ูุน ุชุญุณูู ุทููู ููุฃุณููุจ ููุท."
        "ุงููุฏู ูู ุฌุนู ุงูุณุคุงู ุฃูุถุญ ูุฃูุซุฑ ุฑุณููุฉ ุฏูู ุชุบููุฑ ูุนูุงู ุฃู ุตูุบุฉ ุงููุชููู. "
        "ุจุนุฏ ุชุนุฏูู ูู ุณุคุงูุ ุชุญูู ุฏุงุฎูููุง ูู ุฌููุฉ ุฃู ุฌููุชูู ุฃู ุงูุชุนุฏูู ุญูู ุงููุถูุญ ูุงูุงุญุชุฑุงููุฉ ุฏูู ุชุบููุฑ ุงูุฌููุฑ. "
        "ุงูุชุจ ููุท ุงูุตูุบุฉ ุงูููุงุฆูุฉ ููุณุคุงู ุฏูู ุดุฑุญ ุฃู ุฎุทูุงุช."
        "ุงูุฅุฎุฑุงุฌ ุฏุงุฆููุง ุนุจุงุฑุฉ ุนู ุงูุณุคุงู ุงูููุงุฆู ุงููุนุงุฏ ุตูุงุบุชู ููุท (ุฌููุฉ ูุงุญุฏุฉ ุฃู ุฃูุซุฑ ุจุงููุบุฉ ุงูุนุฑุจูุฉ ุงููุตุญู). ูุง ุชุดุฑุญ ุฃู ุชุฏุฑุฌ ุฃู ุชูุงุตูู ุนู ุงูุนูููุฉ ุฃู ุงูููุงุฆู ุงููููุฐุฉ ุฏุงุฎูููุง โ ุงููุงุชุฌ ุงูููุงุฆู ูู ุงูุณุคุงู ููุท."
        )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=1,
    )
    return response.choices[0].message.content.strip()

In [22]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_answer_with_history(user_id, query, retrieved_chunks, formatted_history: str):
    model="gpt-4o-mini"
    context = "\n\n".join([c["chunk"] for c in retrieved_chunks])
    query = reformulate_query(query)
    messages = [
        {
            "role": "system",
            "content": (
                "ุฃูุช ูุณุงุนุฏ ูุฌูุจ ุนูู ุฃูู ุงูุณูุฏ ูุงุดู ุตูู ุงูุฏูู."
                "ุงุณุชุนู ุฏุงุฎูููุง ุจูุงุฆูุฉ ูุฎุชุตุฑุฉ (3-7 ุนูุงุตุฑ) ููุฎุทูุงุช ุงูููุงููููุฉ ูุจู ุชูุฏูู ุฃู ุฅุฌุงุจุฉุ ููู ูุง ุชุทุจุน ุฃู ุชุฏุฑุฌ ูุฐู ุงููุงุฆูุฉ ูู ุงูุฑุฏ."
                "ุนูุฏ ุงูุชุนุงูู ูุน ุฃุณุฆูุฉ ุชุญูุฉ ุฃู ุฃุณุฆูุฉ ุนุงูุฉ ุจุณูุทุฉ ูุซู ุงูุณูุงู ุนูููู ุฃู ููู ุงูุญุงูุ ูููู ุงูุฅุฌุงุจุฉ ุนูููุง ูุจุงุดุฑุฉ ุฏูู ุงูุญุงุฌุฉ ููุงุนุชูุงุฏ ุนูู ูุตูุต ุงูุณูุงู."
                "ุงุนุชูุฏ ูู ุฅุฌุงุจุชู ููุท ุนูู ุงููุตูุต ุงููุชููุฑุฉ ูู ุงูุณูุงู ููุฃุณุฆูุฉ ุงูุฃุฎุฑู. ุฅุฐุง ูู ููู ุงูุฌูุงุจ ูุงุถุญูุง ููุงููุงู ูู ุงูุณูุงูุ ูู ุฅูู ูุง ููุฌุฏ ุฅุฌุงุจุฉ."
                ".ูุง ูุฌุจ ุฃู ุชุฌูุจ ุนูู ุฃู ุณุคุงู ุฅุฐุง ูู ูุชู ุงุณุชุฑุฌุงุน ุฃูุฉ ููุงุทุน ูุตูุฉุ ููุง ุชุฐูุฑ ุฃู ููุถูุน ูู ุฅุฌุงุจุงุชู ูุง ูู ููู ููุฌูุฏูุง ุฃูุถูุง ูู ุงูููุงุทุน ุงููุณุชุฑุฌุนุฉ."
                "ูุง ุชุดุฑ ุฅูู ุงูุณูุงู ูู ุฅุฌุงุจุชู ุฅุฐ ุฃู ุงููุงุฑุฆ ูุง ูุณุชุทูุน ูุฑุงุกุชู."
                "ุชุญุฏุซ ุจุงุญุชุฑุงู ุนู ุงูุดุฎุตูุงุช ุงูุดูุนูุฉุ ูุน ุฐูุฑ ุงูุฃููุงุจ ุงูููุงุณุจุฉ."
                "ุฃุฌุจ ุฏุงุฆููุง ุจุงููุบุฉ ุงูุนุฑุจูุฉ."
                "ูุง ุชุดุฑุญ ุฃู ุชุฏุฑุฌ ุฃู ุชูุงุตูู ุนู ุงูุนูููุฉ ุฃู ุงูููุงุฆู ุงููููุฐุฉ ุฏุงุฎูููุงุ ุงูุชุจ ุงูุฌูุงุจ ููุท."
                "ูุจู ุนุฑุถ ุงูุงุฌุงุจุฉ ูููุงุฑุฆ ุชุงูุฏ ุฏุงุฎูููุง ูู ุงู ุงูุงุฌุงุจุฉ ุฏูููุฉ ูููุชููุฉ ุจูุงุก ุนูู ุงููุนูููุงุช ุงููุชููุฑุฉ ูู ุงูุณูุงูุ ูููุชุฒูุฉ ุจูุงูุฉ ุงูุดุฑูุท ูุงูุชุนูููุงุช ุงููุงุฑุฏุฉ."
            ),
        },
        {"role": "system", "content": PERSONA_PREAMBLE},
        {"role": "system", "content": f"\n\nุงูุฑุณุงุฆู ุงูุณุงุจูุฉ:\n{formatted_history}"},
        {
            "role": "user",
            "content": f"ุงูุณูุงู:\n\n{context}\n\nุงูุณุคุงู: {query}",
        },    
    ]
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=1,
    )
    print(messages)
    answer_text = response.choices[0].message.content.strip()

    # Build unique citations list from retrieved chunks' metadata
    citations = []

    if retrieved_chunks:
        seen = set()
        for item in retrieved_chunks:
            md = item.get("metadata", {})
            title = (md.get("title") or "").strip()
            source = (md.get("source") or "").strip()
            key = (title, source)
            if (title or source) and key not in seen:
                seen.add(key)
                if title and source:
                    citations.append(f"- {title} โ {source}")
                elif title:
                    citations.append(f"- {title}")
                else:
                    citations.append(f"- {source}")

    if citations:
        answer_text = f"{answer_text}\n\nุงููุตุงุฏุฑ:\n" + "\n".join(citations)

    print(answer_text)
    return answer_text

In [10]:
with open("/Users/mohamad/Documents/GitHub/Personalized-RAG-Chatbot/character.json", "r", encoding="utf-8") as f:
    character = json.load(f)

def build_persona_preamble(c) -> str:
    if isinstance(c, list):
        c = next((x for x in c if isinstance(x, dict) and 'lexicon' in x), (c[0] if c and isinstance(c[0], dict) else {}))
    elif not isinstance(c, dict):
        c = {}

    role_instructions = c.get("role_instructions")
    t = c.get("tone", {})
    tone = ", ".join(t.values())

    lex = c.get("lexicon") or {}
    inv = "\n- ".join(lex.get("invocations") or [])
    honors = "\n- ".join(lex.get("honorifics") or [])
    ashura = "\n- ".join(lex.get("ashura_register") or [])
    bins = "\n- ".join(lex.get("binaries") or [])
    values = "\n- ".join(lex.get("values") or [])

    dm_formal = "\n- ".join(lex.get("discourse_markers_formal") or [])
    dm_colloq = "\n- ".join(lex.get("discourse_markers_colloquial") or [])
    emph = "\n- ".join(lex.get("emphasis_markers") or [])
    key_terms = "\n- ".join(lex.get("key_terms") or [])

    reh = c.get("rhetorical_scaffold") or {}
    open = "\n- ".join(reh.get("open") or [])
    develop = "\n- ".join(reh.get("develop") or [])
    evidence = "\n- ".join(reh.get("evidence") or [])
    application = "\n- ".join(reh.get("application") or [])
    closure = "\n- ".join(reh.get("closure") or [])

    pacing = c.get("response_pacing", {})
    response_pacing = ", ".join(pacing.values())

    greetings = "\n- ".join(c.get("greeting_templates") or [])
    closing = "\n- ".join(c.get("closing_templates") or [])
    condolences = "\n- ".join(c.get("condolence_templates") or [])

    q = c.get("quote_frames", {})
    quote_frames = ", ".join(q.values())

    do = "\n- ".join(c.get("do") or [])
    dont = "\n- ".join(c.get("dont") or [])

    snippet = c.get("style_snippets", {})
    style_snippets = ", ".join(snippet.values())

    micro = c.get("micro_templates", {})
    micro_templates = ", ".join(micro.values())

    topics = "\n- ".join(c.get("topics") or [])

    tk = c.get("topics_knowledge") or {}
    personal_section = ""
    other_topics_sections = []
    if isinstance(tk, dict):
        for name, data in tk.items():
            is_personal = isinstance(name, str) and "ุณูุฑุฉ ุงูุณูุฏ ูุงุดู ุตููู ุงูุฏูู" in name
            highlights = []
            points = []
            use_with = None
            if isinstance(data, dict):
                highlights = data.get("highlights") or []
                points = data.get("points") or []
                use_with = data.get("use_with")
            if is_personal:
                lines = []
                if points:
                    lines.extend(points)
                elif highlights:
                    lines.extend(highlights)
                else:
                    lines.extend([f"{k}: {v}" for k, v in data.items()])
                personal_section = "\n".join(["ุงูุณูุฑุฉ ุงูุดุฎุตูุฉ:"] + [f"- {x}" for x in lines])
            else:
                lines = []
                if highlights:
                    lines.extend(highlights)
                elif points:
                    lines.extend(points)
                else:
                    lines.extend([f"{k}: {v}" for k, v in data.items()])
                section = "\n".join([name + ":"] + [f"- {x}" for x in lines] + ([f"- use_with: {use_with}"] if use_with else []))
                other_topics_sections.append(section)
    topics_knowledge_personal = personal_section
    topics_knowledge_other = "\n\n".join(other_topics_sections)

    cu = c.get("contextual_usage") or {}
    contextual_usage = "\n".join(["ูููุฏ ุงูุงุณุชุฎุฏุงู ุงูุณูุงูู (ุฅูุฒุงูู):"] + [f"- {k}: {v}" for k, v in cu.items()])

    return (
        f"ุงูุดุฎุตูุฉ: {c.get('name','')}\n"
        f"ุงูุบุฑุถ: {c.get('purpose','')}\n"
        f"ุชุนูููุงุช ุงูุฏูุฑ: {role_instructions}\n"
        f"ุงููุจุฑุฉ: {tone}\n"
        f"ุงูุงูุชุชุงุญูุงุช:\n- {inv}\n"
        f"ุงูุฃููุงุจ:\n- {honors}\n"
        f"ุณุฌู ุนุงุดูุฑุงุฆู:\n- {ashura}\n"
        f"ุงูุซูุงุฆูุงุช:\n- {bins}\n"
        f"ุงูููู:\n- {values}\n"
        f"ุฑูุงุจุท ุงูุฎุทุงุจ (ูุตุญู):\n- {dm_formal}\n"
        f"ุฑูุงุจุท ุงูุฎุทุงุจ (ุนุงููุฉ):\n- {dm_colloq}\n"
        f"ุนูุงูุงุช ุงูุชุฃููุฏ:\n- {emph}\n"
        f"ูุตุทูุญุงุช ููุชุงุญูุฉ:\n- {key_terms}\n"
        f"ุงูุชูููุฏ ุงูุจูุงุบู:\n- {open}\n"
        f"ุงูุชุทููุฑ ุงูุจูุงุบู:\n- {develop}\n"
        f"ุฃูุซูุฉ ูุฃุฏูุฉ:\n- {evidence}\n"
        f"ุชุทุจูู ุงูุจูุงุบุฉ:\n- {application}\n"
        f"ุงูุฅุบูุงู ุงูุจูุงุบู:\n- {closure}\n"
        f"ุฅููุงุน ุงูุงุณุชุฌุงุจุฉ: {response_pacing}\n"
        f"ููุงูุจ ุงูุชุฑุญูุจ:\n- {greetings}\n"
        f"ููุงูุจ ุงูุฎุชุงู:\n- {closing}\n"
        f"ููุงูุจ ุงูุชุนุฒูุฉ:\n- {condolences}\n"
        f"ุฃุทุฑ ุงูุงูุชุจุงุณ: {quote_frames}\n"
        f"ุงูุนู:\n- {do}\n"
        f"ูุง ุชูุนู:\n- {dont}\n"
        f"ููุชุทูุงุช ุฃุณููุจูุฉ: {style_snippets}\n"
        f"ููุงูุจ ุฏูููุฉ: {micro_templates}\n"
        f"ุงูููุถูุนุงุช:\n- {topics}\n"
        f"{topics_knowledge_personal}\n\n{topics_knowledge_other}\n"
        f"{contextual_usage}\n"
        f"ุชูุจูู: ุงูุงูุชุฒุงู ุจูุง ุณุจู ุฅูุฒุงูู ูู ูู ุฅุฌุงุจุฉ."
    )

PERSONA_PREAMBLE = build_persona_preamble(character)

In [23]:
query = "ูุง ูู ุงูุนูุงูุงุช ุงูุชู ุชุฏู ุนูู ุงุณุชุฌุงุจุฉ ุฏุนุงุก ุงูุฅูุณุงูุ"
query = "ููู ูููู ููุฅูุณุงู ุฃู ูุนุฑู ุฃู ุฏุนุงุกู ูุฏ ุงุณุชุฌูุจุ"
query = "ูุง ูู ุฑุฃูู ูููุง ุญุฏุซ ุจุงูุฃูุณุ"

results = search_index(query)

retrieved_chunks = results
answer = generate_answer_with_history(user_id=1, query=query, retrieved_chunks=retrieved_chunks, formatted_history="")
print(answer)

[{'role': 'system', 'content': 'ุฃูุช ูุณุงุนุฏ ูุฌูุจ ุนูู ุฃูู ุงูุณูุฏ ูุงุดู ุตูู ุงูุฏูู.ุงุณุชุนู ุฏุงุฎูููุง ุจูุงุฆูุฉ ูุฎุชุตุฑุฉ (3-7 ุนูุงุตุฑ) ููุฎุทูุงุช ุงูููุงููููุฉ ูุจู ุชูุฏูู ุฃู ุฅุฌุงุจุฉุ ููู ูุง ุชุทุจุน ุฃู ุชุฏุฑุฌ ูุฐู ุงููุงุฆูุฉ ูู ุงูุฑุฏ.ุนูุฏ ุงูุชุนุงูู ูุน ุฃุณุฆูุฉ ุชุญูุฉ ุฃู ุฃุณุฆูุฉ ุนุงูุฉ ุจุณูุทุฉ ูุซู ุงูุณูุงู ุนูููู ุฃู ููู ุงูุญุงูุ ูููู ุงูุฅุฌุงุจุฉ ุนูููุง ูุจุงุดุฑุฉ ุฏูู ุงูุญุงุฌุฉ ููุงุนุชูุงุฏ ุนูู ูุตูุต ุงูุณูุงู.ุงุนุชูุฏ ูู ุฅุฌุงุจุชู ููุท ุนูู ุงููุตูุต ุงููุชููุฑุฉ ูู ุงูุณูุงู ููุฃุณุฆูุฉ ุงูุฃุฎุฑู. ุฅุฐุง ูู ููู ุงูุฌูุงุจ ูุงุถุญูุง ููุงููุงู ูู ุงูุณูุงูุ ูู ุฅูู ูุง ููุฌุฏ ุฅุฌุงุจุฉ..ูุง ูุฌุจ ุฃู ุชุฌูุจ ุนูู ุฃู ุณุคุงู ุฅุฐุง ูู ูุชู ุงุณุชุฑุฌุงุน ุฃูุฉ ููุงุทุน ูุตูุฉุ ููุง ุชุฐูุฑ ุฃู ููุถูุน ูู ุฅุฌุงุจุงุชู ูุง ูู ููู ููุฌู

In [24]:
history = "{'role': 'system', 'content': '\n\nุงูุฑุณุงุฆู ุงูุณุงุจูุฉ:\nuser: aan shu btaaref tehke'}, {'role': 'user', 'content': 'ุงูุณูุงู:\n\n\n\nุงูุณุคุงู: ุนู ูุงุฐุง ุชุณุชุทูุน ุงูุชุญุฏุซุ'}]"
query = "hello"
# query =  "ุณูุงู"

refined_query = reformulate_query(query)
print("๐ Reformulated:", refined_query)


๐ Reformulated: ูุฑุญุจูุงุ ููู ูููููู ูุณุงุนุฏุชู ุงููููุ
