In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os

model = SentenceTransformer("intfloat/multilingual-e5-large")
dim = model.get_sentence_embedding_dimension()


  from .autonotebook import tqdm as notebook_tqdm


In [39]:
import os
import json

chunks = []

folder = "chunks"
for filename in os.listdir(folder):
    if filename.endswith(".json"):
        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)  # could be a list of chunks
            if isinstance(data, list):
                chunks.extend(data)  # add all chunks
            elif isinstance(data, dict):
                chunks.append(data)  # add single chunk

print(f"✅ Loaded {len(chunks)} chunks from {folder}")

texts = [c["content"] for c in chunks]
metadata = [
    {
        "id": c["chunk_id"],
        "title": c["title"],
        "source": c["source"],
        "text": c["content"]
    }
    for c in chunks
]

✅ Loaded 38 chunks from chunks


In [40]:
from tqdm import tqdm

embeddings = model.encode(
    [f"passage: {t}" for t in tqdm(texts, desc="Encoding chunks")],
    batch_size=16,
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")


Encoding chunks: 100%|██████████| 38/38 [00:00<?, ?it/s]


In [41]:
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

os.makedirs("storage", exist_ok=True)
faiss.write_index(index, "storage/index.faiss")

with open("storage/metadata.jsonl", "w", encoding="utf-8") as f:
    for m in metadata:
        f.write(json.dumps(m, ensure_ascii=False) + "\n")

print(f"✅ Stored {len(metadata)} chunks into FAISS index")

✅ Stored 38 chunks into FAISS index


In [None]:
import faiss
import json
from sentence_transformers import SentenceTransformer, CrossEncoder

# === Load FAISS index and metadata ===
index = faiss.read_index("storage/index.faiss")
with open("storage/metadata.jsonl", "r", encoding="utf-8") as f:
    metadata = [json.loads(line) for line in f]

# === Load embedding model (same one you used to build the index) ===
embedder = SentenceTransformer("intfloat/multilingual-e5-base")

# === Load cross-encoder reranker ===
reranker = CrossEncoder("Omartificial-Intelligence-Space/ARA-Reranker-V1")

# === Query ===
query = "الناس عبيد الدنيا"

# Step 1: Encode query with "query:" format
q_emb = embedder.encode(
    [f"query: {query}"],
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")

# Step 2: Retrieve top-k candidates from FAISS
D, I = index.search(q_emb, k=10)  # get top-10
candidates = [metadata[idx]["content"] for idx in I[0]]

# Step 3: Re-rank with cross-encoder
pairs = [[query, passage] for passage in candidates]
rerank_scores = reranker.predict(pairs)

# Step 4: Sort by reranker scores
ranked = sorted(zip(candidates, rerank_scores), key=lambda x: x[1], reverse=True)

# Step 5: Print top-3 results after reranking
print("\n🔎 Top-3 Results After Reranking:\n")
for i, (chunk, score) in enumerate(ranked[:3]):
    print(f"Rank {i+1} | Score: {score:.3f}")
    print("Chunk:", chunk)
    print("---")


In [7]:
import os
import json
from transformers import AutoTokenizer

# Load a tokenizer (you can replace with an Arabic-compatible one if needed)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

folder_path = "chunks"  # folder containing your JSON files

for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)  # assuming each file contains a list of JSON objects

        print(f"\n📂 File: {filename}")
        for i, element in enumerate(data):
            content = element.get("content", "")
            tokens = tokenizer.encode(content, add_special_tokens=False)
            print(f"  Element {i} → {len(tokens)} tokens")



📂 File: 2.json
  Element 0 → 302 tokens
  Element 1 → 263 tokens
  Element 2 → 245 tokens
  Element 3 → 244 tokens
  Element 4 → 276 tokens
  Element 5 → 287 tokens
  Element 6 → 285 tokens
  Element 7 → 296 tokens
  Element 8 → 290 tokens
  Element 9 → 284 tokens

📂 File: 3.json
  Element 0 → 298 tokens
  Element 1 → 219 tokens
  Element 2 → 235 tokens
  Element 3 → 132 tokens
  Element 4 → 185 tokens
  Element 5 → 271 tokens
  Element 6 → 154 tokens
  Element 7 → 204 tokens
  Element 8 → 231 tokens
  Element 9 → 193 tokens
  Element 10 → 163 tokens
  Element 11 → 195 tokens

📂 File: 4.json
  Element 0 → 185 tokens
  Element 1 → 175 tokens
  Element 2 → 132 tokens
  Element 3 → 161 tokens
  Element 4 → 172 tokens
  Element 5 → 157 tokens
  Element 6 → 143 tokens
  Element 7 → 142 tokens
  Element 8 → 171 tokens

📂 File: id.json
  Element 0 → 68 tokens
  Element 1 → 62 tokens
  Element 2 → 60 tokens
  Element 3 → 58 tokens
  Element 4 → 42 tokens
  Element 5 → 71 tokens
  Element 6 → 

In [24]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_answer(query, retrieved_chunks, model="gpt-4o-mini"):

    context = "\n\n".join([c["text"] for c in retrieved_chunks])

    messages = [
        {
            "role": "system",
            "content": (
                "أنت مساعد يجيب بناءا على فكر السيد هاشم صفي الدين."
                "اعتمد في اجابتك على النصوص المتوفرة في السياق. "
                "إذا لم يكن الجواب واضحا وكاملا في السياق، قل أنك لا تعرف. "
                "تكلم باحترام عن الشخصيات الشيعية, مع ذكر الألقاب المناسبة."
                "أجب دائمًا باللغة العربية الفصحى الواضحة."
            ),
        },
        {
            "role": "user",
            "content": f"السياق:\n{context}\n\nالسؤال: {query}",
        },
    ]

    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
    )

    return response.choices[0].message.content.strip()


In [25]:
query = "شو هي الأسماء اللي علمها الله سبحانه وتعالى للنبي آدم؟"
query = "ما رأي السيد هاشم بالامام علي؟"

q_emb = model.encode([
    f"query: {query}"
], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

k = 3
D, I = index.search(q_emb, k)
results = [metadata[idx] for idx in I[0]]

retrieved_chunks = results
answer = generate_answer(query, retrieved_chunks)
print(answer)

TypeError: sequence item 0: expected str instance, dict found