In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os

model = SentenceTransformer("intfloat/multilingual-e5-large")
dim = model.get_sentence_embedding_dimension()


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
import json

chunks = []

folder = "chunks"
for filename in os.listdir(folder):
    if filename.endswith(".json"):
        path = os.path.join(folder, filename)
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, list):
                chunks.extend(data)
            elif isinstance(data, dict):
                chunks.append(data)
            else:
                print(f"Skipping {filename} as it is not a list or dictionary.")

print(f"✅ Loaded {len(chunks)} chunks from {folder}")

texts = [c["content"] for c in chunks]
metadata = [
    {
        "id": c["chunk_id"],
        "title": c["title"],
        "source": c["source"],
        "text": c["content"]
    }
    for c in chunks
]

✅ Loaded 83 chunks from chunks


In [7]:
from tqdm import tqdm

embeddings = model.encode(
    [f"passage: {t}" for t in tqdm(texts, desc="Encoding chunks")],
    batch_size=16,
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")


Encoding chunks: 100%|██████████| 83/83 [00:00<00:00, 12285.69it/s]




In [8]:
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

os.makedirs("storage", exist_ok=True)
faiss.write_index(index, "storage/index.faiss")

with open("storage/metadata.jsonl", "w", encoding="utf-8") as f:
    for m in metadata:
        f.write(json.dumps(m, ensure_ascii=False) + "\n")

print(f"✅ Stored {len(metadata)} chunks into FAISS index")

✅ Stored 83 chunks into FAISS index


In [None]:
import faiss
import json
from sentence_transformers import SentenceTransformer, CrossEncoder

index = faiss.read_index("storage/index.faiss")
with open("storage/metadata.jsonl", "r", encoding="utf-8") as f:
    metadata = [json.loads(line) for line in f]

# reranker = CrossEncoder("Omartificial-Intelligence-Space/ARA-Reranker-V1")
reranker = CrossEncoder("oddadmix/arabic-reranker-v1")

query = "خبرني بعد"

q_emb = model.encode(
    [f"query: {query}"],
    convert_to_numpy=True,
    normalize_embeddings=True
).astype("float32")

D, I = index.search(q_emb, k=10)
candidates = []
for idx in I[0]:
    if "text" in metadata[idx]:
        text = str(metadata[idx]["text"]).strip()
        if text:
            candidates.append(text)
        else:
            print(text)
    else:
        print(f"Missing 'content' in metadata[{idx}]")
            
pairs = [[query, passage] for passage in candidates]
rerank_scores = reranker.predict(pairs)

ranked = sorted(zip(candidates, rerank_scores), key=lambda x: x[1], reverse=True)

print("\n🔎 Top-3 Results After Reranking:\n")
for i, (chunk, score) in enumerate(ranked[:3]):
    print(f"Rank {i+1} | Score: {score:.3f}")
    print("Chunk:", chunk)
    print("---")


In [3]:
import os
import json
from transformers import AutoTokenizer

# Load a tokenizer (you can replace with an Arabic-compatible one if needed)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

folder_path = "chunks"  # folder containing your JSON files

for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)  # assuming each file contains a list of JSON objects

        print(f"\n📂 File: {filename}")
        for i, element in enumerate(data):
            content = element.get("content", "")
            tokens = tokenizer.encode(content, add_special_tokens=False)
            print(f"  Element {i} → {len(tokens)} tokens")



📂 File: 1.json
  Element 0 → 155 tokens
  Element 1 → 165 tokens
  Element 2 → 138 tokens
  Element 3 → 144 tokens
  Element 4 → 102 tokens

📂 File: 2.json
  Element 0 → 413 tokens
  Element 1 → 249 tokens
  Element 2 → 248 tokens
  Element 3 → 345 tokens
  Element 4 → 259 tokens
  Element 5 → 364 tokens
  Element 6 → 359 tokens
  Element 7 → 312 tokens
  Element 8 → 324 tokens
  Element 9 → 326 tokens
  Element 10 → 315 tokens
  Element 11 → 324 tokens
  Element 12 → 312 tokens
  Element 13 → 338 tokens
  Element 14 → 299 tokens
  Element 15 → 334 tokens
  Element 16 → 331 tokens
  Element 17 → 337 tokens
  Element 18 → 318 tokens
  Element 19 → 303 tokens
  Element 20 → 299 tokens
  Element 21 → 337 tokens
  Element 22 → 346 tokens
  Element 23 → 331 tokens
  Element 24 → 346 tokens
  Element 25 → 353 tokens
  Element 26 → 354 tokens
  Element 27 → 347 tokens
  Element 28 → 369 tokens
  Element 29 → 353 tokens
  Element 30 → 313 tokens
  Element 31 → 320 tokens
  Element 32 → 143 to

In [24]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_answer(query, retrieved_chunks, model="gpt-4o-mini"):

    context = "\n\n".join([c["text"] for c in retrieved_chunks])

    messages = [
        {
            "role": "system",
            "content": (
                "أنت مساعد يجيب بناءا على فكر السيد هاشم صفي الدين."
                "اعتمد في اجابتك على النصوص المتوفرة في السياق. "
                "إذا لم يكن الجواب واضحا وكاملا في السياق، قل أنك لا تعرف. "
                "تكلم باحترام عن الشخصيات الشيعية, مع ذكر الألقاب المناسبة."
                "أجب دائمًا باللغة العربية الفصحى الواضحة."
            ),
        },
        {
            "role": "user",
            "content": f"السياق:\n{context}\n\nالسؤال: {query}",
        },
    ]

    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
    )

    return response.choices[0].message.content.strip()


In [25]:
query = "شو هي الأسماء اللي علمها الله سبحانه وتعالى للنبي آدم؟"
query = "ما رأي السيد هاشم بالامام علي؟"

q_emb = model.encode([
    f"query: {query}"
], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

k = 3
D, I = index.search(q_emb, k)
results = [metadata[idx] for idx in I[0]]

retrieved_chunks = results
answer = generate_answer(query, retrieved_chunks)
print(answer)

TypeError: sequence item 0: expected str instance, dict found