In [1]:
pip install transformers rank_bm25 datasets evaluate

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m


In [1]:
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# تحميل مجموعة بيانات SQuAD (الإنجليزية)
dataset = load_dataset("squad")
questions = [q["question"] for q in dataset["train"]]
contexts = [q["context"] for q in dataset["train"]]
answers = [q["answers"]["text"][0] for q in dataset["train"]]

# نستخدم فقط 1000 مثال للسرعة
questions = questions[:1000]
contexts = contexts[:1000]
answers = answers[:1000]

# Tokenization للنصوص الإنجليزية
tokenized_corpus = [context.split() for context in contexts]
bm25 = BM25Okapi(tokenized_corpus)

# دالة البحث
def classical_ir_search(query, n=5):
    tokenized_query = query.split()
    doc_scores = bm25.get_scores(tokenized_query)
    top_n = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:n]
    return [contexts[i] for i in top_n]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
# تحميل نموذج BERT للإنجليزية
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


# دالة استخراج الجواب بواسطة LLM
def llm_answer(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )
    return answer

# دالة كاملة: سؤال ➡️ بحث ➡️ استدعاء نموذج ➡️ جواب
def qa_system(question):
    retrieved_contexts = classical_ir_search(question, n=1)
    best_context = retrieved_contexts[0]
    answer = llm_answer(question, best_context)
    return answer


In [None]:
!pip install bert_score


In [3]:
from evaluate import load
bertscore = load("bertscore")

def evaluate_methods(question, true_answer):
    # البحث بالطريقة الكلاسيكية
    bm25_results = classical_ir_search(question)
    bm25_answer = bm25_results[0][:500]  # نأخذ أول 500 حرف من الفقرة

    # البحث بنموذج LLM
    llm_context = bm25_results[0]  # نستخدم أفضل نتيجة من BM25 كسياق
    predicted_answer = llm_answer(question, llm_context)

    # التقييم باستخدام BERTScore مع اللغة الإنجليزية
    bm25_score = bertscore.compute(
        predictions=[bm25_answer], references=[true_answer], lang="en"
    )["f1"][0]

    llm_score = bertscore.compute(
        predictions=[predicted_answer], references=[true_answer], lang="en"
    )["f1"][0]

    return {
        "question": question,
        "true_answer": true_answer,
        "bm25_answer": bm25_answer,
        "llm_answer": predicted_answer,
        "bm25_score": bm25_score,
        "llm_score": llm_score
    }


In [5]:
# --- اختبار على سؤال عشوائي ---
sample_idx = 42

result = evaluate_methods(questions[sample_idx], answers[sample_idx])

print("="*50)
print(f"🔹 السؤال:\n{result['question']}")
print(f"\n✅ الإجابة الصحيحة:\n{result['true_answer']}")
print(f"\n📄 إجابة BM25:\n{result['bm25_answer']}\n(التقييم: {result['bm25_score']:.2f})")
print(f"\n🤖 إجابة BERT:\n{result['llm_answer']}\n(التقييم: {result['llm_score']:.2f})")
print("="*50)

# --- تقييم على عدة أسئلة ---
import numpy as np

scores = []

# نجرب على أول 50 سؤال لتسريع التنفيذ
for i in range(100):
    res = evaluate_methods(questions[i], answers[i])
    scores.append((res["bm25_score"], res["llm_score"]))

# حساب المتوسطات
bm25_avg = np.mean([s[0] for s in scores])
llm_avg = np.mean([s[1] for s in scores])

print("\n📊 النتائج النهائية على 100 سؤال:")
print(f"🔵 متوسط تقييم BM25: {bm25_avg:.2f}")
print(f"🟢 متوسط تقييم BERT QA: {llm_avg:.2f}")


🔹 السؤال:
What percentage of students at Notre Dame participated in the Early Action program?

✅ الإجابة الصحيحة:
39.1%

📄 إجابة BM25:
In 2015-2016, Notre Dame ranked 18th overall among "national universities" in the United States in U.S. News & World Report's Best Colleges 2016. In 2014, USA Today ranked Notre Dame 10th overall for American universities based on data from College Factual. Forbes.com's America's Best Colleges ranks Notre Dame 13th among colleges in the United States in 2015, 8th among Research Universities, and 1st in the Midwest. U.S. News & World Report also lists Notre Dame Law School as 22nd overall. Busine
(التقييم: 0.81)

🤖 إجابة BERT:
57. 6 %
(التقييم: 0.96)





📊 النتائج النهائية على 50 سؤال:
🔵 متوسط تقييم BM25: 0.80
🟢 متوسط تقييم BERT QA: 0.91
