In [1]:
import os
import joblib

# Make sure BASE_PATH points to the directory that holds the saved vectorizer files.
BASE_PATH = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "vectorize", "saved_models")
)

def load_vectorizer(name: str, vectorizer_type: str = "tfidf"):
    """Load a saved vectorizer from disk and print what was loaded.

    The file is read from ``BASE_PATH/<vectorizer_type>/<name>_vectorizer.joblib``.

    Args:
        name: File-name prefix of the saved vectorizer (e.g. ``"beir_all"``).
        vectorizer_type: Sub-directory of ``BASE_PATH`` to read from.

    Returns:
        The object that ``joblib.load`` deserializes from the file.
    """
    file_name = f"{name}_vectorizer.joblib"
    vectorizer_path = os.path.join(BASE_PATH, vectorizer_type, file_name)
    print(f"Loading vectorizer from: {vectorizer_path}")
    # NOTE(review): joblib.load unpickles the file, so only load trusted files;
    # every module the pickled object refers to must be importable here.
    loaded = joblib.load(vectorizer_path)
    print(f"Loaded object type: {type(loaded)}")
    return loaded

# Datasets whose saved vectorizers are checked below.
datasets = ["antique", "beir"]

for dataset_name in datasets:
    print(f"\nTesting dataset: {dataset_name}")
    file_prefix = f"{dataset_name}_all"
    try:
        vectorizer = load_vectorizer(file_prefix)
        # The check is done on the type's textual form, so no sklearn import is needed.
        loaded_type = str(type(vectorizer))
        assert "TfidfVectorizer" in loaded_type, "Loaded object is not a TfidfVectorizer!"
        print(f"✔️ {dataset_name} vectorizer loaded correctly and is a TfidfVectorizer.")
    except Exception as err:
        # Best-effort check: report the failure and carry on with the next dataset.
        print(f"❌ Error loading vectorizer for {dataset_name}: {err}")



Testing dataset: antique
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\antique_all_vectorizer.joblib
❌ Error loading vectorizer for antique: No module named 'TF_IDF'

Testing dataset: beir
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
❌ Error loading vectorizer for beir: No module named 'TF_IDF'


In [11]:
import sys
import os

# Add the project root (IR-project) to the module search path so that
# project-local modules can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from query_processing import process
import numpy as np

dataset = "beir"
query = "what is Iraq?"

# Vectorize the query exactly once; the result is unpacked straight into
# the vector and the list of tokens that survived preprocessing.
vector, tokens = process(dataset, query)

print("Vector shape:", vector.shape)
print("Tokens after preprocessing:", tokens)
# Take row 0 of the dense form so the non-zero weights can be listed by index.
vector_array = vector.toarray()[0]
nonzero_indices = np.nonzero(vector_array)[0]

print(f"عدد القيم غير الصفرية: {len(nonzero_indices)}")
for idx in nonzero_indices:
    print(f"Index: {idx}, TF-IDF: {vector_array[idx]}")


[DISK] Loading vectorizer for dataset: beir
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
[CACHE] Using cached vectorizer for dataset: beir
Vector shape: (1, 711463)
Tokens after preprocessing: ['iraq', 'iraq']
عدد القيم غير الصفرية: 1
Index: 313706, TF-IDF: 1.0


In [None]:
from ranking import match_and_rank

# Rank documents of the chosen dataset against a single test query.
dataset = "beir"
query = "What is also?"

results = match_and_rank(query, dataset)
# Only the first five items of the returned mapping are shown.
top_entries = list(results.items())[:5]
print("Top ranked docs:", top_entries)


[1] بدء معالجة الاستعلام...
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
[2] تم استخراج التوكنز من الاستعلام: ['also']
[Cache MISS] جاري تحميل البيانات لأول مرة للداتا سيت: beir
[*] محاولة تحميل الفهرس من: C:\Users\HP\IR-project\indexing\saved_models\inverted_index\beir_inverted_index.joblib
[3] شكل مصفوفة TF-IDF: (382545, 711463)
[4] عينة من doc_ids: [527629, 527630, 527631, 527632, 527633]
[5] عدد المفاتيح في الفهرس المعكوس: 711463
[6] التوكنز الموجودة في الفهرس المعكوس: ['also']
[7] عدد الوثائق المرشحة بعد الفحص: 112545
[8] تم بناء قاموس للمطابقة بين doc_id و index.
[9] عدد doc_ids في الفهرس المعكوس غير موجودة في doc_id_to_index: 0
[10] عدد مؤشرات الوثائق المرشحة بعد المطابقة: 112545
[11] تم استخراج تمثيلات TF-IDF للوثائق المرشحة. الشكل: (112545, 711463)
[12] تم حساب درجات التشابه. عدد الدرجات: 112545
[13] عدد الوثائق بعد تطبيق عتبة التشابه (0.0001

In [1]:
# Call run_evaluation from the project's `evaluation` module for the "antique" dataset.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt,
# so no evaluation results were captured in the saved output.
from evaluation import run_evaluation
run_evaluation("antique")


Evaluating query 3097310
[DISK] Loading vectorizer for antique
[DISK] Loading index data for antique from disk...


KeyboardInterrupt: 

In [2]:
import sys
import os

# Put the project root (IR-project) on the module search path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from storage.vector_storage import load_doc_ids

# Load the stored doc_ids.
doc_ids = load_doc_ids("beir_all")

# Print the first 10 entries together with the Python type of each one.
first_ten = doc_ids[:10]
for i, doc_id in enumerate(first_ten, start=1):
    print(f"{i}: {doc_id} (type: {type(doc_id)})")


1: 527629 (type: <class 'int'>)
2: 527630 (type: <class 'int'>)
3: 527631 (type: <class 'int'>)
4: 527632 (type: <class 'int'>)
5: 527633 (type: <class 'int'>)
6: 527634 (type: <class 'int'>)
7: 527635 (type: <class 'int'>)
8: 527636 (type: <class 'int'>)
9: 527637 (type: <class 'int'>)
10: 527638 (type: <class 'int'>)


In [1]:
import os
import sys
import joblib

# Set up the project root path so project-local packages can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from indexing.inverted_index_loader import load_inverted_index

index = load_inverted_index("beir")
# Inspect the entry stored under whichever token the index yields first.
sample_token = next(iter(index))
sample_ids = index[sample_token]
print(f"عينة من ids في الفهرس: {sample_ids[:5]}")
print(f"نوع أول id في الفهرس: {type(sample_ids[0])}")



[*] محاولة تحميل الفهرس من: C:\Users\HP\IR-project\indexing\saved_models\inverted_index\beir_inverted_index.joblib
عينة من ids في الفهرس: [1179658, 786468, 786471, 786476, 786493]
نوع أول id في الفهرس: <class 'int'>


In [2]:
%pip install ir_measures


Defaulting to user installation because normal site-packages is not writeable
Collecting ir_measures
  Downloading ir_measures-0.3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting pytrec-eval-terrier>=0.5.5 (from ir_measures)
  Downloading pytrec_eval_terrier-0.5.7-cp312-cp312-win_amd64.whl.metadata (1.0 kB)
Downloading ir_measures-0.3.7-py3-none-any.whl (60 kB)
Downloading pytrec_eval_terrier-0.5.7-cp312-cp312-win_amd64.whl (57 kB)
Installing collected packages: pytrec-eval-terrier, ir_measures
Successfully installed ir_measures-0.3.7 pytrec-eval-terrier-0.5.7
Note: you may need to restart the kernel to use updated packages.


