In [7]:
import sys
import os
import numpy as np

# Put the project root (IR-project) on the module search path. The root is
# taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path[:0] = [project_root]

print("Added to sys.path:", project_root)

from query_processor import query_service

dataset = "antique"   # or "beir"
query = "Should teachers get tenure?"

# query_service comes from the project's query_processor module. The recorded
# run shows a (1, 204284) result, so the row is densified exactly once and the
# same array is reused for the sample print and the non-zero scan.
vector = query_service(dataset, query)
print("Vector shape:", vector.shape)

vector_array = vector.toarray()[0]
print("Vector sample values:", vector_array[:10])

# Positions of the entries that carry a non-zero weight.
nonzero_indices = np.nonzero(vector_array)[0]

print(f"عدد القيم غير الصفرية: {len(nonzero_indices)}")
for idx in nonzero_indices:
    print(f"Index: {idx}, TF-IDF: {vector_array[idx]}")


Added to sys.path: c:\Users\HP\IR-project
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\antique_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Type of self.vectorizer: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Tokens after preprocessing: ['teacher', 'get', 'tenur']
Missing tokens from vocabulary: []
Non-zero elements in vector: 3
Vector shape: (1, 204284)
Vector sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
عدد القيم غير الصفرية: 3
Index: 76030, TF-IDF: 0.2291259274224079
Index: 172484, TF-IDF: 0.5033723598545621
Index: 173256, TF-IDF: 0.8331371896136138


In [8]:
import sys
import os

# Add the project root folder (IR-project) to the import search path; it is
# resolved as the directory one level above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_listed = project_root in sys.path
if not already_listed:
    sys.path.insert(0, project_root)

print("Added to sys.path:", project_root)

from query_processor import query_service

dataset = "beir"
query = "Tell me about the Roman Empire."

# Vectorize the query with the project's query_service and show its shape
# plus the first ten entries of the dense row.
vector = query_service(dataset, query)
print("Vector shape:", vector.shape)
first_row = vector.toarray()[0]
print("Vector sample values:", first_row[:10])


Added to sys.path: c:\Users\HP\IR-project
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Type of self.vectorizer: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Tokens after preprocessing: ['tell', 'roman', 'empir']
Missing tokens from vocabulary: []
Non-zero elements in vector: 3
Vector shape: (1, 711463)
Vector sample values: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
import os
import joblib

# Make sure BASE_PATH points at the directory that holds the saved vectorizer
# files (resolved relative to the current working directory).
BASE_PATH = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "vectorize", "saved_models")
)

def load_vectorizer(name: str, vectorizer_type: str = "tfidf"):
    """Load a saved vectorizer file with joblib and return the loaded object.

    Args:
        name: File prefix; the file read is ``<name>_vectorizer.joblib``.
        vectorizer_type: Sub-directory of ``BASE_PATH`` that holds the file.

    Returns:
        The object produced by ``joblib.load`` for that file.
    """
    filename = f"{name}_vectorizer.joblib"
    artifact_path = os.path.join(BASE_PATH, vectorizer_type, filename)
    print(f"Loading vectorizer from: {artifact_path}")
    # NOTE(review): joblib.load unpickles the file, so only point this at
    # artifacts produced by a trusted source.
    loaded = joblib.load(artifact_path)
    print(f"Loaded object type: {type(loaded)}")
    return loaded

datasets = ["antique", "beir"]

# Smoke test: the "<dataset>_all" vectorizer of every dataset must load and
# identify itself as a TfidfVectorizer. Failures are reported, not re-raised.
for dataset_name in datasets:
    print(f"\nTesting dataset: {dataset_name}")
    file_prefix = dataset_name + "_all"
    try:
        vectorizer = load_vectorizer(file_prefix)
        type_label = str(type(vectorizer))
        assert "TfidfVectorizer" in type_label, (
            "Loaded object is not a TfidfVectorizer!"
        )
        print(f"✔️ {dataset_name} vectorizer loaded correctly and is a TfidfVectorizer.")
    except Exception as exc:
        print(f"❌ Error loading vectorizer for {dataset_name}: {exc}")



Testing dataset: antique
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\antique_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
✔️ antique vectorizer loaded correctly and is a TfidfVectorizer.

Testing dataset: beir
Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
✔️ beir vectorizer loaded correctly and is a TfidfVectorizer.


In [2]:
import sys
import os

# Register the project root folder (IR-project) on the import search path;
# the root is the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from query_processing import process
import numpy as np

dataset = "beir"
query = "what is Iraq?"

# process (project module query_processing) returns the query vector together
# with the tokens it produced.
vector, tokens = process(dataset, query)

print("Vector shape:", vector.shape)
# NOTE(review): the recorded run prints ['iraq', 'iraq'] here although the
# query contains the word once; verify the preprocessing inside process().
print("Tokens after preprocessing:", tokens)

# Dense copy of the single row, then the positions holding non-zero weights.
vector_array = vector.toarray()[0]
nonzero_indices = np.flatnonzero(vector_array)

print(f"عدد القيم غير الصفرية: {nonzero_indices.size}")
for idx in nonzero_indices:
    print(f"Index: {idx}, TF-IDF: {vector_array[idx]}")


Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Vector shape: (1, 711463)
Tokens after preprocessing: ['iraq', 'iraq']
عدد القيم غير الصفرية: 1
Index: 313706, TF-IDF: 1.0


In [1]:
from ranking import match_and_rank
from evaluation import compute_map

dataset = "beir"
query = "what is Iraq?"

# NOTE(review): this path is relative to the current working directory;
# confirm the notebook is started from the folder that contains dataBases/.
qrels_file = "dataBases/{}_qrels.tsv".format(dataset)

# NOTE(review): the recorded run of this cell ended with FileNotFoundError for
# the beir inverted index, looked up under a path outside IR-project. Check how
# the project modules build that path before relying on this cell.
results = match_and_rank(query, dataset)
map_score = compute_map(results, qrels_file)

top_five = list(results.items())[:5]
print("Top ranked documents:", top_five)
print("MAP Score: {:.4f}".format(map_score))


Loading vectorizer from: c:\Users\HP\IR-project\vectorize\saved_models\tfidf\beir_all_vectorizer.joblib
Loaded object type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>


FileNotFoundError: لم يتم العثور على الفهرس المعكوس: c:\Users\HP\indexing\saved_models\inverted_index\beir_inverted_index.joblib