In [1]:
import os
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm


def load_model_from_disk(model_path):
    """Deserialize a joblib-saved model from ``model_path``.

    Args:
        model_path: Filesystem path of the serialized model file.

    Returns:
        The object produced by ``joblib.load`` for that file.

    Raises:
        FileNotFoundError: If nothing exists at ``model_path``.
    """
    # NOTE(review): joblib.load unpickles the file, so only point this at
    # model files from a trusted source.
    if os.path.exists(model_path):
        return joblib.load(model_path)
    raise FileNotFoundError(f"❌ Model not found at {model_path}")


def load_queries_from_csv(file_path):
    """Read the ``text`` column of a CSV file as a list of strings.

    Every cell is returned verbatim as a ``str`` and the list keeps one entry
    per CSV row, so positions in the result line up with the rows of the file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[str]: The values of the ``text`` column, in file order. Empty
        cells are returned as ``""``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the CSV has no ``text`` column.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"❌ File not found: {file_path}")

    # Disable pandas' type/NA inference: by default an empty cell or a query
    # such as "NA"/"null" becomes float NaN and a digit-only query becomes an
    # int, neither of which is valid text input for an encoder.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    if 'text' not in df.columns:
        raise ValueError("❌ CSV must contain a 'text' column")

    # Rows that are shorter than the header can still yield missing values;
    # map them to "" instead of dropping them so row alignment is preserved.
    return df['text'].fillna("").tolist()


def create_query_embeddings(query_csv_path, dataset_name, batch_size=3000):
    """Encode the queries of a CSV file and cache the embedding matrix.

    The matrix is written to ``data/<dataset_name>/query_embeddings_matrix.joblib``.
    If that file already exists the function prints a notice and returns
    without doing any work. The encoder is loaded from
    ``data/<dataset_name>/embeddings_vectorizer.joblib``.

    Args:
        query_csv_path: CSV file passed to ``load_queries_from_csv``.
        dataset_name: Name of the sub-directory of ``data/`` to use.
        batch_size: Number of queries handed to ``model.encode`` per call.

    Returns:
        None. Results are written to disk; any ``Exception`` raised along the
        way is reported with ``print`` rather than propagated.
    """
    try:
        output_dir = f"data/{dataset_name}"
        os.makedirs(output_dir, exist_ok=True)

        embeddings_file = os.path.join(output_dir, "query_embeddings_matrix.joblib")
        vectorizer_path = os.path.join(output_dir, "embeddings_vectorizer.joblib")

        if os.path.exists(embeddings_file):
            print(f"✔️ Embeddings already exist for {dataset_name}. Skipping.")
            return

        # Load the queries.
        queries = load_queries_from_csv(query_csv_path)
        if not queries:
            print("⚠️ No queries found.")
            return

        # Load the saved model; it must expose
        # ``encode(list_of_texts, convert_to_numpy=True)``.
        model = load_model_from_disk(vectorizer_path)

        # One progress tick per batch: len(range(...)) is the batch count.
        batch_starts = range(0, len(queries), batch_size)
        batch_matrices = [
            model.encode(queries[start:start + batch_size], convert_to_numpy=True)
            for start in tqdm(batch_starts, desc=f"Encoding queries for {dataset_name}")
        ]

        embeddings = np.vstack(batch_matrices).astype(np.float32)

        # Save atomically: the existence check above treats any file at
        # ``embeddings_file`` as a finished cache, so a dump interrupted
        # half-way must never leave a truncated file under that name.
        partial_file = embeddings_file + ".tmp"
        try:
            joblib.dump(embeddings, partial_file)
            os.replace(partial_file, embeddings_file)
        finally:
            # Still present only if the dump or the rename did not complete.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        print(f"✅ Embeddings saved: {embeddings.shape}")
    except Exception as e:
        print(f"❌ Error: {str(e)}")


# Entry point: build the query embeddings for both datasets.
if __name__ == "__main__":
    # Build the CSV paths with os.path.join: hard-coded backslash separators
    # only resolve on Windows.
    create_query_embeddings(os.path.join("data", "antique", "queries_antique.csv"), "antique")
    create_query_embeddings(os.path.join("data", "beir", "queries_beir.csv"), "beir")


  from .autonotebook import tqdm as notebook_tqdm
Encoding queries for antique: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]


✅ Embeddings saved: (176, 768)


Encoding queries for beir: 100%|██████████| 4/4 [00:22<00:00,  5.51s/it]

✅ Embeddings saved: (10000, 768)





In [1]:
import os
import sqlite3
import numpy as np
import joblib
from tqdm import tqdm

def load_model_from_disk(model_path):
    """Return the object stored in the joblib file at ``model_path``.

    Args:
        model_path: Location of the serialized model on disk.

    Returns:
        The result of ``joblib.load(model_path)``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    model_file_missing = not os.path.exists(model_path)
    if model_file_missing:
        raise FileNotFoundError(f"❌ Model not found at {model_path}")
    # NOTE(review): joblib.load unpickles the file; use trusted files only.
    loaded_model = joblib.load(model_path)
    return loaded_model

def load_documents_emb(dataset_name):
    """Load the non-empty ``processed_text`` values of a dataset's index.

    Reads the ``documents`` table of the SQLite database at
    ``data/<dataset_name>/index.db``.

    Args:
        dataset_name: Name of the sub-directory of ``data/`` holding ``index.db``.

    Returns:
        list: The truthy ``processed_text`` values in the order SQLite returns
        them. An empty list is returned (after printing a message) when the
        database file is missing or any error occurs while reading it.
    """
    try:
        db_path = f"data/{dataset_name}/index.db"
        if not os.path.exists(db_path):
            print(f"❌ Database not found at {db_path}")
            return []

        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute("SELECT processed_text FROM documents")
            # Stream the cursor and keep only non-empty texts.
            # NOTE(review): skipped rows and the absence of ORDER BY mean list
            # positions are not tied to document ids; confirm this is what
            # consumers of the embeddings expect.
            return [text for (text,) in cursor if text]
        finally:
            # Close even when the query fails so the handle is not leaked.
            conn.close()
    except Exception as e:
        print(f"❌ Error loading documents: {e}")
        return []

def create_query_embeddings_from_db(dataset_name, batch_size=3000):
    """Encode the processed texts stored in a dataset's database and cache them.

    Texts come from ``load_documents_emb(dataset_name)``; the encoder is loaded
    from ``data/<dataset_name>/embeddings_vectorizer.joblib``. The resulting
    float32 matrix is written to
    ``data/<dataset_name>/query_embeddings_matrix.joblib``. If that file
    already exists the function prints a notice and returns immediately.

    NOTE(review): the texts encoded here are read from the ``documents`` table,
    yet the output uses the *query* embeddings filename. If query embeddings
    are stored under the same name, whichever is created first wins and the
    other is skipped -- confirm the intended output file.

    Args:
        dataset_name: Name of the sub-directory of ``data/`` to use.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        None. Results are written to disk; any ``Exception`` raised along the
        way is reported with ``print`` rather than propagated.
    """
    try:
        output_dir = f"data/{dataset_name}"
        os.makedirs(output_dir, exist_ok=True)

        embeddings_file = os.path.join(output_dir, "query_embeddings_matrix.joblib")
        vectorizer_path = os.path.join(output_dir, "embeddings_vectorizer.joblib")

        if os.path.exists(embeddings_file):
            print(f"✔️ Embeddings already exist for {dataset_name}. Skipping.")
            return

        # Load the processed texts from the database.
        queries = load_documents_emb(dataset_name)
        if not queries:
            print(f"⚠️ No processed texts found in database for {dataset_name}.")
            return

        # Load the model; it must expose
        # ``encode(list_of_texts, convert_to_numpy=True)``.
        model = load_model_from_disk(vectorizer_path)

        # One progress tick per batch: len(range(...)) is the batch count.
        batch_starts = range(0, len(queries), batch_size)
        batch_matrices = [
            model.encode(queries[start:start + batch_size], convert_to_numpy=True)
            for start in tqdm(batch_starts, desc=f"Encoding queries for {dataset_name}")
        ]

        embeddings = np.vstack(batch_matrices).astype(np.float32)

        # Save atomically: the existence check above treats any file at
        # ``embeddings_file`` as a finished cache, so a dump interrupted
        # half-way must never leave a truncated file under that name.
        partial_file = embeddings_file + ".tmp"
        try:
            joblib.dump(embeddings, partial_file)
            os.replace(partial_file, embeddings_file)
        finally:
            # Still present only if the dump or the rename did not complete.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        print(f"✅ Embeddings saved: {embeddings.shape}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

# Entry point: encode the stored texts of each dataset in turn.
if __name__ == "__main__":
    for dataset in ("antique", "beir"):
        create_query_embeddings_from_db(dataset)


  from .autonotebook import tqdm as notebook_tqdm
Encoding queries for antique:   1%|▏         | 2/135 [01:33<1:43:47, 46.83s/it]


KeyboardInterrupt: 