In [None]:
import os
import sqlite3
import numpy as np
import joblib
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

# ✅ Check for a GPU and pick the torch device used for encoding.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    # No CUDA device: encoding will run on the CPU.
    print("⚠️ لا يوجد GPU، سيتم استخدام المعالج العادي (CPU)")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU متوفر: {gpu_name}")
    # Warn when the card is not a T4; timings may differ on other cards.
    if "T4" not in gpu_name:
        print("⚠️ هذا ليس T4 GPU. قد تختلف السرعة حسب نوع البطاقة.")

# ✅ Load the processed texts from the database
def load_documents_emb(dataset_name):
    """Return the non-empty ``processed_text`` values of a dataset's queries.

    Reads ``data/<dataset_name>/index.db`` and selects the ``processed_text``
    column of its ``queries`` table.

    Args:
        dataset_name: Name of the dataset sub-directory under ``data``.

    Returns:
        list: The selected values with falsy entries (NULL / empty string)
        removed. An empty list is returned when the database file does not
        exist or when any error occurs; errors are printed, not raised.
    """
    conn = None
    try:
        db_path = os.path.join("data", dataset_name, "index.db")
        if not os.path.exists(db_path):
            print(f"❌ Database not found at {db_path}")
            return []

        conn = sqlite3.connect(db_path)
        rows = conn.execute("SELECT processed_text FROM queries").fetchall()

        # NOTE(review): falsy rows are dropped, so list positions may not
        # match row positions in the table — confirm callers do not rely on it.
        return [text for (text,) in rows if text]
    except Exception as e:
        print(f"❌ Error loading queries: {e}")
        return []
    finally:
        # Close the connection on every path; previously a failing query
        # skipped conn.close() and leaked the handle.
        if conn is not None:
            conn.close()

# ✅ Create the vector representations (embeddings)
def create_query_embeddings_from_db(dataset_name, batch_size=3000):
    """Encode a dataset's processed queries and save them as a joblib matrix.

    The texts come from ``load_documents_emb(dataset_name)`` and are encoded
    with the ``all-mpnet-base-v2`` SentenceTransformer on the module-level
    ``device``. The stacked float32 matrix is written to
    ``data/<dataset_name>/query_embeddings_matrix_<dataset_name>.joblib``.

    Args:
        dataset_name: Name of the dataset sub-directory under ``data``.
        batch_size: Number of texts handed to each ``model.encode`` call.
            ``encode``'s own ``batch_size`` argument is not set here.

    Returns:
        None. Nothing is computed when the output file already exists or when
        no texts are found. Exceptions are printed rather than raised.
    """
    try:
        output_dir = os.path.join("data", dataset_name)
        os.makedirs(output_dir, exist_ok=True)

        embeddings_file = os.path.join(output_dir, f"query_embeddings_matrix_{dataset_name}.joblib")

        # The existing file acts as a cache marker: skip all work if present.
        if os.path.exists(embeddings_file):
            print(f"✔️ Embeddings already exist for {dataset_name}. Skipping.")
            return

        queries = load_documents_emb(dataset_name)
        if not queries:
            print(f"⚠️ No processed texts found in database for {dataset_name}.")
            return

        print("🚀 Loading sentence-transformers model 'all-mpnet-base-v2' ...")
        model = SentenceTransformer('all-mpnet-base-v2', device=device)

        chunks = []
        total_batches = (len(queries) + batch_size - 1) // batch_size
        with tqdm(total=total_batches, desc=f"Encoding queries for {dataset_name}") as pbar:
            for start in range(0, len(queries), batch_size):
                chunk = queries[start:start + batch_size]
                chunks.append(model.encode(chunk, convert_to_numpy=True, show_progress_bar=False))
                pbar.update(1)

        embeddings = np.vstack(chunks).astype(np.float32)

        # Write to a sibling temp file and rename it into place. Because the
        # final path doubles as the "already done" marker above, an interrupted
        # dump must never leave a truncated file under that name.
        tmp_file = embeddings_file + ".tmp"
        try:
            joblib.dump(embeddings, tmp_file)
            os.replace(tmp_file, embeddings_file)
        finally:
            # Only still present if the dump/rename did not complete.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

        print(f"✅ Embeddings saved to {embeddings_file}, shape = {embeddings.shape}")

    except Exception as e:
        print(f"❌ Error: {e}")

# ✅ Run
if __name__ == "__main__":
    # Only "antique" is processed; add "beir" to the tuple to embed it as well.
    for dataset in ("antique",):
        create_query_embeddings_from_db(dataset)


KeyboardInterrupt: 

In [3]:
import os
import sqlite3
import numpy as np
import joblib
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch



def load_documents_emb(dataset_name):
    """Load the non-empty processed query texts stored for a dataset.

    Selects the ``processed_text`` column from the ``queries`` table of
    ``data/<dataset_name>/index.db``.

    Args:
        dataset_name: Name of the dataset sub-directory under ``data``.

    Returns:
        list: The selected values, skipping falsy ones (NULL / empty string).
        An empty list when the database file is missing or on any error;
        the error is printed, not raised.
    """
    connection = None
    try:
        db_path = os.path.join("data", dataset_name, "index.db")
        if not os.path.exists(db_path):
            print(f"❌ قاعدة البيانات غير موجودة في المسار: {db_path}")
            return []

        connection = sqlite3.connect(db_path)
        cursor = connection.cursor()
        cursor.execute("SELECT processed_text FROM queries")

        # NOTE(review): falsy rows are filtered out, so indices in the result
        # may not correspond to row positions in the table — verify callers.
        return [row[0] for row in cursor.fetchall() if row[0]]
    except Exception as e:
        print(f"❌ خطأ في تحميل الاستعلامات: {e}")
        return []
    finally:
        # Always release the connection, including when the query fails
        # (the original only closed it on the success path).
        if connection is not None:
            connection.close()

def create_query_embeddings_from_db(dataset_name, batch_size=3000):
    """Encode a dataset's processed queries and store them as a joblib matrix.

    Texts are obtained from ``load_documents_emb(dataset_name)``, encoded with
    the ``all-mpnet-base-v2`` SentenceTransformer, stacked into one float32
    array and saved to
    ``data/<dataset_name>/query_embeddings_matrix_<dataset_name>.joblib``.

    Args:
        dataset_name: Name of the dataset sub-directory under ``data``.
        batch_size: Number of texts passed to each ``model.encode`` call.
            ``encode``'s own ``batch_size`` argument is left unset.

    Returns:
        None. The function returns early when the output file already exists
        or when no texts are available. Exceptions are printed, not raised.
    """
    try:
        output_dir = os.path.join("data", dataset_name)
        os.makedirs(output_dir, exist_ok=True)

        embeddings_file = os.path.join(output_dir, f"query_embeddings_matrix_{dataset_name}.joblib")

        # Presence of the output file is used as the "already computed" flag.
        if os.path.exists(embeddings_file):
            print(f"✔️ ملف التضمينات موجود بالفعل لـ {dataset_name}. يتم التجاوز.")
            return

        queries = load_documents_emb(dataset_name)
        if not queries:
            print(f"⚠️ لا توجد استعلامات معالجة في قاعدة البيانات لـ {dataset_name}.")
            return

        print("🚀 تحميل نموذج 'all-mpnet-base-v2' من sentence-transformers ...")
        model = SentenceTransformer('all-mpnet-base-v2')

        encoded_chunks = []
        total_batches = (len(queries) + batch_size - 1) // batch_size
        with tqdm(total=total_batches, desc=f"ترميز الاستعلامات لـ {dataset_name}") as pbar:
            for offset in range(0, len(queries), batch_size):
                texts = queries[offset:offset + batch_size]
                encoded_chunks.append(
                    model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
                )
                pbar.update(1)

        embeddings = np.vstack(encoded_chunks).astype(np.float32)

        # Dump to a temp file in the same directory, then rename into place.
        # A run interrupted mid-write would otherwise leave a partial file at
        # the final path, and the exists-check above would skip it forever.
        tmp_file = embeddings_file + ".tmp"
        try:
            joblib.dump(embeddings, tmp_file)
            os.replace(tmp_file, embeddings_file)
        finally:
            # Left over only when the dump/rename did not finish.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

        print(f"✅ تم حفظ التضمينات: {embeddings.shape}")
        print(f"📁 المسار: {embeddings_file}")

    except Exception as e:
        print(f"❌ خطأ أثناء التضمين: {e}")

if __name__ == "__main__":
    # Datasets to embed; "beir" is intentionally left out of this run.
    datasets_to_embed = ["antique"]
    for name in datasets_to_embed:
        create_query_embeddings_from_db(name)


🚀 تحميل نموذج 'all-mpnet-base-v2' من sentence-transformers ...


ترميز الاستعلامات لـ antique: 100%|██████████| 1/1 [00:06<00:00,  6.32s/it]


✅ تم حفظ التضمينات: (176, 768)
📁 المسار: data\antique\query_embeddings_matrix_antique.joblib


In [4]:
import os
import numpy as np
import joblib
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

def create_embeddings_from_csv(csv_path, output_file, batch_size=3000):
    """Encode the ``processed_query`` column of a CSV and save it with joblib.

    Non-null values of the ``processed_query`` column are encoded with the
    ``all-mpnet-base-v2`` SentenceTransformer, stacked into one float32 array
    and written to ``output_file``.

    Args:
        csv_path: Path of the CSV file to read.
        output_file: Destination path of the joblib dump. Its parent
            directory is created if it does not exist.
        batch_size: Number of texts passed to each ``model.encode`` call.
            ``encode``'s own ``batch_size`` argument is left unset.

    Returns:
        None. Returns early when the CSV is missing or has no non-null
        queries. Exceptions are printed, not raised.
    """
    try:
        if not os.path.exists(csv_path):
            print(f"❌ ملف CSV غير موجود: {csv_path}")
            return

        # Load the file and read the required column.
        df = pd.read_csv(csv_path)
        queries = df['processed_query'].dropna().tolist()

        if not queries:
            print("⚠️ لا توجد استعلامات معالجة في الملف.")
            return

        print("🚀 تحميل نموذج 'all-mpnet-base-v2' من sentence-transformers ...")
        model = SentenceTransformer('all-mpnet-base-v2')

        encoded_chunks = []
        total_batches = (len(queries) + batch_size - 1) // batch_size

        with tqdm(total=total_batches, desc="ترميز الاستعلامات المحسنة") as pbar:
            for offset in range(0, len(queries), batch_size):
                texts = queries[offset:offset + batch_size]
                encoded_chunks.append(
                    model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
                )
                pbar.update(1)

        embeddings = np.vstack(encoded_chunks).astype(np.float32)

        # Make sure the destination directory exists before writing.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Write to a temp file next to the target and rename it into place so
        # an interrupted dump never leaves a truncated file at output_file.
        tmp_file = output_file + ".tmp"
        try:
            joblib.dump(embeddings, tmp_file)
            os.replace(tmp_file, output_file)
        finally:
            # Left over only when the dump/rename did not finish.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

        print(f"✅ تم حفظ التضمينات: {embeddings.shape}")
        print(f"📁 المسار: {output_file}")

    except Exception as e:
        print(f"❌ خطأ أثناء التضمين: {e}")

if __name__ == "__main__":
    # Input CSV and output matrix both live in the BEIR dataset directory.
    beir_dir = os.path.join("data", "beir")
    create_embeddings_from_csv(
        csv_path=os.path.join(beir_dir, "enhanced_queries_beir.csv"),
        output_file=os.path.join(beir_dir, "embedding_enhanced_queries_beir_matrix.joblib"),
    )


🚀 تحميل نموذج 'all-mpnet-base-v2' من sentence-transformers ...


ترميز الاستعلامات المحسنة: 100%|██████████| 4/4 [00:18<00:00,  4.51s/it]


✅ تم حفظ التضمينات: (9997, 768)
📁 المسار: data\beir\embedding_enhanced_queries_beir_matrix.joblib
