In [1]:
# Uninstall conflicting packages and clear cache
!pip uninstall -y torch torchvision torchaudio sentence-transformers -q
!pip cache purge -q

# Install compatible versions quietly
!pip install torch==2.3.0 torchvision==0.18.0 sentence-transformers==2.7.0 pandas numpy joblib nltk tqdm -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.2/779.2 MB[0m [31m806.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import os
import sqlite3
import numpy as np
import joblib
import ast
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
from google.colab import drive, files

In [3]:
# Fetch the NLTK resources quietly (no progress output).
import nltk
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Mount Google Drive at /content/drive, forcing a remount if it is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
# ✅ تثبيت gdown لتحميل الملفات من Google Drive
!pip install -U gdown

# ✅ تحميل المجلد من Google Drive (الرابط المشترك)
import gdown
import os

folder_id = "1Y19Gai5PCohj8Y4ny-gGHoXGIm0d7WJm"
drive_path = "/content/data"

try:
    gdown.download_folder(f"https://drive.google.com/drive/folders/{folder_id}", output=drive_path, quiet=False, use_cookies=False)
    print(f"✅ تم تحميل المجلد إلى: {drive_path}")
except Exception as e:
    print(f"❌ فشل تحميل المجلد: {str(e)}")
    print("⚠️ يرجى تحميل المجلد يدويًا إلى '/content/data' من الرابط:")
    print(f"https://drive.google.com/drive/folders/{folder_id}")
    print("تأكد من أن إعدادات المشاركة مضبوطة على 'أي شخص لديه الرابط'.")

# ✅ التحقق من وجود المجلد
if not os.path.exists(drive_path):
    raise FileNotFoundError(f"❌ المجلد {drive_path} غير موجود! يرجى تحميل المجلد يدويًا.")

# ✅ باقي المكتبات
import pandas as pd
import sqlite3
import numpy as np
import joblib
import ast
import re
import gc
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
import nltk

# Fetch the NLTK resources quietly. 'punkt_tab' is included because
# word_tokenize needs it on newer NLTK releases (NLTK is not version-pinned).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

# Require a CUDA GPU: report which one is available, or abort with instructions.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU متوفر: {gpu_name}")
    if "T4" not in gpu_name:
        # Only informational: any CUDA GPU is accepted.
        print("⚠️ هذا ليس T4 GPU. قد تختلف السرعة حسب نوع البطاقة.")
else:
    raise RuntimeError("❌ لا يوجد GPU! فعّل GPU من Runtime > Change runtime type.")

# Model and text-processing helpers
def custom_tokenizer(text):
    """Turn raw text into a list of lemmatized, filtered tokens.

    Non-string or blank input yields an empty list. Otherwise the text is
    lower-cased, stripped of every character that is neither a word character
    nor whitespace, and tokenized with NLTK's ``word_tokenize``. Tokens are
    dropped when they are English stop words (except 'not', 'no', 'never',
    which are kept), are two characters or shorter, or are 'example'/'test'.
    The survivors are lemmatized as verbs with ``WordNetLemmatizer``.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return []

    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    raw_tokens = word_tokenize(cleaned)

    # Negations are removed from the stop-word set so they are NOT filtered out.
    excluded = set(stopwords.words('english')) - {'not', 'no', 'never'}
    banned = ['example', 'test']
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in raw_tokens:
        if token in excluded or len(token) <= 2 or token in banned:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

def preprocess_text(text, cache=None):
    """Tokenize ``text`` with ``custom_tokenizer``, optionally memoizing.

    Args:
        text: The raw text to tokenize; it is also used as the cache key.
        cache: Optional dict mapping text to its token list. When given, a
            stored entry is returned as-is and a freshly computed token list
            is stored before being returned.

    Returns:
        The token list for ``text`` (the cached list object on a cache hit).
    """
    if cache is None:
        return custom_tokenizer(text)
    if text not in cache:
        cache[text] = custom_tokenizer(text)
    return cache[text]

def load_embeddings(dataset_name, drive_path):
    """Load the embeddings matrix saved for a dataset.

    Reads ``{drive_path}/{dataset_name}/embeddings_matrix.joblib`` with
    ``joblib.load``.

    Returns:
        The object stored in the file, or an empty NumPy array when the file
        does not exist.
    """
    matrix_file = f"{drive_path}/{dataset_name}/embeddings_matrix.joblib"
    if not os.path.exists(matrix_file):
        return np.array([])
    # NOTE(review): joblib.load unpickles the file, so only load trusted data.
    return joblib.load(matrix_file)

def load_doc_data(dataset_name, drive_path):
    """Read every document's processed text from a dataset's SQLite index.

    Opens ``{drive_path}/{dataset_name}/index.db`` and selects ``doc_id`` and
    ``processed_text`` from its ``documents`` table.

    Args:
        dataset_name: Name of the dataset sub-directory under ``drive_path``.
        drive_path: Root directory that holds the dataset directories.

    Returns:
        A dict mapping ``doc_id`` to ``processed_text``. An empty dict is
        returned when the database file is missing, or when SQLite cannot
        read it (in which case a warning describing the error is issued
        instead of failing silently).
    """
    import warnings

    path = f"{drive_path}/{dataset_name}/index.db"
    if not os.path.exists(path):
        return {}

    conn = None
    try:
        conn = sqlite3.connect(path)
        rows = conn.execute('SELECT doc_id, processed_text FROM documents')
        return {doc_id: processed_text for doc_id, processed_text in rows}
    except sqlite3.Error as exc:
        # Best effort: callers treat an empty mapping as "no document data",
        # but surface the reason so the failure is not invisible.
        warnings.warn(f"Could not read documents from {path}: {exc}")
        return {}
    finally:
        # Always release the connection, whether or not the query succeeded.
        if conn is not None:
            conn.close()

def enhance_queries(queries, dataset_name, model, drive_path, doc_data,
                    enhance=True, batch_size=16, chunk_size=1000):
    """Expand each query with words taken from its most similar stored entries.

    Every query is tokenized and encoded with ``model``; the resulting vectors
    are compared (cosine similarity) against the matrix stored in the
    dataset's ``embeddings_matrix.joblib``. For each query, the five most
    similar rows whose score lies strictly between 0.4 and 0.999 contribute
    their word lists (parsed from ``doc_data``), and up to four words that are
    not already among the query's tokens are appended to the query text.

    Args:
        queries: List of raw query texts.
        dataset_name: Dataset sub-directory under ``drive_path``.
        model: Encoder exposing a SentenceTransformer-style ``encode`` method.
        drive_path: Root directory that holds the dataset directories.
        doc_data: Mapping from doc id to a string containing a Python literal
            list of words (parsed with ``ast.literal_eval``).
        enhance: When False, queries are only tokenized, never expanded.
        batch_size: Batch size passed to ``model.encode``.
        chunk_size: Number of embedding rows scored per similarity step.

    Returns:
        ``(enhanced_queries, processed_queries)``: the (possibly expanded)
        query texts and the token list of each original query. When expansion
        is disabled, there are no queries, or the embeddings / id-mapping
        files are missing, ``queries`` is returned unchanged.

    Notes:
        Relies on the module-level ``device``. The embeddings matrix and the
        id mapping are re-read from disk on every call.
    """
    top_k = 5             # most similar rows considered per query
    min_score = 0.4       # below this the match is considered unrelated
    max_score = 0.999     # above this the match is (nearly) the query itself
    max_new_words = 4     # expansion words appended per query

    if not enhance or len(queries) == 0:
        return queries, [preprocess_text(q) for q in queries]

    # NOTE(review): presumably document embeddings whose row order matches the
    # indices in doc_id_mapping.joblib -- confirm against the indexing code.
    stored_embeddings = load_embeddings(dataset_name, drive_path)
    if len(stored_embeddings) == 0:
        return queries, [preprocess_text(q) for q in queries]

    mapping_file = f"{drive_path}/{dataset_name}/doc_id_mapping.joblib"
    if not os.path.exists(mapping_file):
        return queries, [preprocess_text(q) for q in queries]
    index_to_doc_id = joblib.load(mapping_file)

    token_cache = {}
    processed_queries = [preprocess_text(q, token_cache) for q in queries]
    query_texts = [' '.join(pq) for pq in processed_queries]

    with torch.no_grad():
        query_vectors = model.encode(query_texts, convert_to_numpy=True, batch_size=batch_size, device=device)

    normalize = torch.nn.functional.normalize
    query_unit = normalize(torch.tensor(query_vectors, dtype=torch.float32).to(device), dim=1, eps=1e-8)
    stored_matrix = torch.tensor(stored_embeddings, dtype=torch.float32).to(device)

    # Cosine similarity as a matrix product of unit vectors: same scores as a
    # broadcasted cosine_similarity, without the (queries x chunk x dim)
    # intermediate tensor.
    similarity_chunks = []
    for start in range(0, stored_matrix.shape[0], chunk_size):
        chunk_unit = normalize(stored_matrix[start:start + chunk_size], dim=1, eps=1e-8)
        similarity_chunks.append(query_unit @ chunk_unit.T)
    similarities = torch.cat(similarity_chunks, dim=1)

    # Clamp k so that fewer than `top_k` stored rows does not make topk fail.
    k = min(top_k, similarities.shape[1])
    top_scores, top_indices = torch.topk(similarities, k=k, dim=1)
    top_scores = top_scores.cpu().numpy()
    top_indices = top_indices.cpu().numpy()

    enhanced_queries = []
    for row, (query, processed_query) in enumerate(zip(queries, processed_queries)):
        similar_words = []
        for idx, score in zip(top_indices[row], top_scores[row]):
            doc_id = index_to_doc_id.get(int(idx))
            # Compare with None explicitly so a falsy id such as 0 is kept.
            if doc_id is None or doc_id not in doc_data:
                continue
            if not min_score < float(score) < max_score:
                continue
            try:
                words = ast.literal_eval(doc_data[doc_id])
                if words:
                    similar_words.extend(words)
            except (ValueError, SyntaxError, TypeError):
                # Malformed stored text: skip this entry, keep the others.
                continue

        seen = set(processed_query)
        new_words = [w for w in similar_words if w not in seen][:max_new_words]
        if new_words and isinstance(query, str):
            # Separate the original query from the appended words with a space.
            final_query = f"{query} {' '.join(new_words)}"
        else:
            # Nothing to add, or a non-text query (e.g. NaN) that cannot be extended.
            final_query = query
        enhanced_queries.append(final_query)

    return enhanced_queries, processed_queries

def process_csv(input_csv, output_csv, dataset_name, model, drive_path, batch_size=500):
    """Enhance every query of a CSV file and write the result to a new CSV.

    The input must contain a ``text`` column. Its values are sent to
    ``enhance_queries`` in slices of ``batch_size``; the enhanced texts and
    the space-joined tokens are stored in the new ``enhanced_query`` and
    ``processed_tokens`` columns, and the frame is saved without its index.

    Args:
        input_csv: Path of the CSV file holding the queries.
        output_csv: Path the augmented CSV is written to.
        dataset_name: Dataset whose index and embeddings are used.
        model: Encoder forwarded to ``enhance_queries``.
        drive_path: Root directory that holds the dataset directories.
        batch_size: Number of queries handed to ``enhance_queries`` at a time.

    Raises:
        ValueError: If the CSV has no ``text`` column.
    """
    frame = pd.read_csv(input_csv)
    if 'text' not in frame.columns:
        raise ValueError("CSV must have a 'text' column")

    queries = frame['text'].tolist()
    doc_data = load_doc_data(dataset_name, drive_path)

    enhanced_column = []
    token_column = []
    batch_starts = range(0, len(queries), batch_size)
    for start in tqdm(batch_starts, desc="🔄 معالجة الاستعلامات"):
        batch = queries[start:start + batch_size]
        enhanced, tokens = enhance_queries(batch, dataset_name, model, drive_path, doc_data)
        enhanced_column.extend(enhanced)
        token_column.extend([' '.join(token_list) for token_list in tokens])
        # Release Python and CUDA memory between batches.
        gc.collect()
        torch.cuda.empty_cache()

    frame['enhanced_query'] = enhanced_column
    frame['processed_tokens'] = token_column
    frame.to_csv(output_csv, index=False)
    print(f"\n✅ تم حفظ {len(frame)} استعلام إلى: {output_csv}")

# Main execution: load the encoder and enhance the queries of one dataset.
model = SentenceTransformer('all-mpnet-base-v2', device=device)
dataset_name = "beir"
drive_path = "/content/data"
# File names are derived from dataset_name so that switching datasets only
# requires changing the single assignment above.
input_csv = f"{drive_path}/{dataset_name}/queries_{dataset_name}.csv"
output_csv = f"/content/enhanced_queries_{dataset_name}.csv"

process_csv(input_csv, output_csv, dataset_name, model, drive_path, batch_size=500)

# Download the result file through the browser.
from google.colab import files
if os.path.exists(output_csv):
    files.download(output_csv)
    print(f"📥 تم تنزيل الملف: {output_csv}")
else:
    print(f"❌ الملف {output_csv} غير موجود!")



Retrieving folder contents


Retrieving folder 1wYN3g7t8PSWm4LcZeq5W6J3zesI2WSZX antique
Retrieving folder 12vUDb_w60AQTRSi-EklbQzasw_W0mzu- logs
Retrieving folder 1Sw3oWsItL5S9sx4Fk7Z2kflYnHVv6p_7 tfidf
Processing file 1K7q-a957YqNpiuzaikCkgL21hsnVZ9DN doc_id_mapping.joblib
Processing file 1Txlz1nxyR8AwkDwMgBtBymMTvFgyWyOV docs_antique.csv
Processing file 1KppNyTl-06ZVF9BYGrfHOfphH8dHd_kw embedding_index.faiss
Processing file 1cdpLRWB_mP8AkdhnajUhA2_IlVVjwl2W embeddings_matrix.joblib
Processing file 1WvjaJEGgAz4bB1bqkUMenQH-ISZZ_YTv embeddings_vectorizer.joblib
Processing file 14I2bjpIVGqTUGo4H0-HdTiiRc6Zh0cu0 enhanced_queries_antique.json
Processing file 1m168ne8p049qOPrurWIntNu4nyocID2I index.db
Processing file 19wRo_0iHxPMtXtlJ2yHM8hhdd89uJiq4 qrels_antique.csv
Processing file 1U_OPYmDq0w1GLCnOSxYuPnU7nlChVwH6 queries_antique.csv
Processing file 1p57TfQy_aWkcSLSlL7FsKPTG8gKslxeT query_embeddings.joblib
Processing file 1z_lNnXe6l_Jek2mHjVo3CTSpF26ZVDkU query_enhancement_log.txt
Processing file 1_baTh3Yko_YUfNx4

Retrieving folder contents completed
Building directory structure
Building directory structure completed


❌ فشل تحميل المجلد: Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1K7q-a957YqNpiuzaikCkgL21hsnVZ9DN

but Gdown can't. Please check connections and permissions.
⚠️ يرجى تحميل المجلد يدويًا إلى '/content/data' من الرابط:
https://drive.google.com/drive/folders/1Y19Gai5PCohj8Y4ny-gGHoXGIm0d7WJm
تأكد من أن إعدادات المشاركة مضبوطة على 'أي شخص لديه الرابط'.
✅ GPU متوفر: Tesla T4


🔄 معالجة الاستعلامات: 100%|██████████| 20/20 [1:34:46<00:00, 284.35s/it]


✅ تم حفظ 10000 استعلام إلى: /content/enhanced_queries_beir.csv





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 تم تنزيل الملف: /content/enhanced_queries_beir.csv
