In [1]:
pip install pymongo


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
import pymongo
from pymongo import MongoClient
from pymongo.errors import DocumentTooLarge
from gridfs import GridFS
import pandas as pd
import logging
from datetime import datetime

# Logging setup: each execution of this cell logs to its own timestamped file
# under ./logs.
# exist_ok=True replaces the separate exists() check, so directory creation
# cannot fail if "logs" appears between the check and the call.
os.makedirs("logs", exist_ok=True)
log_file = f"logs/mongo_upload_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    encoding="utf-8",
    # basicConfig() does nothing once the root logger already has handlers
    # (e.g. when this cell is re-run in the same kernel). force=True replaces
    # them so messages really end up in the freshly named log_file.
    force=True,
)

def connect_to_mongo(connection_string="mongodb://localhost:27017/"):
    """Open a MongoDB connection and verify it with a ping.

    Args:
        connection_string: MongoDB URI to connect to. Defaults to the local
            server on port 27017.

    Returns:
        The client if the ping succeeded, otherwise None. Failures are logged
        and printed rather than raised.
    """
    client = None
    try:
        client = MongoClient(connection_string)
        # Issue a ping so an unreachable server is reported here instead of on
        # the first real operation.
        client.admin.command('ping')
        logging.info("Connected to MongoDB successfully")
        print("Connected to MongoDB successfully")
        return client
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {str(e)}")
        print(f"Failed to connect to MongoDB: {str(e)}")
        # The client object exists even when the ping fails; close it so the
        # failed attempt does not leave an open client behind.
        if client is not None:
            client.close()
        return None

def read_file_content(file_path):
    """Load a file's content in the form used for upload.

    ``.csv`` files are parsed with pandas into a list of row dicts,
    ``.joblib``/``.faiss`` files are returned as raw bytes, and anything else
    is read as UTF-8 text.

    Returns:
        ``{"file_name": <base name>, "content": <loaded content>}``, or None
        if reading failed (the error is logged and printed).
    """
    try:
        base_name = os.path.basename(file_path)
        if base_name.endswith('.csv'):
            # Stored as records so each row maps naturally onto a document.
            payload = pd.read_csv(file_path).to_dict('records')
        else:
            if base_name.endswith(('.joblib', '.faiss')):
                open_kwargs = {'mode': 'rb'}  # binary data
            else:
                open_kwargs = {'mode': 'r', 'encoding': 'utf-8'}
            with open(file_path, **open_kwargs) as handle:
                payload = handle.read()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {str(e)}")
        print(f"Error reading file {file_path}: {str(e)}")
        return None
    return {"file_name": base_name, "content": payload}

def upload_file_to_mongo(file_path, db_name, collection_name, client):
    """Store one file in MongoDB, replacing an earlier upload with the same name.

    ``.joblib``/``.faiss`` files, and any file larger than 16 MiB on disk, are
    streamed into GridFS (bucket ``collection_name``). Everything else is
    loaded with ``read_file_content`` and inserted as one document into
    ``db_name.collection_name``.

    Args:
        file_path: Path of the file to upload.
        db_name: Target database name; also stored as ``dataset_name``.
        collection_name: Target collection / GridFS bucket name; also stored
            as ``file_label``.
        client: Connected client, indexed by database name.

    Returns:
        None. Failures are logged and printed rather than raised.
    """
    try:
        file_name = os.path.basename(file_path)
        db = client[db_name]

        # Pick the storage route from the extension and the on-disk size
        # before reading anything. GridFS streams straight from the file, so
        # loading a large binary into memory first would be wasted work, and
        # the byte size on disk (unlike len() of a str or of CSV records)
        # reflects how much data actually has to be stored.
        is_binary = file_name.endswith(('.joblib', '.faiss'))
        if is_binary or os.path.getsize(file_path) > 16 * 1024 * 1024:
            # Upload through GridFS.
            fs = GridFS(db, collection=collection_name)
            # Open the source first so an unreadable file aborts the upload
            # before the previously stored copy is deleted.
            with open(file_path, 'rb') as file:
                # Remove the stored file with the same name, if there is one.
                existing = fs.find_one({"filename": file_name})
                if existing:
                    fs.delete(existing._id)
                    logging.info(f"Deleted existing GridFS file {file_name} before upload")

                fs.put(file, filename=file_name, metadata={"dataset_name": db_name, "file_label": collection_name})

            logging.info(f"Uploaded file {file_name} to {db_name}.{collection_name} using GridFS")
            print(f"Uploaded file {file_name} to {db_name}.{collection_name} using GridFS")
        else:
            document = read_file_content(file_path)
            if document is None:
                # read_file_content has already logged and printed the reason.
                return

            # Insert a regular document after removing the one with the same
            # file name, if any.
            collection = db[collection_name]
            existing = collection.find_one({"file_name": file_name})
            if existing:
                collection.delete_one({"_id": existing["_id"]})
                logging.info(f"Deleted existing document {file_name} before upload")

            # Record which dataset / label the content belongs to.
            doc_to_insert = {
                "file_name": file_name,
                "content": document["content"],
                "dataset_name": db_name,
                "file_label": collection_name
            }
            collection.insert_one(doc_to_insert)

            logging.info(f"Uploaded file {file_name} to {db_name}.{collection_name}")
            print(f"Uploaded file {file_name} to {db_name}.{collection_name}")

    except DocumentTooLarge as e:
        logging.error(f"Error uploading {file_path} to MongoDB: Document too large, consider using GridFS ({str(e)})")
        print(f"Error uploading {file_path} to MongoDB: Document too large, consider using GridFS ({str(e)})")
    except Exception as e:
        logging.error(f"Error uploading {file_path} to MongoDB: {str(e)}")
        print(f"Error uploading {file_path} to MongoDB: {str(e)}")

def upload_files(files_to_upload=None):
    """Upload a batch of files to MongoDB over a single connection.

    Args:
        files_to_upload: Optional iterable of
            ``(file_path, db_name, collection_name)`` tuples. When omitted,
            the five "antique" representation files listed below are used.

    Returns:
        None. Missing files are reported and skipped; the connection is
        closed when the batch finishes, whether or not an error occurred.
    """
    if files_to_upload is None:
        files_to_upload = [
            ("data/antique/tfidf_vectorizer.joblib", "antique", "representation"),
            ("data/antique/tfidf_matrix.joblib", "antique", "representation"),
            ("data/antique/embeddings_matrix.joblib", "antique", "representation"),
            ("data/antique/embeddings_vectorizer.joblib", "antique", "representation"),
            ("data/antique/embedding_index.faiss", "antique", "representation"),
        ]

    client = connect_to_mongo()
    # connect_to_mongo signals failure with None; test for exactly that rather
    # than relying on the client object's truthiness.
    if client is None:
        return

    try:
        for file_path, db_name, collection_name in files_to_upload:
            absolute_path = os.path.abspath(file_path)
            if not os.path.isfile(absolute_path):
                print(f"Error: File '{absolute_path}' does not exist.")
                logging.error(f"File not found: {absolute_path}")
                continue
            upload_file_to_mongo(absolute_path, db_name, collection_name, client)
    except Exception as e:
        logging.error(f"Error in upload process: {str(e)}")
        print(f"Error in upload process: {str(e)}")
    finally:
        client.close()
        logging.info("Closed MongoDB connection")
        print("Closed MongoDB connection")

# Entry point: start the upload when this code is executed directly (which
# includes running the notebook cell), but not when it is imported.
if __name__ == "__main__":
    upload_files()


Connected to MongoDB successfully
Uploaded file tfidf_vectorizer.joblib to antique.representation using GridFS
Uploaded file tfidf_matrix.joblib to antique.representation using GridFS
Uploaded file embeddings_matrix.joblib to antique.representation using GridFS
Uploaded file embeddings_vectorizer.joblib to antique.representation using GridFS
Uploaded file embedding_index.faiss to antique.representation using GridFS
Closed MongoDB connection


In [1]:
import os
import pymongo
from pymongo import MongoClient
from pymongo.errors import DocumentTooLarge
from gridfs import GridFS
import pandas as pd
import logging
from datetime import datetime

# Logging setup: each execution of this cell logs to its own timestamped file
# under ./logs.
# exist_ok=True replaces the separate exists() check, so directory creation
# cannot fail if "logs" appears between the check and the call.
os.makedirs("logs", exist_ok=True)
log_file = f"logs/mongo_upload_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    encoding="utf-8",
    # basicConfig() does nothing once the root logger already has handlers
    # (e.g. when this cell is re-run in the same kernel). force=True replaces
    # them so messages really end up in the freshly named log_file.
    force=True,
)

def connect_to_mongo(connection_string="mongodb://localhost:27017/"):
    """Open a MongoDB connection and verify it with a ping.

    Args:
        connection_string: MongoDB URI to connect to. Defaults to the local
            server on port 27017.

    Returns:
        The client if the ping succeeded, otherwise None. Failures are logged
        and printed rather than raised.
    """
    client = None
    try:
        client = MongoClient(connection_string)
        # Issue a ping so an unreachable server is reported here instead of on
        # the first real operation.
        client.admin.command('ping')
        logging.info("Connected to MongoDB successfully")
        print("Connected to MongoDB successfully")
        return client
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {str(e)}")
        print(f"Failed to connect to MongoDB: {str(e)}")
        # The client object exists even when the ping fails; close it so the
        # failed attempt does not leave an open client behind.
        if client is not None:
            client.close()
        return None

def read_file_content(file_path):
    """Load a file's content in the form used for upload.

    ``.csv`` files are parsed with pandas into a list of row dicts,
    ``.joblib``/``.faiss`` files are returned as raw bytes, and anything else
    is read as UTF-8 text.

    Returns:
        ``{"file_name": <base name>, "content": <loaded content>}``, or None
        if reading failed (the error is logged and printed).
    """
    try:
        base_name = os.path.basename(file_path)
        if base_name.endswith('.csv'):
            # Stored as records so each row maps naturally onto a document.
            payload = pd.read_csv(file_path).to_dict('records')
        else:
            if base_name.endswith(('.joblib', '.faiss')):
                open_kwargs = {'mode': 'rb'}  # binary data
            else:
                open_kwargs = {'mode': 'r', 'encoding': 'utf-8'}
            with open(file_path, **open_kwargs) as handle:
                payload = handle.read()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {str(e)}")
        print(f"Error reading file {file_path}: {str(e)}")
        return None
    return {"file_name": base_name, "content": payload}

def upload_file_to_mongo(file_path, db_name, collection_name, client):
    """Store one file in MongoDB, replacing an earlier upload with the same name.

    ``.joblib``/``.faiss`` files, and any file larger than 16 MiB on disk, are
    streamed into GridFS (bucket ``collection_name``). Everything else is
    loaded with ``read_file_content`` and inserted as one document into
    ``db_name.collection_name``.

    Args:
        file_path: Path of the file to upload.
        db_name: Target database name; also stored as ``dataset_name``.
        collection_name: Target collection / GridFS bucket name; also stored
            as ``file_label``.
        client: Connected client, indexed by database name.

    Returns:
        None. Failures are logged and printed rather than raised.
    """
    try:
        file_name = os.path.basename(file_path)
        db = client[db_name]

        # Pick the storage route from the extension and the on-disk size
        # before reading anything. GridFS streams straight from the file, so
        # loading a large binary into memory first would be wasted work, and
        # the byte size on disk (unlike len() of a str or of CSV records)
        # reflects how much data actually has to be stored.
        is_binary = file_name.endswith(('.joblib', '.faiss'))
        if is_binary or os.path.getsize(file_path) > 16 * 1024 * 1024:
            # Upload through GridFS.
            fs = GridFS(db, collection=collection_name)
            # Open the source first so an unreadable file aborts the upload
            # before the previously stored copy is deleted.
            with open(file_path, 'rb') as file:
                # Remove the stored file with the same name, if there is one.
                existing = fs.find_one({"filename": file_name})
                if existing:
                    fs.delete(existing._id)
                    logging.info(f"Deleted existing GridFS file {file_name} before upload")

                fs.put(file, filename=file_name, metadata={"dataset_name": db_name, "file_label": collection_name})

            logging.info(f"Uploaded file {file_name} to {db_name}.{collection_name} using GridFS")
            print(f"Uploaded file {file_name} to {db_name}.{collection_name} using GridFS")
        else:
            document = read_file_content(file_path)
            if document is None:
                # read_file_content has already logged and printed the reason.
                return

            # Insert a regular document after removing the one with the same
            # file name, if any.
            collection = db[collection_name]
            existing = collection.find_one({"file_name": file_name})
            if existing:
                collection.delete_one({"_id": existing["_id"]})
                logging.info(f"Deleted existing document {file_name} before upload")

            # Record which dataset / label the content belongs to.
            doc_to_insert = {
                "file_name": file_name,
                "content": document["content"],
                "dataset_name": db_name,
                "file_label": collection_name
            }
            collection.insert_one(doc_to_insert)

            logging.info(f"Uploaded file {file_name} to {db_name}.{collection_name}")
            print(f"Uploaded file {file_name} to {db_name}.{collection_name}")

    except DocumentTooLarge as e:
        logging.error(f"Error uploading {file_path} to MongoDB: Document too large, consider using GridFS ({str(e)})")
        print(f"Error uploading {file_path} to MongoDB: Document too large, consider using GridFS ({str(e)})")
    except Exception as e:
        logging.error(f"Error uploading {file_path} to MongoDB: {str(e)}")
        print(f"Error uploading {file_path} to MongoDB: {str(e)}")

def upload_files(files_to_upload=None):
    """Upload a batch of files to MongoDB over a single connection.

    Args:
        files_to_upload: Optional iterable of
            ``(file_path, db_name, collection_name)`` tuples. When omitted,
            the five "beir" representation files listed below are used.

    Returns:
        None. Missing files are reported and skipped; the connection is
        closed when the batch finishes, whether or not an error occurred.
    """
    if files_to_upload is None:
        files_to_upload = [
            ("data/beir/tfidf_vectorizer.joblib", "beir", "representation"),
            ("data/beir/tfidf_matrix.joblib", "beir", "representation"),
            ("data/beir/embeddings_matrix.joblib", "beir", "representation"),
            ("data/beir/embeddings_vectorizer.joblib", "beir", "representation"),
            ("data/beir/embedding_index.faiss", "beir", "representation"),
        ]

    client = connect_to_mongo()
    # connect_to_mongo signals failure with None; test for exactly that rather
    # than relying on the client object's truthiness.
    if client is None:
        return

    try:
        for file_path, db_name, collection_name in files_to_upload:
            absolute_path = os.path.abspath(file_path)
            if not os.path.isfile(absolute_path):
                print(f"Error: File '{absolute_path}' does not exist.")
                logging.error(f"File not found: {absolute_path}")
                continue
            upload_file_to_mongo(absolute_path, db_name, collection_name, client)
    except Exception as e:
        logging.error(f"Error in upload process: {str(e)}")
        print(f"Error in upload process: {str(e)}")
    finally:
        client.close()
        logging.info("Closed MongoDB connection")
        print("Closed MongoDB connection")

# Entry point: start the upload when this code is executed directly (which
# includes running the notebook cell), but not when it is imported.
if __name__ == "__main__":
    upload_files()


Connected to MongoDB successfully
Uploaded file tfidf_vectorizer.joblib to beir.representation using GridFS
Uploaded file tfidf_matrix.joblib to beir.representation using GridFS
Uploaded file embeddings_matrix.joblib to beir.representation using GridFS
Uploaded file embeddings_vectorizer.joblib to beir.representation using GridFS
Uploaded file embedding_index.faiss to beir.representation using GridFS
Closed MongoDB connection


In [3]:
import joblib

# Load the saved TF-IDF bundle and list its top-level keys.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# on the author's machine; consider a path relative to a configurable data dir.
# NOTE(review): joblib.load unpickles the file, so only load files from a
# trusted source.
file_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\TF-IDF\beir\quora\test\doc\tfidf_data.joblib"
data = joblib.load(file_path)
print("Keys in file:", list(data.keys()))


Keys in file: ['tfidf_matrix', 'vectorizer', 'doc_ids']


In [4]:
# Load the saved BERT embedding bundle and list its top-level keys.
# Relies on `joblib` having been imported by the previous cell, and rebinds
# the `file_path` / `data` names that cell defined.
# NOTE(review): absolute, machine-specific Windows path — consider a path
# relative to a configurable data dir.
# NOTE(review): joblib.load unpickles the file, so only load files from a
# trusted source.
file_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\beir\quora\test\doc\bert_embedding.joblib"
data = joblib.load(file_path)
print("Keys in file:", list(data.keys()))


Keys in file: ['doc_ids', 'embeddings_matrix', 'model_name']
