In [1]:
import json
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [None]:
# === Configuration ===
# Path to the input file; the loader reads it line by line and parses each line as JSON.
json_file_path = "dataset/arxiv-metadata-oai-snapshot.json"
# Number of parsed records accumulated before a batch is handed to insert_batch.
batch_size = 100
# Size of the ThreadPoolExecutor used for the inserts.
max_workers = 4  # Tune to your CPU and MongoDB

In [3]:
# === Connect to MongoDB ===
# The URI lists three hosts (localhost ports 30001-30003) and names the
# replica set "my-replica-set".
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
# Database and collection handles used by the cells that follow.
db = client["arxiv_db"]
collection = db["articles"]

In [4]:
# === Batch insert helper ===
def insert_batch(batch):
    """Insert one batch of documents into the module-level ``collection``.

    The batch is passed to ``insert_many`` with ``ordered=False``. Any
    exception raised by the call is caught and printed rather than
    propagated, so a failing batch never raises into the caller.

    Args:
        batch: list of documents to insert.

    Returns:
        None, whether or not the insert succeeded.
    """
    try:
        collection.insert_many(batch, ordered=False)
    except Exception as exc:
        # Best effort: report the failure and let the remaining batches proceed.
        print("Error al insertar batch:", exc)

In [5]:
# === Count lines for tqdm ===
def count_lines(filepath):
    """Return the number of lines in the UTF-8 text file at ``filepath``.

    The file is iterated line by line, so it is never loaded into memory
    as a whole. An empty file yields 0.
    """
    line_total = 0
    with open(filepath, 'r', encoding='utf-8') as handle:
        # enumerate leaves the index of the last line in line_total;
        # the loop body is intentionally empty.
        for line_total, _ in enumerate(handle, start=1):
            pass
    return line_total

# One full pass over the input file; the result is used as the tqdm total
# by the loading cell.
total_lines = count_lines(json_file_path)

In [6]:
# === Read + parallel load ===
# Upper bound on batches that have been submitted but not yet waited on.
# ThreadPoolExecutor queues submitted work without limit, so if the file is
# read faster than MongoDB can insert, every parsed batch would otherwise
# accumulate in memory. Capping in-flight batches makes the reader wait.
max_pending_batches = max_workers * 2
skipped_lines = 0  # lines that were not valid JSON

with open(json_file_path, 'r', encoding='utf-8') as f, tqdm(total=total_lines, desc="Cargando") as pbar:
    batch = []
    futures = []  # batches submitted and not yet waited on, oldest first
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for line in f:
            # Count every line exactly once, whether or not it parses, so the
            # bar ends at total_lines.
            pbar.update(1)
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped_lines += 1
                continue
            # Add the pdf_source field derived from the record id.
            # A record without an "id" key raises KeyError and aborts the load.
            record["pdf_source"] = f"https://arxiv.org/pdf/{record['id']}"
            batch.append(record)
            if len(batch) >= batch_size:
                # Backpressure: block on the oldest batch before queuing more.
                if len(futures) >= max_pending_batches:
                    futures.pop(0).result()
                futures.append(executor.submit(insert_batch, batch))
                batch = []

        # Last (partial) batch. Its lines were already counted in the loop,
        # so the progress bar is not updated again here.
        if batch:
            futures.append(executor.submit(insert_batch, batch))

# === Wait for the remaining inserts to finish ===
for future in futures:
    future.result()
if skipped_lines:
    print("Líneas omitidas por JSON inválido:", skipped_lines)
print("✅ Carga paralela completa.")

Cargando:  74%|███████▍  | 2033469/2735264 [06:36<00:48, 14544.05it/s]IOStream.flush timed out
Cargando:  74%|███████▍  | 2034921/2735264 [10:17<7:24:39, 26.25it/s] 

: 

In [13]:
# Disabled cleanup snippet: the code below is wrapped in a string literal, so
# running this cell does not delete anything.
# NOTE(review): it connects to mongodb://localhost:27018/, not the replica-set
# URI used by the loader cells — confirm the target before re-enabling.
# NOTE(review): the recorded output of this cell presumably comes from a run
# made before the code was disabled — confirm.
'''
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018/")
db = client["arxiv_db"]
collection = db["articles"]

# ⚠️ Eliminar todos los documentos
collection.delete_many({})
print("🗑️ Todos los documentos eliminados.")
'''

🗑️ Todos los documentos eliminados.
