## Primera Etapa
Instalación de las librerías necesarias

In [None]:
#pip install pymongo
#pip install dnspython

## Se crea la base de datos y los artículos

In [1]:
from pymongo import MongoClient

# Smoke test: confirm that MongoDB answers before doing any real work.
try:
    client = MongoClient(
        'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
    )
    # The `ping` command is sent to the admin database; its reply is shown on success.
    result = client.admin.command('ping')
except Exception as e:
    # Report the problem instead of raising, so the cell itself still finishes.
    print("Error:", e)
else:
    print("Ping result:", result)
    print("Conexión exitosa a MongoDB")

Ping result: {'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1749487575, 1), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1749487575, 1)}
Conexión exitosa a MongoDB


## Insertar datos

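Como la inserción de datos está saturando la RAM del servidor, los datos se insertan por lotes y se utiliza el módulo `gc` para forzar la liberación de memoria entre lotes (`gc` solo libera memoria del proceso de Python, no la de los nodos de MongoDB).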

In [None]:
import json
from pymongo import MongoClient
import gc


# Connect through the replica-set URI and select the target database/collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.arxiv_db
collection = db.articles

# Author's note: every 500 documents use roughly 8~9 GB of RAM.
# Tune this value to the amount of RAM you have available.
batch_size = 500

def batch_insert(file_path, batch_size=500, target=None):
    """Load a JSON Lines file into MongoDB in fixed-size batches.

    Every non-blank line of ``file_path`` is parsed with ``json.loads`` and
    buffered. Each time the buffer reaches ``batch_size`` documents it is sent
    with ``insert_many`` and a fresh buffer is started; whatever remains at
    end of file is sent as one final, smaller batch. ``gc.collect()`` is
    called after every insert.

    Args:
        file_path: Path to a text file holding one JSON document per line.
        batch_size: Number of documents per ``insert_many`` call; must be >= 1.
        target: Any object exposing ``insert_many(list_of_dicts)``. When left
            as ``None`` the notebook-level ``collection`` is used.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # With a non-positive size the buffer would never be flushed inside the
    # loop, so the whole file would pile up in memory; reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # Resolve the destination at call time so the default keeps following
    # whatever `collection` currently refers to in the notebook.
    sink = collection if target is None else target

    # JSON text is UTF-8 (RFC 8259); do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        batch = []
        for line in f:
            if not line.strip():
                continue  # skip blank / whitespace-only lines
            batch.append(json.loads(line))
            if len(batch) >= batch_size:
                sink.insert_many(batch)
                # Rebind instead of clearing so the list just handed over is untouched.
                batch = []
                gc.collect()
        if batch:
            # Trailing partial batch.
            sink.insert_many(batch)
            gc.collect()

# Bulk-load the arXiv metadata snapshot, then report completion.
# NOTE(review): re-running this cell loads the file again; nothing visible here
# prevents duplicate documents, so confirm the collection is empty before re-executing.
batch_insert(file_path="dataset/arxiv-metadata-oai-snapshot.json", batch_size=batch_size)
print("Inserción por lotes completada.")

Inserción por lotes completada.


## Consulta

Se harán las consultas pedidas en la tarea.

a) Primera consulta

In [None]:
from pymongo import MongoClient
from pandas import DataFrame

# Query (a): title and update date of articles whose update_date starts with "2025".
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.arxiv_db
collection = db.articles

# Filter on the "2025" prefix; project only title and update_date (no _id).
consulta = {"update_date": {"$regex": "^2025"}}
parametros = {"title": 1, "update_date": 1, "_id": 0}

# The (at most 20) matches are materialised into a list up front; per the
# original author's note, relying on head() swallowed some of the results.
docs = list(collection.find(filter=consulta, projection=parametros, limit=20))
df = DataFrame(docs)
print(df)

                                                title update_date
0   Hamiltonian Graphs and the Traveling Salesman ...  2025-02-26
1   Spin-dependent three-nucleon force effects on ...  2025-03-20
2   Adjointability of densely defined closed opera...  2025-04-29
3                         Proof of Riemann Hypothesis  2025-04-11
4   Critical fluctuations of time-dependent magnet...  2025-01-06
5   Gauss--Berezin integral operators and spinors ...  2025-02-11
6                The nature of electromagnetic energy  2025-05-16
7   Periodic relativity: the theory of gravity in ...  2025-01-16
8      Quantization of Atomic and Nuclear Rest Masses  2025-02-08
9   Solution of the equation d/dx(pdu/dx)+qu=cu by...  2025-05-06
10  The role of the quark and gluon GPDs in hard v...  2025-01-22
11  A new weak approximation scheme of stochastic ...  2025-04-28
12  Strict essential extensions of C*-algebras and...  2025-04-29
13     Rationalization of EPR Coincidence Experiments  2025-04-03
14        

Viendo la base de datos

In [21]:
from pymongo import MongoClient

# Connect through the replica-set URI and select the articles collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.arxiv_db
collection = db.articles

# Count every document in the collection (an empty filter matches all of them).
total = collection.count_documents(filter={})
print("Total de documentos en la colección:", total)

Total de documentos en la colección: 2735264


c)

In [24]:
from pymongo import MongoClient
import pprint

# Query (c): first 20 articles in category "hep-ph", printing every field.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client["arxiv_db"]
collection = db["articles"]

# An equality match on "hep-ph" only returns documents whose whole `categories`
# value is exactly that string. In the arXiv metadata snapshot the field is a
# single space-separated string (e.g. "hep-ph hep-ex"), so cross-listed
# articles would be missed. Match "hep-ph" as a whole token instead: preceded by
# start-of-string or whitespace and followed by whitespace or end-of-string.
# NOTE(review): field layout assumed from the dataset's published schema —
# confirm against a sample document.
query = {"categories": {"$regex": r"(?:^|\s)hep-ph(?:\s|$)"}}

for doc in collection.find(query).limit(20):
    pprint.pprint(doc)

{'_id': ObjectId('68463bd6d9a27e093ce27023'),
 'abstract': '  A fully differential calculation in perturbative quantum '
             'chromodynamics is\n'
             'presented for the production of massive photon pairs at hadron '
             'colliders. All\n'
             'next-to-leading order perturbative contributions from '
             'quark-antiquark,\n'
             'gluon-(anti)quark, and gluon-gluon subprocesses are included, as '
             'well as\n'
             'all-orders resummation of initial-state gluon radiation valid '
             'at\n'
             'next-to-next-to-leading logarithmic accuracy. The region of '
             'phase space is\n'
             'specified in which the calculation is most reliable. Good '
             'agreement is\n'
             'demonstrated with data from the Fermilab Tevatron, and '
             'predictions are made for\n'
             'more detailed tests with CDF and DO data. Predictions are shown '
             'for\n'

## Eliminar la colección

In [14]:
from pymongo import MongoClient

# Connect to MongoDB and select the articles collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.arxiv_db
collection = db.articles

# WARNING: destructive — removes the whole `articles` collection.
# NOTE(review): this cell sits before the "Tercera Etapa" cells, so running the
# notebook top to bottom drops the data they read; presumably that is why the
# find_one() further down printed None — confirm and consider moving this cell.
collection.drop()


## Tercera Etapa
Consultar los datos

In [4]:
import pprint

# Fetch one document and pretty-print it.
# `collection` is not created in this cell; it has to exist already in the
# kernel from an earlier cell.
# find_one() yields None when there is nothing to return, in which case
# `None` is what gets printed.
doc=collection.find_one()
pprint.pprint(doc)

None


In [6]:
# Imported here so the cell does not depend on an earlier cell having done it.
from pandas import DataFrame

# `doc` is the single document (or None) fetched with find_one() in an earlier cell.
# list(None) raises TypeError, so fall back to an empty list when nothing was found;
# for a real document this is still the list of its field names.
list_cur = list(doc) if doc is not None else []

# Wrap the document in a list so it becomes ONE ROW with one column per field.
# Passing the dict itself makes pandas treat every value as a whole column:
# list-valued fields expand into several rows and must all share one length.
df = DataFrame([doc] if doc is not None else [])
df.head()