## Primera Etapa
Importación de las librerías necesarias y verificación de la conexión a MongoDB

In [13]:
from pymongo import MongoClient

# Connectivity check: ping the admin database through the replica-set URI and
# report the outcome.
try:
    client = MongoClient(
        'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
    )
    result = client.admin.command('ping')
except Exception as e:
    # Failures are reported, not re-raised, so the cell itself always completes.
    print("Error:", e)
else:
    print("Ping result:", result)
    print("Conexión exitosa a MongoDB")

Ping result: {'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1749767975, 1), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1749767975, 1)}
Conexión exitosa a MongoDB


## Consulta

Se harán las consultas pedidas en la tarea.

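a. Primera consulta: devolver los títulos y la fecha de actualización (update_date) de los
artículos actualizados en 2025. Mostrar solo esos campos y limitar a los primeros 20 resultados.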

In [3]:
from pymongo import MongoClient
from pandas import DataFrame

# Connect through the replica-set URI and select the articles collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.get_database("arxiv_db")
collection = db.get_collection("articles")

# Filter: update_date values that start with "2025".
consulta = {"update_date": {"$regex": "^2025"}}
# Projection: only title and update_date, without _id.
parametros = {"title": 1, "update_date": 1, "_id": 0}

# Author's note: the results are materialised with list() because head()
# was dropping some of them.
cursor = collection.find(consulta, parametros, limit=20)
docs = list(cursor)
df = DataFrame(docs)
print(df)

                                                title update_date
0   Hamiltonian Graphs and the Traveling Salesman ...  2025-02-26
1   Spin-dependent three-nucleon force effects on ...  2025-03-20
2   Adjointability of densely defined closed opera...  2025-04-29
3                         Proof of Riemann Hypothesis  2025-04-11
4   Critical fluctuations of time-dependent magnet...  2025-01-06
5   Gauss--Berezin integral operators and spinors ...  2025-02-11
6                The nature of electromagnetic energy  2025-05-16
7   Periodic relativity: the theory of gravity in ...  2025-01-16
8      Quantization of Atomic and Nuclear Rest Masses  2025-02-08
9   Solution of the equation d/dx(pdu/dx)+qu=cu by...  2025-05-06
10  The role of the quark and gluon GPDs in hard v...  2025-01-22
11  A new weak approximation scheme of stochastic ...  2025-04-28
12  Strict essential extensions of C*-algebras and...  2025-04-29
13     Rationalization of EPR Coincidence Experiments  2025-04-03
14        

b. Devolver los títulos y los autores de artículos que pertenezcan a las categorías "cs.AI"
o "stat.ML" y que tengan al menos tres autores. Mostrar solo esos campos y limitar a los
primeros 10 resultados.

In [4]:
from pymongo import MongoClient
import pandas as pd
pd.set_option('display.max_colwidth', 50)

client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client["arxiv_db"]
collection = db["articles"]

# Query parameters.
#
# categories: the field is a space-separated string (e.g. "math.CO cs.CG"), so
# the pattern matches cs.AI or stat.ML only as a whole token. The dots are
# escaped; an unescaped "." matches any character.
#
# authors: the field is free text, so "at least three authors" is approximated
# as "at least two separators", where a separator is a comma or the word
# "and". This also accepts lists written "A, B and C", which a two-comma
# pattern misses. The "s" option lets ".*" span line breaks in case the author
# list is stored on several lines.
# NOTE(review): this is still a heuristic. Commas inside affiliations such as
# "(INRIA Futurs, INRIA Rocquencourt)" and the rare "A, and B" form count as
# separators. If the collection has a structured author array, counting its
# elements would be more reliable - confirm against the schema.
consulta = {
    "categories": {"$regex": r"(^|\s)(cs\.AI|stat\.ML)(\s|$)"},
    "authors": {"$regex": r"(,|\sand\s).*(,|\sand\s)", "$options": "s"},
}
parametros = {"title": 1, "authors": 1, "_id": 0}

# Only the first 10 results are returned.
resultados = list(collection.find(consulta, parametros).limit(10))
df = pd.DataFrame(resultados)
print(df)

                                             authors  \
0  Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...   
1  Stefano Bistarelli, Ugo Montanari, Francesca R...   
2  Juliana S Bernardes, Alberto Davila, Vitor San...   
3  Giorgio Terracina, Nicola Leone, Vincenzino Li...   
4               S. Mohamed, D. Rubin, and T. Marwala   
5                   J. Uglov, V. Schetinin, C. Maple   
6  Christian Gagn\'e (INFORMATIQUE WGZ INC.), Mic...   
7  Edgar H. de Graaf, Joost N. Kok, Walter A. Kos...   
8  Edgar H. de Graaf, Joost N. Kok, Walter A. Kos...   
9  Nicolas Godzik (INRIA Futurs, INRIA Rocquencou...   

                                               title  
0  Calculating Valid Domains for BDD-Based Intera...  
1  Unicast and Multicast Qos Routing with Soft Co...  
2  A study of structural properties on profiles HMMs  
3  Experimenting with recursive queries in databa...  
4  An Adaptive Strategy for the Classification of...  
5  Comparing Robustness of Pairwise and Multiclas... 

c. Devolver los títulos, las categorías y los enlaces al PDF de artículos que pertenezcan a
la categoría "hep-ph" y tengan un DOI asignado. Mostrar solo esos campos y limitar a 15
resultados.

In [5]:
from pymongo import MongoClient
import pandas as pd

pd.set_option('display.max_colwidth', 50)

client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client["arxiv_db"]
collection = db["articles"]

# Articles in category hep-ph that have a non-null DOI.
consulta = {"categories":{"$regex":"hep-ph"}, "doi":{"$exists":True, "$ne":None}}
# The arXiv identifier ("id", e.g. "0704.0002") is projected because the PDF
# link is derived from it; a title cannot be turned into a valid arXiv URL.
parametros = {"title":1, "categories":1, "id":1, "_id":0}

resultados = list(collection.find(consulta, parametros).limit(15))

# Explicit columns keep the frame well-formed even when the query returns
# no documents (otherwise the column lookups below would raise KeyError).
df = pd.DataFrame(resultados, columns=["title", "categories", "id"])
# Build the PDF link from the arXiv identifier, then drop the helper column so
# only the three requested fields are shown.
df["pdf_source"] = "https://arxiv.org/pdf/" + df["id"].astype(str)
df = df.drop(columns="id")

print(df)

                                                title              categories  \
0   Calculation of prompt diphoton production cros...                  hep-ph   
1                  Lifetime of doubly charmed baryons                  hep-ph   
2   Understanding the Flavor Symmetry Breaking and...                  hep-ph   
3   Crystal channeling of LHC forward protons with...                  hep-ph   
4   Probing non-standard neutrino interactions wit...                  hep-ph   
5   Scalar radius of the pion and zeros in the for...  hep-ph hep-lat nucl-th   
6                    Strong decays of charmed baryons   hep-ph hep-ex nucl-ex   
7                       CP violation in beauty decays           hep-ph hep-ex   
8   Energy density for chiral lattice fermions wit...          hep-lat hep-ph   
9   Multiple Parton Scattering in Nuclei: Quark-qu...          hep-ph nucl-th   
10    Strong Phase and $D^0-D^0bar$ mixing at BES-III           hep-ex hep-ph   
11  Towards self-consistent 

d. Devolver los títulos, nombres de los autores y la referencia de publicación (journal-ref)
de los artículos que tengan un DOI asignado. Mostrar solo esos campos y ordenar los
resultados alfabéticamente por título. Limitar a los primeros 20 resultados.

In [6]:
from pymongo import MongoClient
import pandas as pd

pd.set_option('display.max_colwidth', 50)

client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client["arxiv_db"]
collection = db["articles"]

# Articles with a non-null DOI; only title, authors and journal-ref are shown.
consulta = {"doi":{"$exists":True,"$ne":None}}
parametros = {"title":1, "authors":1, "journal-ref":1, "_id":0}

# The saved run of this cell ended in KeyboardInterrupt, presumably because
# sorting by title without an index makes the server scan and sort the whole
# collection. An ascending index on title lets the server walk the titles in
# order and stop after 20 matches. create_index is a no-op when the index
# already exists, so the cell can be re-run safely.
collection.create_index("title")

# Sort first, then limit: the server applies them in this order regardless of
# how the calls are chained, so writing it this way matches what happens.
resultados = list(collection.find(consulta, parametros).sort("title", 1).limit(20))

print(pd.DataFrame(resultados))

KeyboardInterrupt: 

e. Devolver los títulos y la fecha de la primera versión (versions.created) de los artículos
enviados entre los años 2010 y 2015. Mostrar solo esos campos y limitar a los primeros 15
resultados.

In [None]:
from pymongo import MongoClient
import pandas as pd

pd.set_option('display.max_colwidth', 50)

# Connect through the replica-set URI and select the articles collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.get_database("arxiv_db")
collection = db.get_collection("articles")

# The first version of an article is element 0 of the versions array; its
# "created" text is matched against the years 2010 to 2015.
consulta = {"versions.0.created":{"$regex":"2010|2011|2012|2013|2014|2015"}}
# The whole versions array is returned along with the title.
parametros = {"title":1, "versions":1,"_id":0}

cursor = collection.find(consulta, parametros, limit=15)
resultados = list(cursor)

def extract_year(versions):
    """Return the creation timestamp of the first entry in ``versions``.

    Despite its name, this does not return only the year: it returns the text
    after the last comma of ``versions[0]['created']``, stripped of
    surrounding whitespace (e.g. "Mon, 4 Jan 2010 18:01:41 GMT" becomes
    "4 Jan 2010 18:01:41 GMT").

    Args:
        versions: List of version dicts as stored in the document. Any other
            value (None, NaN from a missing DataFrame cell, an empty list) is
            treated as "no versions".

    Returns:
        The trimmed timestamp string, or None when there is no first version
        or it has no string 'created' field.
    """
    # Guard the container type first: a missing cell in a DataFrame is NaN,
    # which is truthy but cannot be indexed.
    if not isinstance(versions, (list, tuple)) or not versions:
        return None
    first = versions[0]
    if not isinstance(first, dict):
        return None
    created = first.get('created')
    if not isinstance(created, str):
        return None
    return created.split(',')[-1].strip()

# Explicit columns keep the frame well-formed even when the query returns no
# documents (otherwise df['versions'] would raise KeyError).
df = pd.DataFrame(resultados, columns=['title', 'versions'])
# extract_year returns the first version's whole creation timestamp (day,
# month, year and time), so the column is labelled 'created', not 'year'.
df['created'] = df['versions'].apply(extract_year)

print(df[['title', 'created']])

                                                title                     year
0   A Comprehensive Analysis of Uncertainties Affe...  3 Jan 2010 19:43:29 GMT
1   Testing product states, quantum Merlin-Arthur ...  4 Jan 2010 18:01:41 GMT
2   A landscape of non-supersymmetric AdS vacua on...  4 Jan 2010 13:51:46 GMT
3               Jet Shapes and Jet Algorithms in SCET  4 Jan 2010 20:56:57 GMT
4               Mu-Tau Production at Hadron Colliders  4 Jan 2010 04:10:52 GMT
5   Strong Constraints to the Putative Planet Cand...  1 Jan 2010 00:07:58 GMT
6            Bayesian Methods and Universal Darwinism  4 Jan 2010 17:01:57 GMT
7   Rigid Symmetries and Conservation Laws in Non-...  4 Jan 2010 12:30:50 GMT
8   New identities involving q-Euler polynomials o...  4 Jan 2010 15:34:13 GMT
9                   Nonmeasurability in Banach spaces  4 Jan 2010 16:31:26 GMT
10  Arrested phase separation in reproducing bacte...  4 Jan 2010 19:56:03 GMT
11  News on PHOTOS Monte Carlo: gamma^* -> pi^+ pi..

f. Devolver los títulos, comentarios y reportes técnicos (report-no) de artículos que tengan comentarios definidos y no nulos. Mostrar solo esos campos, ordenando por fecha de actualización (update_date) en orden descendente. Limitar a 10 resultados.

In [None]:
from pymongo import MongoClient
import pandas as pd

pd.set_option('display.max_colwidth', 50)

# Connect through the replica-set URI and select the articles collection.
client = MongoClient(
    'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
)
db = client.get_database("arxiv_db")
collection = db.get_collection("articles")

# Filter: the comments field is present and not null.
consulta = {"comments": {"$exists": True, "$ne": None}}
# Projection: title, comments and report-no, without _id.
parametros = {"title": 1, "comments": 1, "report-no": 1, "_id": 0}

# Newest update_date first, at most 10 documents.
cursor = collection.find(
    consulta,
    parametros,
    sort=[("update_date", -1)],
    limit=10,
)
resultados = list(cursor)

df = pd.DataFrame(resultados)
print(df)

                                               title  \
0  The Filippov characteristic flow for the aggre...   
1  A currently true statement G of the form "$\ex...   
2  Effect of Tensile Strain in GaN Layer on the B...   
3     A Proof of the truth of the Riemann hypothesis   
4               The nature of electromagnetic energy   
5  On the global existence and blowup of smooth s...   
6  Sturm-Liouville and Carroll: at the heart of t...   
7  Conditions for Solvability in Chemical Reactio...   
8  Quantum Entanglement Dynamics of Spacetime and...   
9            The information-complete quantum theory   

                                            comments report-no  
0                                           33 pages      None  
1  presented at the 25th Conference Applications ...      None  
2                                 3 pages, 4 figures      None  
3                                         in Russian      None  
4  The previous version had a number of mistakes,...      

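## Parte 3

Inserción, borrado y recreación de un documento, verificando el resultado en cada nodo del replica set.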

In [14]:
from pymongo import MongoClient
import pymongo
import time

# Connect through the replica-set URI and ping once before doing any write.
# The failure is re-raised because nothing below can work without a connection.
try:
    client = MongoClient(
        'mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set'
    )
    result = client.admin.command('ping')
    print("Ping result:", result)
    print("Conexión exitosa a MongoDB")
except Exception as e:
    print("Error:", e)
    raise

db = client["arxiv_db"]
collection = db["articles"]

# Seconds to sleep after a write before reading the individual nodes.
REPLICATION_WAIT_SECONDS = 5

# One URI per replica-set member, used to query each node on its own.
nodes = [
    'mongodb://localhost:30001',
    'mongodb://localhost:30002',
    'mongodb://localhost:30003'
]

# Document to work with.
# NOTE(review): _id is a plain string here, not an ObjectId - confirm that is
# intended.
document = {
    "_id": "684a528d934c09df034b4c5b",
    "id": "0704.0002",
    "submitter": "Louis Theran",
    "authors": "Ileana Streinu and Louis Theran",
    "title": "Sparsity-certifying Graph Decompositions",
    "comments": "To appear in Graphs and Combinatorics",
    "categories": "math.CO cs.CG",
    "license": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/",
    # The backslash is written as "\\" so the literal is a valid escape; the
    # stored value is unchanged.
    "abstract": "  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors,…",
    "update_date": "2008-12-13"
}


def _print_document_on_each_node(node_uris, doc_id, stage):
    """Look up ``doc_id`` on every node separately and print what each returns.

    Args:
        node_uris: MongoDB URIs, one per replica-set member.
        doc_id: ``_id`` value of the document to look for.
        stage: Word inserted into the printed messages (e.g. "deletion").
    """
    for node in node_uris:
        try:
            # directConnection=True makes the client talk to this host only;
            # otherwise it connects to the whole replica set the host belongs
            # to. The context manager closes the client after the lookup.
            with pymongo.MongoClient(node, directConnection=True) as node_client:
                found_doc = node_client["arxiv_db"]["articles"].find_one({"_id": doc_id})
            print(f"Document from node {node} after {stage}: {found_doc}")
        except Exception as e:
            # Report per node so one unreachable member does not hide the
            # state of the others.
            print(f"Error verifying document {stage} in node {node}:", e)


# Check if the document exists and delete it if necessary, so the insert below
# does not fail on a duplicate _id.
try:
    existing_doc = collection.find_one({"_id": document["_id"]})
    if existing_doc:
        print("Document already exists. Deleting it before insertion:", existing_doc)
        collection.delete_one({"_id": document["_id"]})
except Exception as e:
    print("Error checking or deleting existing document:", e)

# Insert the document into the collection.
try:
    insert_result = collection.insert_one(document)
    print(f"Inserted document ID: {insert_result.inserted_id}")
except Exception as e:
    print("Error inserting document:", e)

# Print the inserted document as seen through the replica-set client.
try:
    found_doc = collection.find_one({"_id": document["_id"]})
    print("Document before deletion:", found_doc)
except Exception as e:
    print("Error finding document:", e)

# Delete the document.
try:
    delete_result = collection.delete_one({"_id": document["_id"]})
    print(f"Deleted count for _id {document['_id']}: {delete_result.deleted_count}")
except Exception as e:
    print("Error deleting document:", e)

# Wait for replication, then verify the deletion on every node.
print("Waiting for replication...")
time.sleep(REPLICATION_WAIT_SECONDS)
_print_document_on_each_node(nodes, document["_id"], "deletion")

# Recreate the document after deletion.
try:
    insert_result = collection.insert_one(document)
    print(f"Recreated document ID: {insert_result.inserted_id}")
except Exception as e:
    print("Error recreating document:", e)

# Wait for replication, then verify the recreation on every node.
print("Waiting for replication...")
time.sleep(REPLICATION_WAIT_SECONDS)
_print_document_on_each_node(nodes, document["_id"], "recreation")

Ping result: {'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1749767989, 2), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1749767989, 2)}
Conexión exitosa a MongoDB
MongoDB connection verified.
Document already exists. Deleting it before insertion: {'_id': '684a528d934c09df034b4c5b', 'id': '0704.0002', 'submitter': 'Louis Theran', 'authors': 'Ileana Streinu and Louis Theran', 'title': 'Sparsity-certifying Graph Decompositions', 'comments': 'To appear in Graphs and Combinatorics', 'categories': 'math.CO cs.CG', 'license': 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/', 'abstract': '  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors,…', 'update_date': '2008-12-13'}
Inserted document ID: 684a528d934c09df034b4c5b
Document before deletion: {'_id': '684a528d934c09df034b4c5b', 'id': '0704.0002', 'submitter': 'Louis Theran', 'authors': 'Ileana Streinu and Loui