# Dimensions Data

This notebook demonstrates reading a large amount of data out of the database and writing it to a CSV file.

In [6]:
import os
import dotenv
import sqlalchemy

from sqlalchemy import text

# Load variables from a local .env file (if one exists) into the process environment,
# so the database password can be supplied without hardcoding it in the notebook.
dotenv.load_dotenv()

db_password = os.environ.get("DB_PASSWORD")
db_name = 'rialto_20250331230743'
db_user = 'analyst'

# Build the URL from its components rather than with string formatting: URL.create
# escapes special characters in the password (e.g. "@", ":", "/"), and a missing
# password is simply omitted instead of being sent as the literal string "None".
db_url = sqlalchemy.engine.URL.create(
    "postgresql",
    username=db_user,
    password=db_password,
    host="localhost",
    port=9999,
    database=db_name,
)

engine = sqlalchemy.create_engine(db_url)

We want to select the Dimensions data along with some publication-level data:

In [18]:
# Select every publication that has a DOI, combining plain publication columns with
# fields pulled out of the Dimensions JSON document stored in publication.dim_json.
#
# All JSON fields are extracted with ->> (returns text) rather than -> (returns
# json/jsonb). psycopg2, the default driver for postgresql:// URLs, decodes json/jsonb
# results into Python objects, and the csv module would then write lists and dicts
# using their Python repr (single quotes, None/True), which is not valid JSON.
# With ->> scalar values arrive as plain text and arrays/objects as JSON text, which
# is also how the mesh_terms column was already being extracted.
query = """
SELECT
    publication.doi,
    publication.title,
    publication.pub_year,
    publication.dim_json->>'id' AS dim_publication_id,
    publication.dim_json->>'pmid' AS pmid,
    publication.dim_json->>'publisher' AS publisher,
    publication.dim_json->'journal'->>'title' AS journal_title,
    publication.dim_json->>'mesh_terms' AS mesh,
    publication.dim_json->>'authors' AS authors,
    publication.dim_json->>'concepts' AS dim_concepts,
    publication.dim_json->>'category_rcdc' AS dim_category_rcdc,
    publication.dim_json->>'abstract' AS abstract
FROM
    publication
WHERE
    publication.doi IS NOT NULL
"""

Now we create a CSV output file to write to:

In [20]:
import csv

# Column order of the export; the names match the column aliases in the SELECT statement.
csv_fieldnames = [
    "doi",
    "title",
    "pub_year",
    "dim_publication_id",
    "pmid",
    "publisher",
    "journal_title",
    "mesh",
    "authors",
    "dim_concepts",
    "dim_category_rcdc",
    "abstract",
]

# newline='' is required by the csv module so that it controls line endings itself;
# without it, newlines embedded in quoted fields are not handled correctly and an
# extra "\r" is added on platforms that write "\r\n". The encoding is set explicitly
# so the output does not depend on the platform's default encoding.
#
# The handle is kept in a variable (instead of being passed anonymously to DictWriter)
# so it can be closed once writing is finished: until csv_file.close() is called, the
# last buffered rows may not have been written to disk.
csv_file = open('dimensions.csv', 'w', newline='', encoding='utf-8')

writer = csv.DictWriter(csv_file, fieldnames=csv_fieldnames)

# Write the header row so the columns in the file are self-describing.
writer.writeheader()

Now we execute the query, being careful to use the `yield_per` execution option to get results in batches of 1000, so that not all the results are pulled into memory at once.

In [None]:
# Ask SQLAlchemy to hand rows back in batches of 1000 instead of loading the whole
# result set into memory at once.
stream_options = {"yield_per": 1000}

with engine.connect() as connection:
    result = connection.execute(text(query), execution_options=stream_options)
    for row in result:
        # _asdict() already returns a fresh dict keyed by column name, which is
        # exactly the mapping DictWriter expects.
        writer.writerow(row._asdict())