In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

load_dotenv("../../.env.localhost")

# Site Client Session

In [None]:
from nmdc_runtime.util import nmdc_jsonschema, nmdc_jsonschema_validate

In [None]:
from nmdc_runtime.pipelines.core import run_config_frozen__preset_normal_env
from nmdc_runtime.resources.mongo import get_mongo


# Build the Mongo resource from the frozen "normal env" run config and keep a
# short handle to its database; later cells query collections through `mdb`.
mongo = get_mongo(run_config_frozen__preset_normal_env)
mdb = mongo.db

In [None]:
from nmdc_runtime.pipelines.core import run_config_frozen__preset_normal_env
from nmdc_runtime.resources.core import get_runtime_api_site_client


# Runtime API site client, built from the same frozen run config as `mongo`.
client = get_runtime_api_site_client(run_config_frozen__preset_normal_env)

In [None]:
# import os

# from nmdc_runtime.resources.core import RuntimeApiSiteClient

# client = RuntimeApiSiteClient(
#     base_url="https://drs.microbiomedata.org",
#     site_id=os.getenv("API_SITE_ID"),
#     client_id=os.getenv("API_SITE_CLIENT_ID"),
#     client_secret=os.getenv("API_SITE_CLIENT_SECRET"),
# )

# What fraction of ID authorities in the biosciences use digits for the main body of their IDs?

In [None]:
import requests

# Walk the identifiers.org namespace registry page by page, following each
# response's `_links.next.href` until a page carries no `next` link.
results = []
url = "https://registry.api.identifiers.org/restApi/namespaces"
while url is not None:
    rv = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error rather than on a confusing KeyError below.
    rv.raise_for_status()
    page = rv.json()  # decode the body once and reuse it for both lookups
    results.extend(page['_embedded']['namespaces'])
    url = page['_links'].get('next', {}).get('href')

In [None]:
len(results)

In [None]:
'12345'[-3:]

In [None]:
# Count registry entries whose sample ID ends in an integer (its last three
# characters parse with int()); print the entries that do not.
n_integer_ends = 0

for i, r in enumerate(results):
    try:
        int(r['sampleId'][-3:])
    except (ValueError, TypeError):
        # ValueError: the tail is not an integer literal.
        # TypeError: sampleId is not sliceable (e.g. None).
        # Anything else (Ctrl-C, a missing key) is allowed to propagate.
        print(i+1, r['prefix'])
        print(r['pattern'])
        print(r['sampleId'])
        print()
    else:
        n_integer_ends += 1

In [None]:
n_integer_ends

In [None]:
len("11111111111111111111111111111111111111")

In [None]:
567  + 38

In [None]:
# NOTE(review): 567, 38 and 750 are hard-coded; presumably copied from earlier
# cell outputs (e.g. n_integer_ends and len(results)) — confirm, or compute
# the ratio from those variables so it stays correct on a re-run.
(567 + 38) / 750

# aggregated file counts and sizes by data type

In [None]:
# Collect the _id of every data object that carries a data_object_type.
typed_object_docs = mdb.data_object_set.find(
    {"data_object_type": {"$exists": True}}, ["_id"]
)
ids = [typed_doc["_id"] for typed_doc in typed_object_docs]

In [None]:
# Per data_object_type: summed file_size_bytes and number of data objects.
stats_pipeline = [
    {"$match": {"data_object_type": {"$exists": True}}},
    {
        "$group": {
            "_id": "$data_object_type",
            "size_total": {"$sum": "$file_size_bytes"},
            "count": {"$sum": 1},
        }
    },
]
stats_docs = list(mdb.data_object_set.aggregate(stats_pipeline))

In [None]:
# Attach the human-readable name and description of each data object type,
# looked up in the file_type_enum collection by the type id.
for sdoc in stats_docs:
    edoc = mdb.file_type_enum.find_one({"id": sdoc["_id"]})
    if edoc is None:
        # No enum entry for this type: fall back to the raw type id instead of
        # failing with "'NoneType' object is not subscriptable".
        edoc = {}
    sdoc["name"] = edoc.get("name", sdoc["_id"])
    sdoc["description"] = edoc.get("description", "no description available")

In [None]:
# Report each data object type with its total size and object count.
# The size is divided by 1024**4 and shown with three significant digits.
BYTES_PER_TB = 1024 ** 4

for stats in stats_docs:
    size_tb = stats["size_total"] / BYTES_PER_TB
    print(f'{stats["name"]} ({stats["description"]}):')
    print(f"total size (TB): {size_tb:.3}")
    print(f'count: {stats["count"]}')
    print()

# Manage User/Client Credentials

In [None]:
from getpass import getpass

from nmdc_runtime.api.core.auth import get_password_hash

# Create or replace the user record (upsert). The password is prompted for at
# run time so that no plaintext credential is stored in the notebook.
# NOTE(review): a password was previously hard-coded in this cell; treat it as
# compromised and rotate it.
username = "corilo"
rv = mdb.users.replace_one(
    {"username": username},
    {
        "username": username,
        "hashed_password": get_password_hash(getpass(f"New password for {username}: ")),
    },
    upsert=True
)
rv

# Test operations

In [None]:
from datetime import datetime, timezone, timedelta

from nmdc_runtime.api.core.idgen import generate_one_id
from nmdc_runtime.api.models.operation import Operation

# Insert an operation whose expire_time already lies one day in the past.
dt_past = datetime.now(timezone.utc) - timedelta(days=1)

expired_operation = Operation(id=generate_one_id(mdb, "op"), expire_time=dt_past)
mdb.operations.insert_one(expired_operation.dict(exclude_unset=True))

# Register objects with existing URLs
The mongoexports are available at https://portal.nersc.gov/project/m3408/meta/mongoexports/2021-07/

In [None]:
from pprint import pprint

# Properties of the schema's Database definition that are not actually
# collections and must be left out of the collection-name set.
non_collection_properties = {
    "activity_set",
    "nmdc_schema_version",
    "date_created",
    "etl_software_version",
}

database_properties = nmdc_jsonschema["definitions"]["Database"]["properties"]
nmdc_db_collection_names = set(database_properties) - non_collection_properties

pprint(nmdc_db_collection_names)

```bash
scp -r dtn01.nersc.gov:/project/projectdirs/m3408/www/meta/mongoexports/2021-07/ \
    /Users/dwinston/nmdc_files/mongoexports/
```

In [None]:
from glob import glob
import os  # previously used here without being imported in this or any earlier cell

import requests
from tqdm.notebook import tqdm

# DrsObjectIn was previously first imported in a later cell, so this cell
# failed with NameError on a fresh kernel.
from nmdc_runtime.api.models.object import DrsObjectIn
from nmdc_runtime.util import drs_metadata_for

# For every locally mirrored export file, confirm that the same file is served
# at the portal URL with a matching size, then stage a DrsObjectIn for it.
paths = glob(os.path.expanduser("~/nmdc_files/mongoexports/2021-07/*"))
url_base = "https://portal.nersc.gov/project/m3408/meta/mongoexports/2021-07"
objects = []
for path in tqdm(paths):
    meta = drs_metadata_for(path)
    url = f'{url_base}/{meta["name"]}'

    # HEAD only: the headers are enough to check availability and size.
    # NOTE(review): "gzip;q=0" presumably asks the server not to compress, so
    # that Content-Length reflects the stored file size — confirm.
    rv = requests.head(
        url, allow_redirects=True, timeout=5, headers={"Accept-Encoding": "gzip;q=0"}
    )
    if not rv.status_code == 200:
        raise Exception(f"url {url} not OK")

    try:
        size_bytes = int(rv.headers['Content-Length'])
    except KeyError:
        raise Exception(f"no content-length response for {url}")

    if size_bytes != meta["size"]:
        raise Exception(f'size of local file {path} ({meta["size"]}) does not match size at {url} ({size_bytes})')

    objects.append(DrsObjectIn(
        access_methods=[{"access_url": {"url": url}}],
        **meta
    ))

In [None]:
import json  # previously first imported in a later cell; needed here on a fresh kernel

# Register every staged object through the site client. Each model is
# round-tripped through its JSON serialization to obtain a plain dict.
rvs = []
for doc in tqdm([json.loads(o.json(exclude_unset=True)) for o in objects]):
    rvs.append(client.create_object(doc))

# Register a bundle

In [None]:
# checksum computed over sorted concatenation of the corresponding checksums of its top-level contents
# size is cumulative size of contents
# created_time must be >= max of contents created_time values

from glob import glob
import os
from pathlib import Path

from toolz import dissoc

paths = glob(os.path.expanduser("~/nmdc_files/mongoexports/2021-07/*"))
names = [Path(p).name for p in paths]

# For each file name, fetch the most recently created registered object and
# drop Mongo's internal "_id" from it.
bundle_content_objects = []
for n in names:
    # `mdb` is the database handle defined near the top of the notebook; the
    # bare name `db` used here before is never assigned in any cell.
    registered = mdb.objects.find_one(
        filter={"name": n},
        projection=["id", "name", "size", "checksums", "created_time"],
        sort=[("created_time", -1)]
    )
    if registered is None:
        # Name the missing file instead of failing inside dissoc() on None.
        raise Exception(f"no registered object named {n}; register it before bundling")
    bundle_content_objects.append(dissoc(registered, "_id"))

In [None]:
# Gather the sha-256 checksum of every bundled object; each object must
# contribute exactly one for the bundle checksum to be computable.
checksums = []
for content_object in bundle_content_objects:
    for checksum_entry in content_object["checksums"]:
        if checksum_entry["type"] == "sha-256":
            checksums.append(checksum_entry["checksum"])

if len(checksums) != len(bundle_content_objects):
    raise Exception("Contents of bundle must have sha-256 checksums to compute bundle checksum")

In [None]:
from nmdc_runtime.api.core.util import hash_from_str

# Bundle checksum: sha256 over the member checksums, sorted and concatenated.
sorted_checksums = sorted(checksums)
checksum = hash_from_str("".join(sorted_checksums), algo="sha256")

In [None]:
# Bundle size is the cumulative size of its contents.
size = sum(o["size"] for o in bundle_content_objects)

In [None]:
# Using the latest member created_time satisfies the rule that a bundle's
# created_time must be >= the max created_time of its contents.
created_time = max(o["created_time"] for o in bundle_content_objects)

In [None]:
checksum, size, created_time

In [None]:
from nmdc_runtime.api.core.util import pick
from nmdc_runtime.api.models.object import DrsObjectIn

# The bundle references each member by its "id" and "name" only.
bundle_contents = [pick(["id", "name"], member) for member in bundle_content_objects]

# Bundle-level checksum, size and created_time come from the cells above.
drs_bundle_in = DrsObjectIn(
    name="2021-07-nmdc-dbexport",
    description="bzip2-encoded JSON Lines mongoexports of the NMDC database",
    checksums=[{"type": "sha-256", "checksum": checksum}],
    contents=bundle_contents,
    created_time=created_time,
    size=size,
)

In [None]:
import json
from pprint import pprint

pprint(json.loads(drs_bundle_in.json(exclude_unset=True)))

In [None]:
rv = client.create_object(json.loads(drs_bundle_in.json(exclude_unset=True)))

In [None]:
rv.json()

# Register objects by S3 upload and access IDs

In [None]:
from pathlib import Path

from dagster import build_solid_context
from nmdc_runtime.solids.core import local_file_to_api_object as lftao

# Ad hoc solid context wired to the notebook's Mongo resource and site client.
context = build_solid_context(
    resources={"mongo": mongo, "runtime_api_site_client": client}
)

storage_path = "../src/data/2021-07-02-study-changes.csv"


def local_file_to_api_object(file_info):
    """Run the `local_file_to_api_object` solid on `file_info`.

    Thin wrapper that supplies the notebook-level `context` so the solid can
    be called like a plain function.
    """
    return lftao(context, file_info)

obj = local_file_to_api_object({"storage_path": storage_path, "mime_type": 'text/csv'})

# `mdb` is the database handle defined near the top of the notebook; the bare
# name `db` used here before is never assigned in any cell.
doc = mdb.objects.find_one({"id": obj["id"]})
assert doc is not None, f'object {obj["id"]} was not found in the objects collection'
assert doc["name"] == Path(storage_path).name

# Upload object and let sensor-triggered pipeline register object

In [None]:
!echo '{"hello": "donny"}' > test.json

In [None]:
# Ask the site client to set up an upload of test.json; the JSON response is
# used below as the operation `op` (its metadata carries the upload URL).
rv = client.put_object_in_site({"mime_type": "application/json", "name": "test.json"})

In [None]:
op = rv.json()

In [None]:
op

In [None]:
from nmdc_runtime.util import put_object

# Send the local file to the URL given in the operation's metadata.
rv = put_object("test.json", op["metadata"]["url"])
rv

In [None]:
from nmdc_runtime.util import drs_object_in_for

result = drs_object_in_for("test.json", op)

In [None]:
result

In [None]:
import json
from pprint import pprint

op_patch = {"done": True, "result": result}

In [None]:
rv = client.update_operation(op["id"], op_patch)

In [None]:
pprint(rv.json())

# S3 init

In [None]:
from functools import lru_cache
import os

import boto3

# The bucket name is read from the API_SITE_ID environment variable (None if unset).
API_SITE_BUCKET = os.getenv("API_SITE_ID")


@lru_cache
def get_s3_client():
    """Return an S3 client configured from the DO_* environment variables.

    Because the function takes no arguments, `lru_cache` makes it build the
    client on the first call and hand back the same object afterwards.
    """
    s3_settings = {
        "region_name": os.getenv("DO_REGION_NAME"),
        "endpoint_url": os.getenv("DO_ENDPOINT_URL"),
        "aws_access_key_id": os.getenv("DO_SPACES_KEY"),
        "aws_secret_access_key": os.getenv("DO_SPACES_SECRET"),
    }
    return boto3.session.Session().client("s3", **s3_settings)


def presigned_url_to_put(
    key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
):
    """Return a presigned URL for a `put_object` call on `bucket`/`key`.

    Args:
        key: Object key to upload to.
        client: S3 client to sign with; defaults to the cached client from
            `get_s3_client()` (previously the `None` default was dereferenced
            and raised AttributeError).
        mime_type: Value for the upload's ContentType; omitted from the signed
            parameters when None, rather than being sent as `None`.
        bucket: Target bucket; defaults to API_SITE_BUCKET.
        expires_in: Passed through as `ExpiresIn`.
    """
    if client is None:
        client = get_s3_client()
    params = {"Bucket": bucket, "Key": key}
    if mime_type is not None:
        params["ContentType"] = mime_type
    return client.generate_presigned_url(
        ClientMethod="put_object",
        Params=params,
        ExpiresIn=expires_in,
    )

s3client = get_s3_client()

# Show every bucket visible to these credentials, then the keys in the site bucket.
response = s3client.list_buckets()
for space in response['Buckets']:
    print(space['Name'])
print(f"\nusing Bucket {API_SITE_BUCKET}\n")
response = s3client.list_objects(Bucket=API_SITE_BUCKET)
# The listing of an empty bucket has no 'Contents' key, so default to an empty
# list. The loop variable is not called `obj`, to avoid clobbering the `obj`
# created by an earlier cell.
for s3_object in response.get('Contents', []):
    print(s3_object['Key'])

# GSP schema  / Cordra stuff

Need "id" in payload, e.g.:
```
"results": [
    {
      "id": "test/activity",
      "type": "Schema",
      "content": {
        "name": "Activity",
        "schema": collschemas["activity_set"]
      }
    }
  ]
```

In [None]:
from time import time
import os

# Start timestamp for this section.
# NOTE(review): `tic` is not read again in the visible cells — confirm it is still needed.
tic = time()

# Load additional environment variables from ~/.nmdc_mongo.env.
from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

In [None]:
# Point NMDC_JSON_SCHEMA_FILE at the GSP schema file on the current user's
# Desktop (resolved via the home directory instead of one hard-coded account).
# NOTE(review): presumably this must be set before `nmdc_mongo` is imported
# in the next cell — confirm.
os.environ["NMDC_JSON_SCHEMA_FILE"] = os.path.expanduser("~/Desktop/nmdc.schema.gsp.json")

In [None]:
import json
import re
from toolz import assoc_in, dissoc
from zipfile import ZipFile

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    correct_metaP_doc,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

In [None]:
###########################
# Adjustments for GSP below
###########################

defined_object_names = set(dbschema["definitions"])

# Map each definition name to the set property whose items reference it.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

existing_set_names = set(dbschema["properties"])

# Give every defined object that has no set its own array-valued set property,
# unless the proposed set name is already taken. This mutates dbschema in place.
for object_without_set in (defined_object_names - set(set_for_object_name.keys())):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }

# Type ControlledTermValue.term as a plain string and remove its `$ref`.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop() rather than del: when this cell is re-run, `$ref` is already gone and
# `del` would raise KeyError, making the cell non-idempotent.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)

# 'k' not capitalized upstream perhaps. should conform!
#dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [None]:
# Derive the per-collection schemas from the adjusted database schema.
collschemas = collschemas_for(dbschema)

# Rebuild the definition-name -> set-name map from dbschema's current
# properties, which may now include the set properties added above.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

In [None]:
sorted(collschemas.keys())

In [None]:
collschemas["biosample_set"]

In [None]:
import requests

In [None]:
from getpass import getpass

# Request an access token from the local server with the password grant.
# The admin password is prompted for instead of being stored in the notebook.
rv = requests.post("http://localhost:8080/auth/token",
              {"grant_type": "password", "username": "admin", "password": getpass("Password for admin: ")})

In [None]:
rv.json()

In [None]:
# Bearer-token header built from the token response; reused by the requests below.
auth_header = {"Authorization": f'Bearer {rv.json()["access_token"]}'}

In [None]:
auth_header

In [None]:
# Upload payload: a single "Schema" object wrapping the biosample_set schema.
biosample_schema_upload = {
    "results": [
        {
            "type": "Schema",
            "content": {
                "name": "Biosample",
                "schema": collschemas["biosample_set"],
            },
        }
    ]
}
rv = requests.post(
    "http://localhost:8080/uploadObjects",
    json=biosample_schema_upload,
    headers=auth_header,
)

In [None]:
rv

In [None]:
import json
import os

# Write an upload payload holding the Activity schema to the current user's
# Desktop (resolved via the home directory instead of one hard-coded account).
# NOTE: the next cell writes to the same file, so running both leaves only the
# later payload on disk.
with open(os.path.expanduser('~/Desktop/cordra-upload.json'),'w') as f:
    json.dump({
        "results": [
            {
              "id": "test/activity",
              "type": "Schema",
              "content": {
                "name": "Activity",
                "schema": collschemas["activity_set"]
              }
            }
          ]
    }, f, indent=2)

In [None]:
import json
import os

# Write an upload payload holding the Study schema to the current user's
# Desktop (resolved via the home directory instead of one hard-coded account).
# NOTE: this overwrites whatever the previous cell wrote to the same file.
with open(os.path.expanduser('~/Desktop/cordra-upload.json'),'w') as f:
    json.dump({
        "results": [
            {
              "id": "test/study",
              "type": "Schema",
              "content": {
                "name": "Study",
                "schema": collschemas["study_set"]
              }
            }
          ]
    }, f, indent=2)

In [None]:
# Search for objects matching type:"Schema" (%22 is a URL-encoded double quote).
rv = requests.get("http://localhost:8080/search?query=type:%22Schema%22", headers=auth_header)

In [None]:
from pprint import pprint

# Turn the search response into an upload payload by editing it in place.
template = rv.json()
# Remove the paging/size fields of the search response.
del template["pageNum"]
del template["pageSize"]
del template["size"]
# Deleting index 0 twice drops the first two results; the result that was
# third is now results[0]. This depends on the order of the search results.
del template["results"][0]
del template["results"][0]
# Re-identify the remaining schema object as a new "Document2" schema.
template["results"][0]["id"] = "test/abcd1234"
template["results"][0]["content"]["name"] = "Document2"
template["results"][0]["content"]["schema"]["title"] = "Document2"
del template["results"][0]["content"]["identifier"]

pprint(template)

In [None]:
import json
import os

# Save the edited template to the current user's Desktop (resolved via the
# home directory instead of one hard-coded account).
with open(os.path.expanduser('~/Desktop/cordra-upload.json'),'w') as f:
    json.dump(template, f, indent=2)

In [None]:
!cat /Users/dwinston/Desktop/cordra-upload.json

In [None]:
# Upload the edited template and display the response object.
rv = requests.post("http://localhost:8080/uploadObjects", json=template, headers=auth_header)
rv