In [1]:
from dask.distributed import Client
import dask.bag as db
import json
from pathlib import Path
import zipfile
import pandas as pd
import numpy as np

In [2]:
# Start a Dask distributed client with 8 workers x 4 threads each; the bare
# `client` expression below displays the scheduler/dashboard summary.
client = Client(n_workers=8, threads_per_worker=4)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 49676 instead


0,1
Client  Scheduler: tcp://127.0.0.1:49677  Dashboard: http://127.0.0.1:49676/status,Cluster  Workers: 8  Cores: 32  Memory: 17.18 GB


In [3]:
def gh_dask(location, subset=None):
    """Build a Dask bag of parsed metadata records from a local OpenAccess checkout.

    Parameters
    ----------
    location : str or Path
        Root directory of the checkout. Units are the sub-directories of
        ``<location>/metadata/objects``.
    subset : str or iterable of str, optional
        Unit name(s) to read. ``None`` or an empty collection selects every
        unit. Requested names with no matching unit directory are ignored.

    Returns
    -------
    dask.bag.Bag or None
        A bag with one ``json.loads``-parsed record per line of the selected
        ``*.txt.bz2`` files, or ``None`` when there is nothing to read
        (``metadata/objects`` is missing, no requested unit exists, or no
        usable file was found).
    """
    # Files at or below this size are skipped.
    # NOTE(review): presumably these are bz2 files that hold no records - confirm.
    min_size_bytes = 20

    objects_dir = Path(location) / 'metadata/objects'
    if not objects_dir.is_dir():
        return None

    # Only directories are units; stray files next to them are not.
    units = [unit_dir.name for unit_dir in objects_dir.iterdir() if unit_dir.is_dir()]

    # Normalise `subset`: a bare string means one unit, not its characters.
    if subset is None:
        requested = []
    elif isinstance(subset, str):
        requested = [subset]
    else:
        requested = list(subset)
    unit_subset = [unit for unit in requested if unit in units] if requested else units

    metadata_files = []
    for unit in unit_subset:
        unit_dir = objects_dir / unit
        # Sort within each unit so partition order does not depend on the
        # filesystem's directory-listing order.
        metadata_files += sorted(
            str(txt_bz2) for txt_bz2 in unit_dir.glob('*.txt.bz2')
            if 'index.txt' not in txt_bz2.name
            and txt_bz2.stat().st_size > min_size_bytes
        )

    if not metadata_files:
        # Same "nothing to read" outcome as an unknown location or unit.
        return None
    return db.read_text(metadata_files, compression='bz2').map(json.loads)

In [4]:
# Fetch the Smithsonian OpenAccess repository archive (master branch) as si_openaccess.zip;
# -nc leaves an existing download in place instead of fetching it again.
! wget -nc -O si_openaccess.zip https://github.com/Smithsonian/OpenAccess/archive/refs/heads/master.zip

File ‘si_openacess.zip’ already there; not retrieving.


In [5]:
# Unpack into si_openaccess/ quietly (-q) without overwriting files already extracted (-n).
! unzip -nq si_openaccess.zip -d si_openaccess

In [6]:
# Sanity-check the unzipped checkout and list the units available under metadata/objects.
gh_location = Path('si_openaccess/OpenAccess-master')
objects_dir = gh_location / 'metadata/objects'
# Fail with a clear message here rather than with a bare FileNotFoundError from iterdir().
assert objects_dir.is_dir(), f'{objects_dir} not found; run the download and unzip cells first'
units = [unit_dir.name for unit_dir in objects_dir.iterdir()]
print(units)

['NMNHANTHRO', 'NMNHFISHES', 'SC', 'NMNHBIRDS', 'NMNHINV', 'NMAfA', 'ACAH', 'NMAI', 'SIA', 'NMNHENTO', 'NMAAHC', 'HSFA', 'CHNDM', 'HAC', 'NMAH', 'OCIO_DPO3D', 'NAA', 'NPG', 'NASM', 'SAAM', 'NASMAC', 'NMNHBOTANY', 'NMNHMAMMALS', 'AAA', 'EEPA', 'AAG', 'SI', 'NMAIA', 'OFEO-SG', 'ACMA', 'NMNHMINSCI', 'HMSG', 'SIL', 'SAAMPAIK', 'CFCHFOLKLIFE', 'NMNHEDUCATION', 'FBR', 'NMNHHERPS', 'NPM', 'FSA', 'NMNHPALEO', 'FSG', 'ACM', 'NZP']


In [7]:
# Build the bag for the NMNHBOTANY unit only; the bare `b` expression below
# displays the bag's summary.
b = gh_dask('si_openaccess/OpenAccess-master', subset = ['NMNHBOTANY'])
b

dask.bag<loads, npartitions=256>

In [8]:
# Take 1000 records from the bag and pretty-print the one at index 948 as a worked example.
# NOTE(review): the reason for choosing index 948 is not recorded here.
sample_record = b.take(1000)[948]
print(json.dumps(sample_record, indent=2))

{
  "id": "edanmdm-nmnhbotany_14147093",
  "version": "",
  "unitCode": "NMNHBOTANY",
  "linkedId": "0",
  "type": "edanmdm",
  "content": {
    "descriptiveNonRepeating": {
      "record_ID": "nmnhbotany_14147093",
      "online_media": {
        "mediaCount": 1,
        "media": [
          {
            "thumbnail": "https://ids.si.edu/ids/deliveryService/id/ark:/65665/m355f22176273c403ba213879f4b8e6c6b/90",
            "idsId": "ark:/65665/m355f22176273c403ba213879f4b8e6c6b",
            "usage": {
              "access": "CC0"
            },
            "guid": "http://n2t.net/ark:/65665/m355f22176-273c-403b-a213-879f4b8e6c6b",
            "type": "Images",
            "content": "https://ids.si.edu/ids/deliveryService/id/ark:/65665/m355f22176273c403ba213879f4b8e6c6b",
            "resources": [
              {
                "label": "High-resolution TIFF",
                "url": "https://ids.si.edu/ids/download?id=NMNH-02813648.tif"
              },
              {
            

In [9]:
def extract_ids(record):
    """Flatten one NMNH Botany metadata record down to its identifiers.

    Parameters
    ----------
    record : dict
        A single NMNH Botany metadata record in highly-nested dictionary
        format. Must contain ``id``, ``title``, ``timestamp``,
        ``lastTimeUpdated`` and ``content``.

    Returns
    -------
    flattened_record : dict
        An un-nested dictionary with these keys:

        * always: ``edan_id``, ``title``, ``timestamp_unix``,
          ``lastupdate_unix``, ``Barcode``, ``specimen_guid``, ``media_count``
          (the last three are NaN when the record does not provide them);
        * when the record has media: ``media_guid`` (guid of the first media
          entry) and ``media_guid_list`` (all media guids joined with ``;``);
        * when any media resource is labelled JPEG: ``media_aws_id_list``
          (ids parsed from the resource URLs, joined with ``;``) and
          ``aws_media_count``;
        * one key per ``content.freetext.identifier`` entry, named after the
          entry's ``label``.
    """
    content = record['content']
    non_repeating = content.get('descriptiveNonRepeating', {})
    online_media = non_repeating.get('online_media', {})

    flattened_record = {
        'edan_id': record['id'],
        'title': record['title'],
        'timestamp_unix': record['timestamp'],
        'lastupdate_unix': record['lastTimeUpdated'],
        # Placeholder so the key always exists (and keeps its position);
        # a 'Barcode' identifier below overwrites it when present.
        'Barcode': np.nan,
        'specimen_guid': non_repeating.get('guid', np.nan),
        'media_count': float(online_media.get('mediaCount', np.nan)),
    }

    media = online_media.get('media', [])
    if media:
        flattened_record['media_guid'] = media[0]['guid']
        # Walk every media entry. An entry without 'resources' still
        # contributes its guid, and must not hide JPEG ids on later entries.
        guid_list = [media_record['guid'] for media_record in media]
        aws_id_list = []
        for media_record in media:
            for media_resource in media_record.get('resources', []):
                if 'JPEG' not in media_resource['label']:
                    continue
                # URLs look like '...download?id=NMNH-02813648.jpg'; keep the
                # part after the first '=' and before the first '.'.
                url_parts = media_resource['url'].split('=')
                if len(url_parts) < 2:
                    # No '=' in the URL, so there is no id to extract.
                    continue
                aws_id_list.append(url_parts[1].split('.')[0])
        flattened_record['media_guid_list'] = ';'.join(guid_list)
        if aws_id_list:
            flattened_record['media_aws_id_list'] = ';'.join(aws_id_list)
            flattened_record['aws_media_count'] = len(aws_id_list)

    # Each identifier becomes its own key; if a label repeats within one
    # record, the last occurrence wins.
    for identifier in content.get('freetext', {}).get('identifier', []):
        flattened_record[identifier['label']] = identifier['content']

    return flattened_record

In [10]:
# Check the flattened output for the sample record.
print(extract_ids(sample_record))

{'edan_id': 'edanmdm-nmnhbotany_14147093', 'title': 'Thymus calcareus Klokov & Des.-Shost.', 'timestamp_unix': 1618320456, 'lastupdate_unix': 1618320456, 'Barcode': '02813648', 'specimen_guid': 'http://n2t.net/ark:/65665/332243886-8302-43ae-bbb6-047b404c2839', 'media_count': 1.0, 'media_guid': 'http://n2t.net/ark:/65665/m355f22176-273c-403b-a213-879f4b8e6c6b', 'media_guid_list': 'http://n2t.net/ark:/65665/m355f22176-273c-403b-a213-879f4b8e6c6b', 'media_aws_id_list': 'NMNH-02813648', 'aws_media_count': 1, 'USNM Number': '2410636'}


In [11]:
# Flatten every record in the bag and collect the results into local memory.
botany_ids = (b.map(extract_ids).compute())


In [12]:
# One row per flattened record; a key missing from a record becomes NaN in its row.
botany_df = pd.DataFrame(botany_ids)
botany_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4199051 entries, 0 to 4199050
Data columns (total 13 columns):
 #   Column             Dtype  
---  ------             -----  
 0   edan_id            object 
 1   title              object 
 2   timestamp_unix     int64  
 3   lastupdate_unix    int64  
 4   Barcode            object 
 5   specimen_guid      object 
 6   media_count        float64
 7   media_guid         object 
 8   media_guid_list    object 
 9   media_aws_id_list  object 
 10  aws_media_count    float64
 11  USNM Number        object 
 12  Other Numbers      object 
dtypes: float64(2), int64(2), object(9)
memory usage: 416.5+ MB


In [13]:
# Convert the Unix-epoch (seconds) columns to pandas datetimes, adding the new
# columns to botany_df in place, then preview raw and converted values side by side.
botany_df['timestamp_dt'] = pd.to_datetime(botany_df['timestamp_unix'], unit = 's')
botany_df['lastupdate_dt'] = pd.to_datetime(botany_df['lastupdate_unix'], unit = 's')
botany_df[['timestamp_unix','timestamp_dt','lastupdate_unix','lastupdate_dt']].head(20)

Unnamed: 0,timestamp_unix,timestamp_dt,lastupdate_unix,lastupdate_dt
0,1618320530,2021-04-13 13:28:50,1618320530,2021-04-13 13:28:50
1,1618320531,2021-04-13 13:28:51,1618320531,2021-04-13 13:28:51
2,1618320531,2021-04-13 13:28:51,1618320531,2021-04-13 13:28:51
3,1618320533,2021-04-13 13:28:53,1618320533,2021-04-13 13:28:53
4,1618320534,2021-04-13 13:28:54,1618320534,2021-04-13 13:28:54
5,1618320534,2021-04-13 13:28:54,1618320534,2021-04-13 13:28:54
6,1618320534,2021-04-13 13:28:54,1618320534,2021-04-13 13:28:54
7,1618320534,2021-04-13 13:28:54,1618320534,2021-04-13 13:28:54
8,1618320534,2021-04-13 13:28:54,1618320534,2021-04-13 13:28:54
9,1618320235,2021-04-13 13:23:55,1618320235,2021-04-13 13:23:55


In [14]:
# Write the table as tab-separated text, without the index column.
botany_df.to_csv('botany_ids_and_dates.tsv', index=False, sep='\t')

In [15]:
# Show the five records with the largest timestamp_unix values.
botany_df.sort_values('timestamp_unix', ascending=False).head().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_16289832',
  'title': 'Parmotrema mordenii (Hale) Hale',
  'timestamp_unix': 1621934960,
  'lastupdate_unix': 1621934960,
  'Barcode': nan,
  'specimen_guid': 'http://n2t.net/ark:/65665/3a1ce0515-4135-4965-863d-436036770d5a',
  'media_count': nan,
  'media_guid': nan,
  'media_guid_list': nan,
  'media_aws_id_list': nan,
  'aws_media_count': nan,
  'USNM Number': nan,
  'Other Numbers': nan,
  'timestamp_dt': Timestamp('2021-05-25 09:29:20'),
  'lastupdate_dt': Timestamp('2021-05-25 09:29:20')},
 {'edan_id': 'edanmdm-nmnhbotany_16289382',
  'title': 'Asclepias linearis Scheele',
  'timestamp_unix': 1621934960,
  'lastupdate_unix': 1621934960,
  'Barcode': nan,
  'specimen_guid': 'http://n2t.net/ark:/65665/3d8412982-caa9-4141-adfd-f2ff79d1190d',
  'media_count': nan,
  'media_guid': nan,
  'media_guid_list': nan,
  'media_aws_id_list': nan,
  'aws_media_count': nan,
  'USNM Number': nan,
  'Other Numbers': nan,
  'timestamp_dt': Timestamp('2021-05-25 09:

In [16]:
# Among rows that have a media_count, show the five with the smallest timestamp_unix
# values (descending sort, then tail).
botany_df[pd.notnull(botany_df['media_count'])].sort_values('timestamp_unix', ascending=False).tail().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_2098230',
  'title': 'Lupinus crucis-viridis C.P. Sm.',
  'timestamp_unix': 1581932823,
  'lastupdate_unix': 1581932766,
  'Barcode': '00003171',
  'specimen_guid': 'http://n2t.net/ark:/65665/39306a5eb-359d-4224-a444-c77a21ca5b00',
  'media_count': 1.0,
  'media_guid': 'http://n2t.net/ark:/65665/m3fc87c872-11e7-4809-8c21-8a1b49c98d5f',
  'media_guid_list': 'http://n2t.net/ark:/65665/m3fc87c872-11e7-4809-8c21-8a1b49c98d5f',
  'media_aws_id_list': 'NMNH-00003171',
  'aws_media_count': 1.0,
  'USNM Number': '1850675',
  'Other Numbers': 'fiche number : 0392/D03',
  'timestamp_dt': Timestamp('2020-02-17 09:47:03'),
  'lastupdate_dt': Timestamp('2020-02-17 09:46:06')},
 {'edan_id': 'edanmdm-nmnhbotany_2079000',
  'title': 'Lupinus semiprostratus C.P. Sm.',
  'timestamp_unix': 1581932823,
  'lastupdate_unix': 1581932766,
  'Barcode': '00003399',
  'specimen_guid': 'http://n2t.net/ark:/65665/38c924c7d-e479-40a7-bb1c-bbe19da6fdfa',
  'media_count': 1.0,
  'medi

In [17]:
# Show the five records with the highest aws_media_count.
botany_df.sort_values('aws_media_count', ascending=False).head().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_10335133',
  'title': 'Merostachys sp.',
  'timestamp_unix': 1601974850,
  'lastupdate_unix': 1601974838,
  'Barcode': nan,
  'specimen_guid': 'http://n2t.net/ark:/65665/35da0f2cd-5e92-411a-8013-d4d5737229d8',
  'media_count': 19.0,
  'media_guid': 'http://n2t.net/ark:/65665/m36506b79a-5c03-407a-bb13-fa5c80169a6b',
  'media_guid_list': 'http://n2t.net/ark:/65665/m36506b79a-5c03-407a-bb13-fa5c80169a6b;http://n2t.net/ark:/65665/m35a5cbe4c-3d33-4a8d-ac16-181c1fa957f8;http://n2t.net/ark:/65665/m3ab587aa9-7a6a-4d12-ad5b-d2fb4f39682e;http://n2t.net/ark:/65665/m3474324a5-7662-4cb5-bb4f-296c60ce33b8;http://n2t.net/ark:/65665/m3b4646c56-f856-4639-994b-b8879089761f;http://n2t.net/ark:/65665/m3027269b8-826d-4387-a198-7d2ed304ef33;http://n2t.net/ark:/65665/m35558fadf-4363-4005-9b4a-e8d58911441f;http://n2t.net/ark:/65665/m319f86801-ef0d-4819-ac79-4dca2c0fc537;http://n2t.net/ark:/65665/m31c76bbe4-afcd-4a39-866e-c71ec90d0657;http://n2t.net/ark:/65665/m3e84e46f0-bdea-4

In [18]:
# Compare the total number of JPEG ids extracted from resource URLs with the
# total media count reported by the records themselves.
print(botany_df['aws_media_count'].sum())
print(botany_df['media_count'].sum())

2385039.0
2554843.0
