In [1]:
from dask.distributed import Client
import dask.bag as db
import json
from pathlib import Path
import zipfile
import pandas as pd
import numpy as np

In [2]:
# Start a Dask distributed client with 8 worker processes of 4 threads each.
# Leaving `client` as the last expression displays its summary (scheduler
# address, dashboard link, worker/core/memory totals).
client = Client(n_workers=8, threads_per_worker=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:51988  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 17.18 GB


In [3]:
def gh_dask(location, subset=None):
    """Build a Dask bag of parsed JSON records from a local OpenAccess checkout.

    Parameters
    ----------
    location : str or pathlib.Path
        Root directory of the checkout. Unit directories are looked up under
        ``metadata/objects`` inside it.
    subset : iterable of str, optional
        Names of the unit directories to load. ``None`` or an empty iterable
        selects every unit found on disk. Names that do not exist on disk are
        silently ignored.

    Returns
    -------
    dask.bag.Bag or None
        A bag in which every element read by ``db.read_text`` from the
        selected ``*.txt.bz2`` files has been passed through ``json.loads``.
        ``None`` is returned when ``location`` is not a directory or when
        none of the requested units exist.

    Raises
    ------
    FileNotFoundError
        If ``location`` exists but has no ``metadata/objects`` directory.
    """
    gh_location = Path(location)
    if not gh_location.is_dir():
        return None

    objects_dir = gh_location / 'metadata/objects'
    units = [unit_dir.name for unit_dir in objects_dir.iterdir()]

    # Materialise the argument once so that None, tuples, sets and generators
    # are all accepted (the previous len() call rejected None and generators).
    requested = list(subset) if subset is not None else []
    if requested:
        unit_subset = [unit for unit in requested if unit in units]
    else:
        unit_subset = units
    if not unit_subset:
        return None

    metadata_files = []
    for unit in unit_subset:
        unit_dir = objects_dir / unit
        metadata_files += [
            str(txt_bz2)
            for txt_bz2 in unit_dir.glob('*.txt.bz2')
            # Skip index files and files of 20 bytes or fewer
            # (presumably empty archives -- TODO confirm the threshold).
            if 'index.txt' not in txt_bz2.name and txt_bz2.stat().st_size > 20
        ]
    return db.read_text(metadata_files, compression='bz2').map(json.loads)

In [4]:
# Download the master-branch archive of Smithsonian/OpenAccess as
# si_openaccess.zip; -nc skips the download when the file already exists.
! wget -nc -O si_openaccess.zip https://github.com/Smithsonian/OpenAccess/archive/refs/heads/master.zip

File ‘si_openacess.zip’ already there; not retrieving.


In [5]:
# Extract the archive into ./si_openaccess; -n never overwrites files that
# are already there and -q suppresses the per-file listing.
! unzip -nq si_openaccess.zip -d si_openaccess

In [7]:
# Point at the checkout produced by the wget/unzip cells above instead of a
# machine-specific absolute path, so the notebook runs from a fresh kernel on
# any machine. GitHub names the archive's top-level folder '<repo>-<branch>',
# hence 'OpenAccess-master' inside the 'si_openaccess' extraction directory.
gh_location = Path('si_openaccess') / 'OpenAccess-master'
objects_dir = gh_location / 'metadata/objects'
if not objects_dir.is_dir():
    raise FileNotFoundError(
        f'{objects_dir} not found; run the download and unzip cells first'
    )
# One sub-directory per Smithsonian unit; iteration order is filesystem-defined.
units = [unit_dir.name for unit_dir in objects_dir.iterdir()]
print(units)

['NMNHANTHRO', 'NMNHFISHES', 'NMNHBIRDS', 'NMNHINV', 'NMAfA', 'ACAH', 'NMAI', 'SIA', 'NMNHENTO', 'NMAAHC', 'HSFA', 'CHNDM', 'HAC', 'NMAH', 'NAA', 'NPG', 'NASM', 'SAAM', 'NMNHBOTANY', 'NMNHMAMMALS', 'SI', 'NMNHMINSCI', 'HMSG', 'SIL', 'CFCHFOLKLIFE', 'NMNHEDUCATION', 'FBR', 'NMNHHERPS', 'NPM', 'FSA', 'NMNHPALEO', 'FSG', 'ACM']


In [8]:
# Build a lazy bag restricted to the NMNHBOTANY unit; displaying `b` shows
# the bag's repr (including its partition count).
b = gh_dask(gh_location, subset = ['NMNHBOTANY'])
b

dask.bag<loads, npartitions=256>

In [9]:
# Pull the first 1000 records and keep the one at index 948 as a worked
# example, then pretty-print it.
# NOTE(review): the choice of 1000/948 is not explained in the notebook --
# presumably a record picked by hand; confirm it is still representative.
sample_record = b.take(1000)[948]
print(json.dumps(sample_record, indent=2))

{
  "id": "edanmdm-nmnhbotany_13072159",
  "version": "",
  "unitCode": "NMNHBOTANY",
  "linkedId": "0",
  "type": "edanmdm",
  "content": {
    "descriptiveNonRepeating": {
      "record_ID": "nmnhbotany_13072159",
      "guid": "http://n2t.net/ark:/65665/35715fbeb-7d89-40bd-a9aa-29ddf58595fd",
      "title_sort": "EUGENIA LINEARIFOLIA O BERG",
      "unit_code": "NMNHBOTANY",
      "record_link": "http://n2t.net/ark:/65665/35715fbeb-7d89-40bd-a9aa-29ddf58595fd",
      "title": {
        "label": "title",
        "content": "Eugenia linearifolia O. Berg"
      },
      "metadata_usage": {
        "access": "CC0"
      },
      "data_source": "NMNH - Botany Dept."
    },
    "indexedStructured": {
      "tax_family": [
        "Myrtaceae"
      ],
      "geoLocation": [
        {
          "L1": {
            "type": "Continent",
            "content": "South America - Neotropics"
          },
          "L2": {
            "type": "Country",
            "content": "Brazil"
          },

In [10]:
def extract_ids(record):
    """Flatten one NMNH Botany metadata record into its identifier fields.

    Parameters
    ----------
    record : dict
        A single NMNH Botany metadata record in nested dictionary format.
        The top-level keys ``id``, ``title``, ``timestamp``,
        ``lastTimeUpdated`` and ``content`` are required.

    Returns
    -------
    flattened_record : dict
        An un-nested dictionary. Always present, in this order:
        ``edan_id``, ``title``, ``timestamp_unix``, ``lastupdate_unix``,
        ``Barcode``, ``specimen_guid`` and ``media_count`` (a float, NaN when
        the record has no ``mediaCount``).

        When the record lists online media, ``media_guid`` (guid of the first
        media item) and ``media_guid_list`` (all guids joined with ``;``) are
        added. ``media_aws_id_list`` (ids joined with ``;``) and
        ``aws_media_count`` are added when at least one id could be parsed
        from the URL of a resource whose label contains ``JPEG``.

        Every entry of ``content.freetext.identifier`` adds a key named after
        its ``label`` holding its ``content``; a ``Barcode`` identifier
        therefore replaces the NaN placeholder.

    Raises
    ------
    KeyError
        If a required top-level key is missing, a media item has no ``guid``,
        or an identifier lacks ``label``/``content``.
    """
    flattened_record = {
        'edan_id': record['id'],
        'title': record['title'],
        'timestamp_unix': record['timestamp'],
        'lastupdate_unix': record['lastTimeUpdated'],
        # Placeholder so the key exists (and keeps its position) even when the
        # record carries no 'Barcode' identifier.
        'Barcode': np.nan,
    }

    content = record['content']
    non_repeating = content.get('descriptiveNonRepeating', {})
    online_media = non_repeating.get('online_media', {})

    flattened_record['specimen_guid'] = non_repeating.get('guid', np.nan)
    flattened_record['media_count'] = float(online_media.get('mediaCount', np.nan))

    media = online_media.get('media', [])
    if media:
        guid_list = [media_record['guid'] for media_record in media]
        flattened_record['media_guid'] = guid_list[0]
        # The guid list does not depend on resources, so build it for every
        # record with media -- not only when the first item has 'resources'.
        flattened_record['media_guid_list'] = ';'.join(guid_list)

        aws_id_list = []
        for media_record in media:
            for media_resource in media_record.get('resources') or []:
                if 'JPEG' not in (media_resource.get('label') or ''):
                    continue
                # The id is the text between the first '=' and the next '.'.
                url_parts = (media_resource.get('url') or '').split('=')
                if len(url_parts) < 2:
                    # No '=' in the URL: skip it rather than abort the
                    # whole computation with an IndexError.
                    continue
                aws_id_list.append(url_parts[1].split('.')[0])
        if aws_id_list:
            flattened_record['media_aws_id_list'] = ';'.join(aws_id_list)
            flattened_record['aws_media_count'] = len(aws_id_list)

    # If several identifiers share a label, the last one wins.
    for identifier in content.get('freetext', {}).get('identifier', []):
        flattened_record[identifier['label']] = identifier['content']

    return flattened_record

In [11]:
# Sanity-check the flattening on the single example record before running it
# over the whole bag.
print(extract_ids(sample_record))

{'edan_id': 'edanmdm-nmnhbotany_13072159', 'title': 'Eugenia linearifolia O. Berg', 'timestamp_unix': 1581934917, 'lastupdate_unix': 1581934858, 'Barcode': '01898202', 'specimen_guid': 'http://n2t.net/ark:/65665/35715fbeb-7d89-40bd-a9aa-29ddf58595fd', 'media_count': nan, 'USNM Number': '291219'}


In [12]:
# Apply extract_ids to every record in the bag and collect the resulting
# dicts into local memory (this is the step that actually triggers the read).
botany_ids = (b.map(extract_ids).compute())


In [13]:
# One row per flattened record; keys that a record lacks become NaN in its
# row. info() prints the column names, dtypes and memory footprint.
botany_df = pd.DataFrame(botany_ids)
botany_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3973848 entries, 0 to 3973847
Data columns (total 13 columns):
 #   Column             Dtype  
---  ------             -----  
 0   edan_id            object 
 1   title              object 
 2   timestamp_unix     int64  
 3   lastupdate_unix    int64  
 4   Barcode            object 
 5   specimen_guid      object 
 6   media_count        float64
 7   USNM Number        object 
 8   Other Numbers      object 
 9   media_guid         object 
 10  media_guid_list    object 
 11  media_aws_id_list  object 
 12  aws_media_count    float64
dtypes: float64(2), int64(2), object(9)
memory usage: 394.1+ MB


In [14]:
# Add a datetime column next to each integer timestamp column, interpreting
# the integers as seconds since the Unix epoch, then preview all four.
for unix_column, dt_column in (
    ('timestamp_unix', 'timestamp_dt'),
    ('lastupdate_unix', 'lastupdate_dt'),
):
    botany_df[dt_column] = pd.to_datetime(botany_df[unix_column], unit='s')

preview_columns = ['timestamp_unix', 'timestamp_dt', 'lastupdate_unix', 'lastupdate_dt']
botany_df[preview_columns].head(20)

Unnamed: 0,timestamp_unix,timestamp_dt,lastupdate_unix,lastupdate_dt
0,1579602961,2020-01-21 10:36:01,1579602926,2020-01-21 10:35:26
1,1579602961,2020-01-21 10:36:01,1579602948,2020-01-21 10:35:48
2,1580204960,2020-01-28 09:49:20,1580204947,2020-01-28 09:49:07
3,1580204960,2020-01-28 09:49:20,1580204947,2020-01-28 09:49:07
4,1580205061,2020-01-28 09:51:01,1580205053,2020-01-28 09:50:53
5,1580205606,2020-01-28 10:00:06,1580205585,2020-01-28 09:59:45
6,1580207061,2020-01-28 10:24:21,1580207044,2020-01-28 10:24:04
7,1579243794,2020-01-17 06:49:54,1579242667,2020-01-17 06:31:07
8,1579247274,2020-01-17 07:47:54,1579246883,2020-01-17 07:41:23
9,1579238973,2020-01-17 05:29:33,1579238512,2020-01-17 05:21:52


In [15]:
# Persist the full table as a tab-separated file, without the row index.
botany_df.to_csv('botany_ids_and_dates.tsv', index=False, sep='\t')

In [16]:
# The five rows with the largest 'timestamp_unix', shown as plain dicts.
botany_df.sort_values('timestamp_unix', ascending=False).head().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_15918739',
  'title': 'Bromus frondosus (Shear) Wooton & Standl.',
  'timestamp_unix': 1601977951,
  'lastupdate_unix': 1601977905,
  'Barcode': '04081884',
  'specimen_guid': 'http://n2t.net/ark:/65665/3b5243aed-72d1-482d-8681-31014e8ca0dc',
  'media_count': nan,
  'USNM Number': '906050',
  'Other Numbers': nan,
  'media_guid': nan,
  'media_guid_list': nan,
  'media_aws_id_list': nan,
  'aws_media_count': nan,
  'timestamp_dt': Timestamp('2020-10-06 09:52:31'),
  'lastupdate_dt': Timestamp('2020-10-06 09:51:45')},
 {'edan_id': 'edanmdm-nmnhbotany_15959480',
  'title': 'Bromus rubens L.',
  'timestamp_unix': 1601977951,
  'lastupdate_unix': 1601977905,
  'Barcode': '04085323',
  'specimen_guid': 'http://n2t.net/ark:/65665/397644d49-955e-42b2-bd25-47b474586cd7',
  'media_count': nan,
  'USNM Number': '390640',
  'Other Numbers': nan,
  'media_guid': nan,
  'media_guid_list': nan,
  'media_aws_id_list': nan,
  'aws_media_count': nan,
  'timestamp_dt': T

In [17]:
# Among rows that have a media_count, sort newest-first and take the tail,
# i.e. the five rows with the smallest 'timestamp_unix', shown as plain dicts.
(
    botany_df.loc[botany_df['media_count'].notna()]
    .sort_values('timestamp_unix', ascending=False)
    .tail()
    .to_dict(orient='records')
)

[{'edan_id': 'edanmdm-nmnhbotany_2130471',
  'title': 'Lupinus lepidus subsp. medius Detling',
  'timestamp_unix': 1581932823,
  'lastupdate_unix': 1581932766,
  'Barcode': '00003285',
  'specimen_guid': 'http://n2t.net/ark:/65665/364d97b4b-ef11-4320-9248-0ee9fa97d0d8',
  'media_count': 1.0,
  'USNM Number': '2054999',
  'Other Numbers': 'fiche number : 0394/C05',
  'media_guid': 'http://n2t.net/ark:/65665/m3c7cfca2d-0836-412b-86ff-bdf49b573638',
  'media_guid_list': 'http://n2t.net/ark:/65665/m3c7cfca2d-0836-412b-86ff-bdf49b573638',
  'media_aws_id_list': 'NMNH-00003285-000001',
  'aws_media_count': 1.0,
  'timestamp_dt': Timestamp('2020-02-17 09:47:03'),
  'lastupdate_dt': Timestamp('2020-02-17 09:46:06')},
 {'edan_id': 'edanmdm-nmnhbotany_2096410',
  'title': 'Piscidia grandifolia var. gentryi Rudd',
  'timestamp_unix': 1581932823,
  'lastupdate_unix': 1581932766,
  'Barcode': '00004336',
  'specimen_guid': 'http://n2t.net/ark:/65665/33ee2eb29-6b40-46d6-9707-d1aad7f740e9',
  'media_

In [17]:
# The five rows with the largest 'aws_media_count' (rows where it is NaN
# sort last), shown as plain dicts.
botany_df.sort_values('aws_media_count', ascending=False).head().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_10335133',
  'title': 'Merostachys sp.',
  'timestamp_unix': 1601974850,
  'lastupdate_unix': 1601974838,
  'Barcode': nan,
  'specimen_guid': 'http://n2t.net/ark:/65665/35da0f2cd-5e92-411a-8013-d4d5737229d8',
  'media_count': 19.0,
  'media_guid': 'http://n2t.net/ark:/65665/m36506b79a-5c03-407a-bb13-fa5c80169a6b',
  'media_guid_list': 'http://n2t.net/ark:/65665/m36506b79a-5c03-407a-bb13-fa5c80169a6b;http://n2t.net/ark:/65665/m35a5cbe4c-3d33-4a8d-ac16-181c1fa957f8;http://n2t.net/ark:/65665/m3ab587aa9-7a6a-4d12-ad5b-d2fb4f39682e;http://n2t.net/ark:/65665/m3474324a5-7662-4cb5-bb4f-296c60ce33b8;http://n2t.net/ark:/65665/m3b4646c56-f856-4639-994b-b8879089761f;http://n2t.net/ark:/65665/m3027269b8-826d-4387-a198-7d2ed304ef33;http://n2t.net/ark:/65665/m35558fadf-4363-4005-9b4a-e8d58911441f;http://n2t.net/ark:/65665/m319f86801-ef0d-4819-ac79-4dca2c0fc537;http://n2t.net/ark:/65665/m31c76bbe4-afcd-4a39-866e-c71ec90d0657;http://n2t.net/ark:/65665/m3e84e46f0-bdea-4

In [18]:
# Print the column totals (NaN entries are skipped by sum): first the number
# of parsed JPEG media ids, then the media count reported by the records.
for count_column in ('aws_media_count', 'media_count'):
    print(botany_df[count_column].sum())

2385039.0
2554843.0
