# Squamata Batch Update ScienceBase XML
A Jupyter Notebook used to download XML metadata files from ScienceBase, make a change to the XML, and re-upload the result.

Uses code examples found in autoSB.py, https://github.com/pbrown-usgs/science-base-automationMT/blob/first-branch/autoSB.py

**Version 1.0**
 - Changing one item in a single metadata file is possible
 - Plans for new versions include adding a list for batch updates

In [96]:
# Load required Libraries
import sys
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import json
import sys
import sciencebasepy as pysb

#Create Function that allows one to implement markdown display when printing
from IPython.display import Markdown, display
def printmd(string):
    """Display *string* in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

In [97]:
# Log in to ScienceBase.
# loginc() asks for the password interactively, so only the username is
# collected here. input() already returns a str, so no conversion is needed.
sb = pysb.SbSession()
username = input("Username:  ")
sb.loginc(username)

# Fail fast if the login did not succeed, instead of discarding the result of
# is_logged_in() and hitting confusing errors in later cells.
if not sb.is_logged_in():
    raise RuntimeError("ScienceBase login failed; check the username and password.")

# Get the ScienceBase login session info (kept as the last expression so the
# notebook displays it).
sb.get_session_info()

Username:  pbrown@usgs.gov
········


{'fullDisplayName': 'Philip J Brown II [pbrown@usgs.gov]',
 'isLoggedIn': True,
 'jossoSessionId': '9D57EB13DE7CC67B1841F36293024014',
 'displayName': 'Philip J Brown II',
 'email': 'pbrown@usgs.gov',
 'username': 'pbrown@usgs.gov'}

In [98]:
# Fetch the ScienceBase item to work on.
# Be sure to use the correct catalog number!!! 5f2b00f782ceae4cb3c09708
#catNum = '5f066e2682ce21d4c3f90237' #Akansas
catNum = '5f2b00f782ceae4cb3c09708' #Loma San Andreas MT
item_json = sb.get_item(catNum)

# Show the item as indented JSON with sorted keys.
#print ("Public Item: \n\t" + str(item_json))
print(json.dumps(item_json, sort_keys=True, indent=2))

{
  "body": "The U.S. Geological Survey (USGS) Geology, Geophysics and Geochemistry Science Center (GGGSC) collaborated with the USGS Science Analytics and Synthesis (SAS) team to preserve and release a subset of magnetotelluric data from the San Andreas Fault in Loma Prieta, California. The San Andreas Fault data were collected by the Branch of Geophysics, a precursor to the now GGGSC, between 1989 and 1994. The magnetotelluric data selected for this preservation project were collected in 1989 using USGS portable truck mounted systems that measure the distribution of electrical conductivity beneath the surface of the earth. Truck mounted systems of this era output data to 3.5\u201d discs, from which data were recovered and transformed to binary or ASCII formats using proprietary software. This USGS data release includes the original binary and ASCII data files and derivative EDI files (Wight, 1988), a common modern format for magnetotelluric modelling, created using Python-based softw

In [102]:
# List the files attached to the ScienceBase item (name and download URL).
# The printed index is what later cells use to choose a file from
# arrayName / arrayDownloadUri.
# Guard against an item whose JSON has no 'files' key so that this cell
# reports the situation instead of raising a bare KeyError.
sb_files = item_json.get('files', [])
arrayName = [sb_file['name'] for sb_file in sb_files]
arrayDownloadUri = [sb_file['downloadUri'] for sb_file in sb_files]

if not sb_files:
    print('No files are attached to this ScienceBase item.')

for i, (name, uri) in enumerate(zip(arrayName, arrayDownloadUri)):
    print('Name Index ' + str(i) + ': ' + name)
    print('URL Index ' + str(i) + ': ' + uri + '\n')
 

Name Index 0: Magnetotelluric-Data-from-the-San-Andreas-Fault_Loma_Prieta-CA_1989_Project-MetadataOld.xml
URL Index 0: https://www.sciencebase.gov/catalog/file/get/5f2b00f782ceae4cb3c09708?f=__disk__b3%2Fa2%2F78%2Fb3a2787091e131ea49fcb6c6268289d689c93f16



In [103]:
#Load the metadata file - be sure to use the correct URL below
#downloadUri = 'https://www.sciencebase.gov/catalog/file/get/5f066e2682ce21d4c3f90237?f=__disk__63%2F64%2F80%2F63648082412369fa65285e35e8487e08f5db696c'#item['downloadUri']
#print (arrayDownloadUri[0])
from urllib.request import urlopen
from xml.etree.ElementTree import parse

#This is how we get the item using a url - we have to use the sb-py command for SB objects that are not yet public
#var_url = urlopen(arrayDownloadUri[0])
#root = parse(var_url)

# Download the XML text through the logged-in session.
# Change the index here to point at a different URL printed above.
SBmetadata = sb.get(arrayDownloadUri[0])

# Remove a leading XML declaration (and any byte-order mark) only when one is
# actually present. Unconditionally dropping the first line would discard
# real content from a file that has no declaration.
SBmetadata = SBmetadata.lstrip('\ufeff').lstrip()
if SBmetadata.startswith('<?xml'):
    SBmetadata = SBmetadata.split('?>', 1)[1].lstrip()
#print (SBmetadata)

# Parse the metadata from the string. Use the ElementTree module imported at
# the top of the notebook as ET; the name `etree` is never imported here.
root = ET.fromstring(SBmetadata)
root


<Element metadata at 0x248fc2b6a08>

In [104]:
# Print the current <abstract> text found under each <descript> element.
# NOTE: `abstract` stays bound to the last element found; the following cell
# assigns to abstract.text and therefore depends on this cell having run.
for descript in root.iter('descript'):
    abstract = descript.find('abstract')
    print(abstract.text)


New Text3


In [105]:
# Replace the abstract text, then print it back to confirm the change.
# The element is looked up here rather than reusing the `abstract` variable
# left over from the previous cell's loop, so this cell still works when
# cells are re-run out of order.
new_abstract_text = 'Slackmaster Felipe Cafe'

abstract = root.find('.//descript/abstract')
if abstract is None:
    raise ValueError('No <descript>/<abstract> element found in the metadata.')
abstract.text = new_abstract_text

# Now check the replacement
for updated_abstract in root.iterfind('.//descript/abstract'):
    print(updated_abstract.text)

Slackmaster Felipe Cafe


In [106]:
#Now Overwrite metadata on ScienceBase
#https://stackoverflow.com/questions/15634580/lxml-etree-element-object-has-no-attribute-write-python
#etree.tostring(root, encoding='utf-8', xml_declaration=True) 
#print(root

In [107]:
# Serialise the element tree back to an XML string.
# Use ET (imported at the top of the notebook); the name `etree` is never
# imported in this notebook.
strXMLout = ET.tostring(root, encoding='unicode')

# tostring(encoding='unicode') emits no XML declaration by default, so
# prepend one by hand. Double quotes around the literal avoid having to
# escape the single quotes inside it.
strXMLout = "<?xml version='1.0' encoding='UTF-8'?>\n" + strXMLout
strXMLout

'<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<metadata>\n  <idinfo>\n    <citation>\n      <citeinfo>\n        <origin>Brian D. Rodriguez</origin>\n        <origin>Michaela R. Johnson</origin>\n        <origin>Ricardo McClees-Funinan</origin>\n        <origin>Shafer Powell</origin>\n        <origin>Kyle D. Enns</origin>\n        <pubdate>2020</pubdate>\n        <title>Magnetotelluric Data from the San Andreas Fault, Loma Prieta CA, 1989</title>\n        <geoform>magnetotelluric data</geoform>\n        <onlink>https://doi.org/10.5066/P9DIJHDK</onlink>\n      </citeinfo>\n    </citation>\n    <descript>\n      <abstract>Slackmaster Felipe Cafe</abstract>\n      <purpose>The magnetotelluric data that were previously locked in an inaccessible format are now readily available as a result of open-source software. The released information could help increase understanding of the San Andreas Fault system, which runs the length of California, as well as fault systems elsewhere. Seismic activity

In [108]:
# Local folder and file name used to stage the edited XML before upload.
WorkingPath = r"C:\CurrentWork\DataReleases\Loma1989\Test"
WorkingFileName = arrayName[0]
# Let os.path.join supply the separator instead of concatenating "\\" by hand.
WorkingPathAndFile = os.path.join(WorkingPath, WorkingFileName)
print('The MT Data Path is: ' + '"' + WorkingPathAndFile + '"')

The MT Data Path is: "C:\CurrentWork\DataReleases\Loma1989\Test\Magnetotelluric-Data-from-the-San-Andreas-Fault_Loma_Prieta-CA_1989_Project-MetadataOld.xml"


In [109]:
#Now write this new string to ScienceBase
#First output string to a file
# Write explicitly as UTF-8 so the bytes on disk match the encoding named in
# the XML declaration; the platform default encoding may differ. The `with`
# block closes the file even if the write fails.
with open(WorkingPathAndFile, "w", encoding="utf-8") as XMLFileFinal:
    XMLFileFinal.write(strXMLout)

# Define item somehow? arrayDownloadUri[0]
#item = sb.get_item(item_id)

#strXMLout = sb.update_item(arrayDownloadUri[0])

In [110]:
# Upload the locally written XML file back to the ScienceBase item.
# NOTE(review): presumably this replaces the item's existing file that has the
# same name as the local file - confirm against the sciencebasepy docs.
sb.replace_file(WorkingPathAndFile, item_json)