### This notebook allows the user to evaluate the structure and content of valid XML metadata using the metadataEvaluation Python module. 

The NCAR essential ISO elements are used to determine if the records contain sufficient metadata for NCAR documentation needs. 

Content for a selectable element is viewable

Consistency across elements is measured and graphed

Occurrence is measured across the lab collections in DASH

Records missing a selectable element are listed

In [24]:
# This cell imports modules and defines the functions and widgets used to set the variables
# for the functions that create dataframes of data products, which can be saved to a csv

# modules to support functions
import os

import pandas as pd
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display
import metadataEvaluation

from bokeh.charts import output_notebook, show, Bar
from bokeh.plotting import figure
from bokeh.models import Range1d, HoverTool, ResizeTool
from bokeh.charts import defaults
from bokeh.models import Legend

#defaults.width = 900
#defaults.height = 500
#output_notebook()

#functions to identify and label an xml collection for Metadata Evaluation
def OrganizationChoices(organization):
    """Store the curating organization in the module-level ``Organization`` and echo it.

    organization: the value to remember (supplied by the text widget bound to this callback).
    Returns None; the value is published through the global so later cells can read it.
    """
    globals()["Organization"] = organization
    print("Curating Organization", organization)
    
def CollectionName(collection):
    """Store the collection label in the module-level ``Collection`` and echo it.

    collection: the value to remember (supplied by the text widget bound to this callback).
    Returns None; the value is published through the global so later cells can read it.
    """
    globals()["Collection"] = collection
    print("Collection is named", collection)
    
def DialectName(dialect):
    """Store the metadata dialect in the module-level ``Dialect`` and echo it.

    dialect: the value to remember (supplied by the text widget bound to this callback).
    Returns None; the value is published through the global so later cells can read it.
    """
    globals()["Dialect"] = dialect
    print("Collection is written in the", dialect, "dialect")
    
def metadataLocation(directory):
    """Store the metadata directory in the module-level ``MetadataLocation`` and echo it.

    directory: the value to remember (supplied by the text widget bound to this callback).
    Returns None; the value is published through the global so later cells can read it.
    """
    globals()["MetadataLocation"] = directory
    print("The local directory", directory, "contains the metadata for evaluation")
    
    
#Create a dropdown that allows multiple selection using command and click with the mouse 
#or arrow keys to select additional individual data tables, or 
#shift with the mouse to select each choice between the two clicked.

#Creates a list of all the paths to evaluated collection data.
# Walks ../data recursively and keeps every file whose name contains '.csv'
# (this also matches compressed files such as *.csv.gz).
# `os` is imported explicitly at the top of this cell rather than being picked
# up as a side effect of `from ipywidgets import *`.
EvaluatedMetadata=[
    os.path.join(dirpath, filename)
    for dirpath, dirnames, filenames in os.walk("../data")
    for filename in filenames
    if '.csv' in filename
]

#widget for selecting multiple collections
# Preselect the sample data only when it was actually found under ../data:
# SelectMultiple rejects a `value` entry that is not one of its `options`,
# which would otherwise abort this cell when the sample file is absent.
_sampleEvaluated='../data/Sample/Sample_Sample_Evaluated.csv.gz'
w6=widgets.SelectMultiple(
    options=sorted(EvaluatedMetadata),
    value=[_sampleEvaluated] if _sampleEvaluated in EvaluatedMetadata else [],
    #rows=10,
    description='Evaluated Data',
    disabled=False
)
#w6 is only created here; it is shown by a display() call in a later cell

#Text box in which the user types the name of the combined output file
w5=widgets.Text(
    description='Name your File:',
    placeholder='Your file name (no spaces)',
    value='',
    disabled=False,
)
        
# Bind each setter function to a text box (an empty-string default makes ipywidgets
# build a text input); editing a box re-runs its setter with the new text.
w, w2, w3, w4 = (
    interactive(OrganizationChoices, organization=''),
    interactive(CollectionName, collection=''),
    interactive(DialectName, dialect=''),
    interactive(metadataLocation, directory=''),
)

#### Now let's create the variables we will use for the functions needed from metadataEvaluation.py. If you change the text, the variable changes too, so there is no need to rerun the cell. This is helpful for creating data products from each lab.

In [2]:
# Show the four text inputs (organization, collection, dialect, metadata directory) in order
display(w, w2, w3, w4)

### Run this cell for each lab to evaluate the collection and create data products using the variables in the widgets in the previous code cell. Remember you don't need to rerun the previous cell to change variables.

In [3]:
# Evaluate one collection and build its data products.
# MetadataLocation, Organization, Collection and Dialect are module-level globals
# written by the widget callbacks (metadataLocation, OrganizationChoices,
# CollectionName, DialectName) defined in the first code cell.
# The per-function descriptions below summarize metadataEvaluation, whose code is not shown here.

# XMLeval uploads a zip file to the webservice, and reads the return into a dataframe
EvaluatedMetadataDF=metadataEvaluation.XMLeval(MetadataLocation, Organization, Collection, Dialect)

# simpleXPathDataProduct clarifies the xpaths by removing namespaces and in the case of ISO 19115* dialects removes the ISO classes
SimplifiedEvaluatedMetadataDF=metadataEvaluation.simpleXPathDataProduct(EvaluatedMetadataDF, Organization, Collection, Dialect)

# xpathOccurrence creates a pivot table of xpaths, record occurrence counts, occurrence percentages and average occurrences per record
xpathOccurrenceDF=metadataEvaluation.xpathOccurrence(SimplifiedEvaluatedMetadataDF, Organization, Collection, Dialect)

# XpathCounts creates a pivot table with each record a row and xpaths for the columns, with a record count
XpathCounts=metadataEvaluation.XpathCounts(SimplifiedEvaluatedMetadataDF, Organization, Collection, Dialect)
# Last expression of the cell: render the occurrence table as the cell output
xpathOccurrenceDF

Unnamed: 0,XPath,Collection,XPathCount,RecordCount,AverageOccurrencePerRecord,CollectionOccurrence%
0,Number of Records,CGD_ISO,5,5,1.00,100.00%
1,/characterSet/@nilReason,CGD_ISO,5,5,1.00,100.00%
2,/contact,CGD_ISO,10,5,2.00,100.00%
3,/contact/contactInfo,CGD_ISO,5,5,1.00,100.00%
4,/contact/contactInfo/address/electronicMailAdd...,CGD_ISO,10,5,2.00,100.00%
5,/contact/individualName,CGD_ISO,24,5,4.80,100.00%
6,/contact/individualName/@nilReason,CGD_ISO,1,1,0.20,20.00%
7,/contact/organisationName,CGD_ISO,15,5,3.00,100.00%
8,/contact/positionName,CGD_ISO,5,5,1.00,100.00%
9,/contact/positionName/@nilReason,CGD_ISO,5,5,1.00,100.00%


### Combine the data to compare between collections

In [4]:
# Choose the output file name (w5) and the csv files to combine (w6),
# then run one of the combine cells below
display(w5, w6)

##### Combine XPath Counts csv from each lab for later exploration

In [5]:
# Read the current widget state: w5 supplies the output file name, w6 the selected csv paths
DataDestination=os.path.join('../data/Combined', '{}.csv'.format(w5.value))
CollectionComparisons=w6.value
# Combine the selected XPathCounts csv files (one per lab);
# DataDestination is handed over as the destination path
CombinedXPathCountsDF=metadataEvaluation.CombineXPathCounts(CollectionComparisons,DataDestination)

##### Combine Evaluated csv from each lab for later exploration

In [6]:
# Read the current widget state: w5 supplies the output file name, w6 the selected csv paths
DataDestination=os.path.join('../data/Combined', '{}.csv.gz'.format(w5.value))
CollectionComparisons=w6.value
# Combine the selected EvaluatedMetadata csv files (one per lab);
# DataDestination is handed over as the destination path
CombinedEvaluatedMetadataDF=metadataEvaluation.CombineEvaluatedMetadata(CollectionComparisons, DataDestination)

##### Combine XPath Occurrence csv from each lab to create an overview of the labs.

In [7]:
# Read the current widget state: w5 supplies the output file name, w6 the selected csv paths
DataDestination=os.path.join('../data/Combined', '{}.csv'.format(w5.value))
CollectionComparisons=w6.value
# Combine the selected xpathOccurrence csv files (one per lab);
# DataDestination is handed over as the destination path
CombinedXPathOccurrenceDF=metadataEvaluation.CombineXPathOccurrence(CollectionComparisons,DataDestination)
# Last expression of the cell: render the combined table
CombinedXPathOccurrenceDF

Unnamed: 0,XPath,ACOM_ISO,CGD_ISO,CISL_ISO,EOL_ISO,HAO_ISO,Library_ISO,MMM_ISO,OpenSky_ISO,RDA_ISO,Unidata_ISO
0,/@schemaLocation,0,0,0,100.00%,0,0,0,0,100.00%,0
1,/characterSet,0,0,0,100.00%,0,100.00%,0,0,100.00%,100.00%
2,/characterSet/@codeList,0,0,0,100.00%,0,100.00%,0,100.00%,100.00%,100.00%
3,/characterSet/@codeListValue,0,0,0,100.00%,0,100.00%,0,100.00%,100.00%,100.00%
4,/characterSet/@nilReason,100.00%,100.00%,100.00%,0,100.00%,0,100.00%,0,0,0
5,/contact,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%
6,/contact/contactInfo,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%
7,/contact/contactInfo/address/administrativeArea,0,0,0,100.00%,0,0,0,100.00%,100.00%,0
8,/contact/contactInfo/address/city,0,0,0,100.00%,0,0,0,100.00%,100.00%,0
9,/contact/contactInfo/address/country,0,0,0,0,0,0,0,0,100.00%,0


## Now that the data products are created and combined, we can look for a specific lab or across NCAR to discover which records don't contain an element, what content a certain element contains, and if a collection contains essential metadata using the following functions.

Create a dataframe of just the essential elements in the NCAR recommendation for each record in DASH

In [8]:
# Columns of the NCAR essential ISO recommendation: the two identifying columns
# ('Collection', 'Record') followed by the XPath of each essential element.
# Every XPath appears exactly once. Selecting with a repeated label produces
# duplicate DataFrame columns, and a later single-column lookup such as
# NCARessentialXPathCountsDF[xpath] == 0 then yields a DataFrame instead of a
# Series and can no longer be used as a row filter.
NCARessentialRecommendation=[
    'Collection',
    'Record',
    '/fileIdentifier',
    '/language/LanguageCode',
    '/language/LanguageCode/@codeList',
    '/language/LanguageCode/@codeListValue',
    '/characterSet/@nilReason',
    '/parentIdentifier/@nilReason',
    '/hierarchyLevel',
    '/hierarchyLevel/@codeList',
    '/hierarchyLevel/@codeListValue',
    '/hierarchyLevelName/@nilReason',
    '/contact/individualName',
    '/contact/organisationName',
    '/contact/positionName',
    '/contact/contactInfo/phone/voice',
    '/contact/contactInfo/address/electronicMailAddress',
    '/contact/role',
    '/contact/role/@codeList',
    '/contact/role/@codeListValue',
    '/dateStamp/DateTime',
    '/metadataStandardName',
    '/metadataStandardVersion',
    '/dataSetURI',
    '/identificationInfo/citation/title',
    '/identificationInfo/citation/date/date/Date',
    '/identificationInfo/citation/date/dateType',
    '/identificationInfo/citation/date/dateType/@codeList',
    '/identificationInfo/citation/date/dateType/@codeListValue',
    '/identificationInfo/citation/citedResponsibleParty/individualName',
    '/identificationInfo/citation/citedResponsibleParty/organisationName',
    '/identificationInfo/citation/citedResponsibleParty/positionName',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/phone/voice',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/electronicMailAddress',
    '/identificationInfo/citation/citedResponsibleParty/role',
    '/identificationInfo/citation/citedResponsibleParty/role/@codeList',
    '/identificationInfo/citation/citedResponsibleParty/role/@codeListValue',
    '/identificationInfo/citation/citedResponsibleParty/individualName/@nilReason',
    '/identificationInfo/citation/citedResponsibleParty/positionName/@nilReason',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/deliveryPoint',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/city',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/administrativeArea',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/postalCode',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/address/country',
    '/identificationInfo/citation/citedResponsibleParty/contactInfo/onlineResource/linkage/URL',
    '/identificationInfo/abstract',
    '/identificationInfo/pointOfContact/individualName',
    '/identificationInfo/pointOfContact/organisationName',
    '/identificationInfo/pointOfContact/positionName',
    '/identificationInfo/pointOfContact/contactInfo/phone/voice',
    '/identificationInfo/pointOfContact/contactInfo/address/electronicMailAddress',
    '/identificationInfo/pointOfContact/role',
    '/identificationInfo/pointOfContact/role/@codeList',
    '/identificationInfo/pointOfContact/role/@codeListValue',
    '/identificationInfo/descriptiveKeywords/keyword',
    '/identificationInfo/descriptiveKeywords/type',
    '/identificationInfo/descriptiveKeywords/type/@codeList',
    '/identificationInfo/descriptiveKeywords/type/@codeListValue',
    '/identificationInfo/descriptiveKeywords/thesaurusName/title',
    '/identificationInfo/descriptiveKeywords/thesaurusName/date/date/Date',
    '/identificationInfo/descriptiveKeywords/thesaurusName/date/dateType',
    '/identificationInfo/descriptiveKeywords/thesaurusName/date/dateType/@codeList',
    '/identificationInfo/descriptiveKeywords/thesaurusName/date/dateType/@codeListValue',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/individualName/@nilReason',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/organisationName',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/positionName/@nilReason',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/contactInfo/address/electronicMailAddress',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/contactInfo/onlineResource/linkage/URL',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/role',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/role/@codeList',
    '/identificationInfo/descriptiveKeywords/thesaurusName/citedResponsibleParty/role/@codeListValue',
    '/identificationInfo/resourceConstraints/useLimitation',
    '/identificationInfo/resourceConstraints/otherConstraints',
    '/identificationInfo/language/LanguageCode',
    '/identificationInfo/language/LanguageCode/@codeList',
    '/identificationInfo/language/LanguageCode/@codeListValue',
    '/identificationInfo/characterSet/@nilReason',
    '/identificationInfo/topicCategory',
    '/identificationInfo/extent/description/@nilReason',
    '/identificationInfo/extent/geographicElement/extentTypeCode/Boolean',
    '/identificationInfo/extent/geographicElement/westBoundLongitude/Decimal',
    '/identificationInfo/extent/geographicElement/eastBoundLongitude/Decimal',
    '/identificationInfo/extent/geographicElement/southBoundLatitude/Decimal',
    '/identificationInfo/extent/geographicElement/northBoundLatitude/Decimal',
    '/identificationInfo/extent/temporalElement/@nilReason',
    '/identificationInfo/extent/verticalElement/@nilReason',
]
# Keep only the identifying columns and the essential elements, one row per record
NCARessentialXPathCountsDF=CombinedXPathCountsDF[NCARessentialRecommendation]

NCARessentialXPathCountsDF

Unnamed: 0,Collection,Record,/fileIdentifier,/language/LanguageCode,/language/LanguageCode/@codeList,/language/LanguageCode/@codeListValue,/characterSet/@nilReason,/parentIdentifier/@nilReason,/hierarchyLevel,/hierarchyLevel/@codeList,...,/identificationInfo/characterSet/@nilReason,/identificationInfo/topicCategory,/identificationInfo/extent/description/@nilReason,/identificationInfo/extent/geographicElement/extentTypeCode/Boolean,/identificationInfo/extent/geographicElement/westBoundLongitude/Decimal,/identificationInfo/extent/geographicElement/eastBoundLongitude/Decimal,/identificationInfo/extent/geographicElement/southBoundLatitude/Decimal,/identificationInfo/extent/geographicElement/northBoundLatitude/Decimal,/identificationInfo/extent/temporalElement/@nilReason,/identificationInfo/extent/verticalElement/@nilReason
0,ACOM_ISO,FINN.xml,2,1,1,1,1,1,1,1,...,1,0,0,0,1,1,1,1,0,1
1,ACOM_ISO,FTIR%20time%20series%20of%20tropospheric%20and...,2,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,1
2,ACOM_ISO,MOPITT.xml,2,1,1,1,1,1,1,1,...,1,0,0,0,1,1,1,1,0,1
3,ACOM_ISO,MOZART.xml,2,1,1,1,1,1,1,1,...,1,0,0,0,1,1,1,1,0,1
4,ACOM_ISO,WRF-Chem%20Tools.xml,2,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
5,CGD_ISO,cesm.cgd.ncar.ucar.edu_b.e11.B20TRLENS.f09_g16...,2,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,0,1
6,CGD_ISO,climatedataguide.ucar.edu__node.652.xml,2,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,0,1
7,CGD_ISO,climatedataguide.ucar.edu__node.658_NAM.xml,2,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,0,1
8,CGD_ISO,climatedataguide.ucar.edu__node.660_NP.xml,2,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,0,1
9,CGD_ISO,climatedataguide.ucar.edu__node.740_NAOpc.xml,2,1,1,1,1,1,1,1,...,1,1,0,1,1,1,1,1,0,1


Create a dataframe of the records in the collection missing a specific element, selectable by dropdown widget

In [23]:
# Dropdown options: every column label after the first two (Collection and Record)
XPathChoices=list(NCARessentialXPathCountsDF.columns)[2:]
def RecordsMissingXPath(XPathChoice):
    """List the records whose count for the chosen XPath is zero.

    XPathChoice: column label of NCARessentialXPathCountsDF to inspect.
    Returns a dataframe with the 'Collection' and 'Record' columns of every row
    whose value in that column equals 0.
    Side effect: stores the choice in the module-level ``xPathChoice``.
    """
    global xPathChoice
    xPathChoice=XPathChoice
    counts=NCARessentialXPathCountsDF[xPathChoice]
    # A label that occurs more than once among the columns selects a DataFrame,
    # which cannot serve as a row mask; the copies share the label, so the
    # first one is used.
    if isinstance(counts, pd.DataFrame):
        counts=counts.iloc[:, 0]
    doesntHaveDF=NCARessentialXPathCountsDF.loc[counts == 0.0, ['Collection','Record']]
    return doesntHaveDF

# interact() renders the dropdown and the function's result by itself and returns
# the wrapped function rather than a widget, so there is nothing further to
# display (displaying w7 would only print the function's repr).
w7=interact(RecordsMissingXPath, XPathChoice=XPathChoices)

<function __main__.RecordsMissingXPath>

The function below provides a dropdown for selecting an element and shows the content found at that element, with a count of how often each value occurs.

In [1]:
# Dropdown options: every column label of the combined counts after the first two
XPathChoices=list(CombinedXPathCountsDF.columns)[2:]
def ContentAtElement(XPathChoice):
    """Count how often each content value occurs at the chosen XPath.

    XPathChoice: value matched against the XPath column of CombinedEvaluatedMetadataDF.
    Returns a Series indexed by Content value holding the number of matching rows.
    Side effect: stores the matching rows (columns Collection, Record, Content)
    in the module-level ``CombinedContentAtElementDF``.
    """
    global CombinedContentAtElementDF
    atElement=CombinedEvaluatedMetadataDF.XPath==XPathChoice
    CombinedContentAtElementDF=CombinedEvaluatedMetadataDF.loc[atElement, ['Collection','Record','Content']]
    # No string munging of Content here: the result was never used, and the
    # .str accessor raises when the Content column is not string-typed.
    return CombinedContentAtElementDF.groupby('Content').size()

# interact() draws the dropdown and the result itself; the trailing semicolon keeps
# the notebook from also echoing the returned function object as cell output.
interact(ContentAtElement, XPathChoice=XPathChoices);

NameError: name 'CombinedXPathCountsDF' is not defined