### This notebook allows the user to evaluate the structure and content of valid XML metadata using the metadataEvaluation Python module. 


* Content for a selectable element is viewable

* Consistency across elements is checked

* Occurrence is compared through time

* Records missing a selectable element are listed

In [1]:
# This cell imports modules and defines functions used to set variables and functions 
# that create dataframes of data products that can be saved to a csv

# modules to support functions
import os

import pandas as pd
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display
import metadataEvaluation

#functions to identify and label an xml collection for Metadata Evaluation
def OrganizationChoices(organization):
    """Record the curating organization entered in the widget.

    Stores ``organization`` in the module-level ``Organization`` variable
    and prints the value so the current setting is visible under the widget.
    """
    global Organization
    Organization = organization
    print("Curating Organization", organization)
    
def CollectionName(collection):
    """Record the collection name entered in the widget.

    Stores ``collection`` in the module-level ``Collection`` variable and
    prints the value so the current setting is visible under the widget.
    """
    global Collection
    Collection = collection
    print("Collection is named", collection)
    
def DialectName(dialect):
    """Record the metadata dialect entered in the widget.

    Stores ``dialect`` in the module-level ``Dialect`` variable and prints
    the value so the current setting is visible under the widget.
    """
    global Dialect
    Dialect = dialect
    print("Collection is written in the", dialect, "dialect")
    
def metadataLocation(directory):
    """Record the local metadata directory entered in the widget.

    Stores ``directory`` in the module-level ``MetadataLocation`` variable
    and prints the value so the current setting is visible under the widget.
    """
    global MetadataLocation
    MetadataLocation = directory
    print("The local directory", directory, "contains the metadata for evaluation")
    
    
#Creates a list of all the paths to evaluated collection data.
#Every file under ../data whose name contains '.csv' is offered, which also
#matches the gzipped '.csv.gz' data products.
EvaluatedMetadata=[]
for dirpath, dirnames, filenames in os.walk("../data"):
    for filename in filenames:
        if '.csv' in filename:
            EvaluatedMetadata.append(os.path.join(dirpath, filename))

#Create a dropdown that allows multiple selection using command and click with the mouse 
#or arrow keys to select additional individual data tables, or 
#shift with the mouse to select each choice between the two clicked.
#The sample collection is preselected only when it was actually found above:
#SelectMultiple rejects a value that is not one of its options, which would
#stop the whole cell when the sample file is absent.
w6=widgets.SelectMultiple(
    options=sorted(EvaluatedMetadata),
    value=[path for path in EvaluatedMetadata
           if path == '../data/Sample/Sample_Sample_Evaluated.csv.gz'],
    #rows=10,
    description='Evaluated Data',
    disabled=False
)

#widget for naming the file
w5=widgets.Text(
    value='',
    placeholder='Your file name (no spaces)',
    description='Name your File:',
    disabled=False
)

#Text boxes tied to the setter functions defined above; the empty-string
#defaults make each one a text field, and editing a field calls its function
#so the matching global variable follows the text.
w=interactive(OrganizationChoices, organization='')
w2=interactive(CollectionName, collection='')
w3=interactive(DialectName, dialect='')
w4=interactive(metadataLocation, directory='')

#### Now let's create the variables we will use for the functions needed from metadataEvaluation.py. If you change the text in a widget, the corresponding variable changes immediately, so there is no need to rerun the cell. This is helpful for creating data products from each lab.

In [2]:
# Show the organization, collection, dialect and metadata-directory text boxes,
# in that order.
display(w, w2, w3, w4)

### Run this cell each time you want to evaluate a collection and create data products using the variables set by the widgets in the previous code cell. Remember, you don't need to rerun the previous cell to change the variables.

In [6]:
# This cell reads MetadataLocation, Organization, Collection and Dialect, the
# globals set by the text widgets above.
# XMLeval uploads a zip file to the webservice, and reads the return into a dataframe
EvaluatedMetadataDF=metadataEvaluation.XMLeval(MetadataLocation, Organization, Collection, Dialect)

# NOTE(review): the original comment here described simpleXPathDataProduct
# ("clarifies the xpaths by removing namespaces and in the case of ISO 19115*
# dialects removes the ISO classes"), but the call below is
# fullXPathDataProduct - confirm which data product is intended.
# The result replaces EvaluatedMetadataDF and feeds the two calls that follow.
EvaluatedMetadataDF=metadataEvaluation.fullXPathDataProduct(EvaluatedMetadataDF, Organization, Collection, Dialect)

# xpathOccurrence creates a pivot table of xpaths, record occurrence counts, occurrence percentages and average occurrences per record
xpathOccurrenceDF=metadataEvaluation.xpathOccurrence(EvaluatedMetadataDF, Organization, Collection, Dialect)

# XpathCounts creates a pivot table with each record a row and xpaths for the columns, with a record count
XpathCounts=metadataEvaluation.XpathCounts(EvaluatedMetadataDF, Organization, Collection, Dialect)
# Last expression of the cell: shows the occurrence table as the cell output
xpathOccurrenceDF

Unnamed: 0,XPath,Collection,XPathCount,RecordCount,AverageOccurrencePerRecord,CollectionOccurrence%
0,Number of Records,2018-02_Re3data,2030,2030,1.00,100.00%
1,/r3d:re3data/@xsi:schemaLocation,2018-02_Re3data,2030,2030,1.00,100.00%
2,/r3d:re3data/r3d:repository/r3d:additionalName,2018-02_Re3data,2454,1663,1.21,81.92%
3,/r3d:re3data/r3d:repository/r3d:additionalName...,2018-02_Re3data,1830,1663,0.90,81.92%
4,/r3d:re3data/r3d:repository/r3d:aidSystem,2018-02_Re3data,388,380,0.19,18.72%
5,/r3d:re3data/r3d:repository/r3d:api,2018-02_Re3data,1289,902,0.63,44.43%
6,/r3d:re3data/r3d:repository/r3d:api/@apiType,2018-02_Re3data,1253,903,0.62,44.48%
7,/r3d:re3data/r3d:repository/r3d:certificate,2018-02_Re3data,215,184,0.11,9.06%
8,/r3d:re3data/r3d:repository/r3d:citationGuidel...,2018-02_Re3data,1180,1180,0.58,58.13%
9,/r3d:re3data/r3d:repository/r3d:contentType,2018-02_Re3data,9365,2029,4.61,99.95%


### Combine the data to compare between collections

In [7]:
#Creates a list of all the paths to evaluated collection data.
#Every file under ../data whose name contains '.csv' is offered, which also
#matches the gzipped '.csv.gz' data products.
EvaluatedMetadata=[]
for dirpath, dirnames, filenames in os.walk("../data"):
    for filename in filenames:
        if '.csv' in filename:
            EvaluatedMetadata.append(os.path.join(dirpath, filename))

#widget for selecting multiple collections
#The sample collection is preselected only when it was actually found above:
#SelectMultiple rejects a value that is not one of its options, which would
#stop the whole cell when the sample file is absent.
w6=widgets.SelectMultiple(
    options=sorted(EvaluatedMetadata),
    value=[path for path in EvaluatedMetadata
           if path == '../data/Sample/Sample_Sample_Evaluated.csv.gz'],
    #rows=10,
    description='Evaluated Data',
    disabled=False
)

#widget for naming the file
w5=widgets.Text(
    value='',
    placeholder='Your file name (no spaces)',
    description='Name your File:',
    disabled=False
)

#Show the widgets
# Fill in variables then run the function below to combine the XPathOccurrence csv from each lab
display(w5)
display(w6)

##### Combine XPath Counts csv from each lab for later exploration

In [8]:
# create variables and filepaths from the widget choices:
# w6 holds the selected evaluated-data files, w5 the file name typed by the user
CollectionComparisons=w6.value
# NOTE(review): the "Combine XPath Occurrence" cell builds the same
# '../data/Combined/<name>.csv' path. Presumably both combine functions write
# to DataDestination; if so, one product replaces the other unless the name in
# w5 is changed between the two cells - confirm against metadataEvaluation.
DataDestination=os.path.join('../data/Combined', str(w5.value)+'.csv')
# run the function below to combine the XPathCounts csv from each lab
# (the result is reused by the later cells that build the XPath dropdowns)
CombinedXPathCountsDF=metadataEvaluation.CombineXPathCounts(CollectionComparisons,DataDestination)

##### Combine Evaluated csv from each lab for later exploration

In [10]:
# create variables and filepaths from the widget choices:
# w6 holds the selected evaluated-data files, w5 the file name typed by the user
CollectionComparisons=w6.value
# the combined evaluated metadata gets a '.csv.gz' name, unlike the two
# '.csv' products built in the neighbouring cells
DataDestination=os.path.join('../data/Combined', str(w5.value)+'.csv.gz')
# run the function below to combine the EvaluatedMetadata csv from each lab
# (the result is reused by the later cell that views content at an element)
CombinedEvaluatedMetadataDF=metadataEvaluation.CombineEvaluatedMetadata(CollectionComparisons, DataDestination)

##### Combine XPath Occurrence csv from each lab to create an overview of the labs.

In [9]:
# create variables and filepaths from the widget choices:
# w6 holds the selected evaluated-data files, w5 the file name typed by the user
CollectionComparisons=w6.value
# NOTE(review): the "Combine XPath Counts" cell builds the same
# '../data/Combined/<name>.csv' path. Presumably both combine functions write
# to DataDestination; if so, one product replaces the other unless the name in
# w5 is changed between the two cells - confirm against metadataEvaluation.
DataDestination=os.path.join('../data/Combined', str(w5.value)+'.csv')
# run function to combine selected xpathOccurrence data
CombinedXPathOccurrenceDF=metadataEvaluation.CombineXPathOccurrence(CollectionComparisons,DataDestination)
# Last expression of the cell: shows the combined occurrence table as the cell output
CombinedXPathOccurrenceDF

Unnamed: 0,XPath,2017-11_Re3data,2018-02_Re3data
0,/r3d:re3data/@xsi:schemaLocation,100.00%,100.00%
1,/r3d:re3data/r3d:repository/r3d:additionalName,82.15%,81.92%
2,/r3d:re3data/r3d:repository/r3d:additionalName...,82.15%,81.92%
3,/r3d:re3data/r3d:repository/r3d:aidSystem,18.20%,18.72%
4,/r3d:re3data/r3d:repository/r3d:api,44.49%,44.43%
5,/r3d:re3data/r3d:repository/r3d:api/@apiType,44.54%,44.48%
6,/r3d:re3data/r3d:repository/r3d:certificate,8.85%,9.06%
7,/r3d:re3data/r3d:repository/r3d:citationGuidel...,57.94%,58.13%
8,/r3d:re3data/r3d:repository/r3d:contentType,99.95%,99.95%
9,/r3d:re3data/r3d:repository/r3d:contentType/@c...,99.95%,99.95%


## Now that the data products are created and combined, we can look at Re3data through time using the Exploration Notebook.

Create a dataframe of the records in the collection missing a specific element, selectable by dropdown widget

In [11]:
# Column labels of the combined counts table, minus the first two
# (presumably the 'Collection' and 'Record' identifier columns - confirm),
# become the dropdown choices.
XPathChoices=list(CombinedXPathCountsDF)[2:]
def RecordsMissingXPath(XPathChoice):
    """List the records whose count for the chosen XPath is zero.

    The selection is also stored in the module-level ``xPathChoice``.

    Parameters
    ----------
    XPathChoice : column label in ``CombinedXPathCountsDF`` to inspect.

    Returns
    -------
    DataFrame with the 'Collection' and 'Record' columns of every row of
    ``CombinedXPathCountsDF`` whose value in the chosen column equals 0.0.
    """
    global xPathChoice
    xPathChoice = XPathChoice
    # Boolean mask of rows where the element never occurs
    has_no_occurrence = CombinedXPathCountsDF[XPathChoice] == 0.0
    return CombinedXPathCountsDF.loc[has_no_occurrence, ['Collection', 'Record']]

# interact() renders the dropdown and the resulting table by itself and hands
# back the wrapped function, so no separate display() call is needed: it would
# only print the function's repr under the widget.
w7=interact(RecordsMissingXPath, XPathChoice=XPathChoices)

<function __main__.RecordsMissingXPath>

Using the function below, a dropdown selects the element whose content you want to look at.

In [14]:
# Column labels of the combined counts table, minus the first two
# (presumably the 'Collection' and 'Record' identifier columns - confirm),
# become the dropdown choices.
XPathChoices=list(CombinedXPathCountsDF)[2:]
def ContentAtElement(XPathChoice):
    """Count how often each content value occurs at the chosen XPath.

    Rows of ``CombinedEvaluatedMetadataDF`` whose ``XPath`` equals
    ``XPathChoice`` are reduced to the 'Collection', 'Record' and 'Content'
    columns and kept in the module-level ``CombinedContentAtElementDF``.

    Parameters
    ----------
    XPathChoice : XPath value to look up in the ``XPath`` column.

    Returns
    -------
    Series indexed by content value holding the number of matching rows,
    ordered from most to least frequent.
    """
    global CombinedContentAtElementDF
    at_element = CombinedEvaluatedMetadataDF.loc[CombinedEvaluatedMetadataDF.XPath == XPathChoice]
    CombinedContentAtElementDF = at_element[['Collection', 'Record', 'Content']]
    # Keyword form on purpose: pandas 2.x no longer accepts positional
    # arguments to sort_values.
    return CombinedContentAtElementDF.groupby('Content').size().sort_values(ascending=False)

# Offer XPathChoices in a dropdown; each selection calls ContentAtElement with
# the chosen XPath. As the last expression of the cell, the value handed back
# by interact() is also echoed as the cell output.
interact(ContentAtElement, XPathChoice=XPathChoices)

<function __main__.ContentAtElement>