### This notebook allows the user to evaluate the structure and content of valid XML metadata using the metadataEvaluation Python module. Documentation concepts can be identified for a broad subset of earth science metadata standards. These concepts allow records to be assessed against many communities' conventions for essential discovery metadata.

#### Now let's create the variables we will use for the functions needed from metadataEvaluation.py. If you change the text, the variable will change; there is no need to rerun the cell. This is helpful for creating data products from each lab.

In [1]:
#modules to support functions
import os

import pandas
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display

import metadataEvaluation

#functions to identify and label an xml collection for Metadata Evaluation
def OrganizationChoices(organization):
    """Record the curating organization label and confirm it on stdout.

    The value is stored in the module-level ``Organization`` variable so
    that later cells can read it.
    """
    global Organization
    Organization = organization
    print("Curating Organization {!s}".format(Organization))
    
def CollectionName(collection):
    """Record the collection label and confirm it on stdout.

    The value is stored in the module-level ``Collection`` variable so
    that later cells can read it.
    """
    global Collection
    Collection = collection
    print("Collection is named {!s}".format(Collection))
    
def DialectName(dialect):
    """Record the metadata dialect label and confirm it on stdout.

    The value is stored in the module-level ``Dialect`` variable so that
    later cells can read it.
    """
    global Dialect
    Dialect = dialect
    print("Collection is written in the {!s} dialect".format(Dialect))
    
def metadataLocation(directory):
    """Record the local metadata directory and confirm it on stdout.

    The value is stored in the module-level ``MetadataLocation`` variable
    so that later cells can read it.
    """
    global MetadataLocation
    MetadataLocation = directory
    print(
        "The local directory {!s} contains the metadata for evaluation".format(
            MetadataLocation
        )
    )
        
# One interactive control per label; each starts from an empty string and
# feeds its current text to the paired function.
w = interactive(OrganizationChoices, organization="")
w2 = interactive(CollectionName, collection="")
w3 = interactive(DialectName, dialect="")
w4 = interactive(metadataLocation, directory="")

# Render the four controls in the same order they were created.
display(w, w2, w3, w4)

### Run this cell for each lab to evaluate the collection and create data products using the variables in the widgets in the previous code cell.

In [None]:
# XMLeval uploads a zip file to the webservice, and reads the return into a dataframe.
# Its inputs are the directory and the three labels set through the widgets in the first code cell.
EvaluatedMetadataDF=metadataEvaluation.XMLeval(MetadataLocation, Organization, Collection, Dialect)
# simpleXPathDataProduct clarifies the xpaths by removing namespaces and in the case of ISO 19115* dialects removes the ISO classes
SimplifiedEvaluatedMetadataDF=metadataEvaluation.simpleXPathDataProduct(EvaluatedMetadataDF, Organization, Collection, Dialect)
# xpathOccurrence creates a pivot table of xpaths, record occurrence counts, occurrence percentages and average occurrences per record
xpathOccurrenceDF=metadataEvaluation.xpathOccurrence(SimplifiedEvaluatedMetadataDF, Organization, Collection, Dialect)
# XpathCounts creates a pivot table with each record a row and xpaths for the columns, with a record count
# NOTE(review): the return value is discarded here; presumably the call is kept for a side effect
# such as writing the data product -- confirm against metadataEvaluation.
metadataEvaluation.XpathCounts(SimplifiedEvaluatedMetadataDF, Organization, Collection, Dialect)
# Final expression of the cell: the occurrence table is what the notebook shows as output.
xpathOccurrenceDF

In [None]:
# Build the per-record XPath count table from the simplified data and keep the result
# so it can be shown as this cell's output.
f=metadataEvaluation.XpathCounts(SimplifiedEvaluatedMetadataDF, Organization, Collection, Dialect)
f

### Combine the data to compare between collections

In [2]:
#Create a dropdown that allows multiple selection using command and click with the mouse 
#or arrow keys to select additional individual data tables, or 
#shift with the mouse to select each choice between the two clicked.

# Collect the path of every evaluated-collection data file under ../data.
# The test is a substring match rather than an extension match, so both
# ``.csv`` and compressed ``.csv.gz`` file names are kept.
EvaluatedMetadata = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk("../data")
    for filename in filenames
    if '.csv' in filename
]

# Widget for selecting multiple collections.
# SelectMultiple rejects a ``value`` entry that is not among ``options``, so
# the sample file is preselected only when the directory walk found it.
_collection_options = sorted(EvaluatedMetadata)
_default_selection = [
    path
    for path in ['../data/Sample/Sample_Sample_Evaluated.csv.gz']
    if path in _collection_options
]
w6 = widgets.SelectMultiple(
    options=_collection_options,
    value=_default_selection,
    description='Evaluated Data',
    disabled=False,
)

# Widget for naming the combined output file.
w5 = widgets.Text(
    value='',
    placeholder='Your file name (no spaces)',
    description='Name your File:',
    disabled=False,
)

# Show the file-name box first, then the collection picker.
display(w5)
display(w6)

Create variables from the widget choices and run functions to combine the selected xpathOccurrence data.

In [None]:
# The selected collection files and the typed file name drive the combine step.
CollectionComparisons = w6.value
DataDestination = os.path.join('../data/Combined', '{!s}.csv'.format(w5.value))
CombinedXPathOccurrenceDF = metadataEvaluation.CombineXPathOccurrence(
    CollectionComparisons,
    DataDestination,
)

Create variables from the widget choices and run functions to combine the selected xpathCounts data.

In [None]:
# Show the file-name box and the collection picker again for the next combine step.
display(w5, w6)

In [None]:
# The selected collection files and the typed file name drive the combine step.
CollectionComparisons = w6.value
DataDestination = os.path.join('../data/Combined', '{!s}.csv'.format(w5.value))
CombinedXPathCountsDF = metadataEvaluation.CombineXPathCounts(
    CollectionComparisons,
    DataDestination,
)
# Final expression of the cell: show the combined counts table.
CombinedXPathCountsDF

Create variables from the widget choices and run functions to combine the selected EvaluatedSimplified data.

In [3]:
# The selected collection files and the typed file name drive the combine step;
# this product is written with a .csv.gz suffix.
CollectionComparisons = w6.value
DataDestination = os.path.join('../data/Combined', '{!s}.csv.gz'.format(w5.value))
CombinedEvaluatedMetadataDF = metadataEvaluation.CombineEvaluatedMetadata(
    CollectionComparisons,
    DataDestination,
)

In [None]:
# Show the file-name box and the collection picker again.
display(w5, w6)

## Now that the data products are created and combined, we can look within a specific lab or across NCAR to discover which records don't contain an element, what content a certain element contains, and whether a collection contains essential metadata, using the following functions.

In [None]:
# Per-record XPath count table for the evaluated collection.
# Only the metadataEvaluation module itself is imported, so XpathCounts has
# to be reached through the module name.
fredDF = metadataEvaluation.XpathCounts(EvaluatedMetadataDF, Organization, Collection, Dialect)

# Keep only the columns of interest. Column labels must be strings; a bare
# XPath name is not defined anywhere in the notebook at this point.
# DataFrame.filter(items=...) skips labels that are not present.
# NOTE(review): "XPath" is assumed to be the intended label -- confirm
# against the columns XpathCounts actually returns.
df2 = fredDF.filter(["XPath", "/characterSet", "/characterSet/@codeList"])
df2

In [None]:
# Rows of the count table whose maintenanceAndUpdateFrequency column is 0,
# i.e. the records that do not contain that element.
doesntHaveDF=fredDF.loc[fredDF['/metadataMaintenance/maintenanceAndUpdateFrequency'] == 0]
doesntHaveDF

In [None]:
# Rows of the evaluated metadata whose XPath is maintenanceAndUpdateFrequency,
# to inspect what was recorded for that element.
# NOTE(review): this XPath carries no namespace prefixes, which looks like the simplified
# form produced by simpleXPathDataProduct -- confirm that EvaluatedMetadataDF (rather than
# SimplifiedEvaluatedMetadataDF) is the table intended here.
content=EvaluatedMetadataDF.loc[EvaluatedMetadataDF['XPath']=='/metadataMaintenance/maintenanceAndUpdateFrequency']
content

In [None]:
# Distinct values of the Concept column, converted to a plain list so they can be
# offered as the choices for the interact control.
ConceptVerticals=EvaluatedMetadataDF.Concept.unique()
Verticals=ConceptVerticals.tolist()

def ConceptVerticalTable(Concept):
    """Select the evaluated-metadata rows belonging to one concept.

    The selection is also stored in the module-level ``VerticalTable`` so
    that later cells can work with the most recent choice.

    Returns the filtered DataFrame.
    """
    global VerticalTable
    matches_concept = EvaluatedMetadataDF["Concept"] == Concept
    VerticalTable = EvaluatedMetadataDF[matches_concept]
    return VerticalTable
# Offer the concept list as choices; the selected concept is passed to ConceptVerticalTable.
interact(ConceptVerticalTable, Concept=Verticals) 

In [None]:
from bokeh.io import show, output_file
from bokeh.plotting import figure

# Count how often each Content value occurs in the selected concept table.
# Colons in the values are replaced with dots before they become axis labels.
data = VerticalTable.Content.str.replace(':', '.')
content_counts = data.value_counts()
content_labels = [str(label) for label in content_counts.index]

# ``Bar`` is never imported in this notebook, so the chart is drawn with the
# imported ``figure`` and its ``vbar`` glyph: one bar per distinct value.
p = figure(x_range=content_labels, title="Vertical Value Occurance Count")
p.vbar(x=content_labels, top=content_counts.tolist(), width=0.9)

show(p)

In [None]:
output_file("bars.html")

# One horizontal bar per distinct Content value of the selected table, with
# the bar length giving the number of rows that carry that value.
# A categorical axis needs a list of string labels, not the DataFrame itself.
content_counts = VerticalTable.Content.dropna().astype(str).value_counts()
content_labels = content_counts.index.tolist()

p = figure(y_range=content_labels, plot_height=250, title="Content Consistency")

# hbar is parameterised by y / height / right (top and width belong to vbar).
p.hbar(y=content_labels, right=content_counts.tolist(), height=0.9)

# The counts run along the x axis, so that is the axis anchored at zero.
p.ygrid.grid_line_color = None
p.x_range.start = 0

show(p)

In [None]:
# Distinct values of the XPath column, converted to a plain list so they can be
# offered as the choices for the interact control.
XPathVerticals=EvaluatedMetadataDF.XPath.unique()
XPathVerticals=XPathVerticals.tolist()
# NOTE(review): bare expression that is not the last statement of the cell, so it does not
# appear to produce any output -- confirm whether it can be removed.
XPathVerticals
def XPathVerticalTable(XPath):
    """Return the evaluated-metadata rows whose XPath equals ``XPath``.

    The selection is also stored in the module-level ``XPathTable`` so that
    later cells can use the most recent choice. It is kept under its own
    name because assigning it to ``XPathVerticalTable`` would replace this
    function with a DataFrame after the first call.
    """
    global XPathTable
    XPathTable = EvaluatedMetadataDF[EvaluatedMetadataDF.XPath == XPath]
    return XPathTable
# Offer the XPath list as choices; the selected XPath is passed to XPathVerticalTable.
interact(XPathVerticalTable, XPath=XPathVerticals) 
