### Notebook Goals
* evaluate metadata for concepts and xpaths
* Create data about the collection's concepts and xpaths
* create collection reports on data in Excel and Google Sheets


In [21]:
import os
import pandas as pd
from ipywidgets import *
from IPython.display import display
import MDeval as md

First we need to set some variables that identify where the metadata is and create a place for the resulting analysis and reports.

In [22]:
Organization = 'BCO-DMO'
Collection = 'GeoTraces'
Dialect = 'ISO'
MetadataLocation = (
    './metadata/' + Organization + '/' +
    Collection + '/' + Dialect + '/xml'
)

os.makedirs('./data/' + Organization, exist_ok=True)
os.makedirs('./reports/' + Organization, exist_ok=True)

#### Evaluate metadata for element content and concept content 
* Upload metadata to Metadata Evaluation Web Service
* Read returned element content of records into a table
* Read returned concept content of records into a table

In [23]:
md.XMLeval(MetadataLocation, Organization, Collection, Dialect)

Metadata evaluated. Results in the "./data/BCO-DMO" directory.


In [24]:
# assign csv filepath of concept results to a variable
ConceptEvaluatedCSV = os.path.join(
        './data/', Organization, Collection +
        '_' + Dialect + "_ConceptEvaluated.csv")

# read csv into pandas dataframe
ConceptDF = pd.read_csv(ConceptEvaluatedCSV, quotechar='"')

# show dataframe
ConceptDF

Unnamed: 0,Collection,Dialect,Record,Concept,XPath,Content
0,GeoTraces,ISO,dataset_3484.xml,Resource Title,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Cruise Tracks from the U.S. GEOTRACES N...
1,GeoTraces,ISO,dataset_3484.xml,Resource Creation/Revision Date,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,2012-03-07
2,GeoTraces,ISO,dataset_3484.xml,Abstract,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Cruise tracks from Athena data - 1 minu...
3,GeoTraces,ISO,dataset_3484.xml,Topic Category,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,oceans
4,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,date_gmt
5,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,time_gmt
6,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,longitude
7,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,latitude
8,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,cruise_id
9,GeoTraces,ISO,dataset_3484.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,Global Positioning System Receiver


In [25]:
# assign csv filepath of concept results to a variable
ElementEvaluatedCSV = os.path.join(
        './data/', Organization, Collection +
        '_' + Dialect + "_ElementEvaluated.csv")

# read csv into pandas dataframe
ElementDF = pd.read_csv(ElementEvaluatedCSV, quotechar='"')

# show dataframe
ElementDF

Unnamed: 0,Collection,Record,XPath,Content
0,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/@xsi:schemaLocation,http://www.isotc211.org/2005/gmi http://www.ng...
1,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:fileIdentifier/gco:Charac...,http://lod.bco-dmo.org/id/dataset/3484
2,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:language/gco:CharacterString,eng; USA
3,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,utf8
4,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,http://www.isotc211.org/2005/resources/Codelis...
5,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,utf8
6,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,dataset
7,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,http://www.isotc211.org/2005/resources/Codelis...
8,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,dataset
9,GeoTraces,dataset_3484.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,005


In [26]:
# import recTags
RecommendationsDF = pd.read_csv('./RecTag.csv')
# select recommendation

RecommendationChoices = RecommendationsDF['Recommendation'].tolist()
RecommendationChoices = [x for x in RecommendationChoices if str(x) != 'nan']

def RecChoices(Rec):
    global Recommendation
    Recommendation = (RecommendationsDF[RecommendationsDF['Recommendation'] == Rec]).values.tolist()[0]
    Recommendation = [x for x in Recommendation if str(x) != 'nan']
    del Recommendation[0]
    return Recommendation

# recommendation selector dropdown    
w=interactive(RecChoices, Rec=RecommendationChoices) 
#get list to use
display(w)

In [27]:
Recommendation

['Keyword',
 'Resource Title',
 'Abstract',
 'Resource Format',
 'Metadata Identifier',
 'Metadata Modified Date',
 'Resource Type',
 'Bounding Box',
 'Coordinate Reference System (CRS)',
 'Association',
 'Resource Title',
 'Author / Originator',
 'Keyword',
 'Abstract',
 'Publisher',
 'Contributor Name',
 'Metadata Modified Date',
 'Resource Type',
 'Resource Format',
 'Metadata Identifier',
 'Source Citation',
 'Metadata Language',
 'Related Resource Citation',
 'Bounding Box',
 'Rights',
 'Resource Revision Date',
 'Resource Creation/Revision Date',
 'Publication Date',
 'Organization Name',
 'Security Constraints',
 'Metadata Language',
 'Resource Identifier',
 'Parent Identifier',
 'Keyword Type']

In [28]:
RecommendationConceptsDF = (
    ConceptDF.loc[ConceptDF['Concept'].isin(
        Recommendation
    )])
RecommendationConceptsDF

Unnamed: 0,Collection,Dialect,Record,Concept,XPath,Content
0,GeoTraces,ISO,dataset_3484.xml,Resource Title,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Cruise Tracks from the U.S. GEOTRACES N...
1,GeoTraces,ISO,dataset_3484.xml,Resource Creation/Revision Date,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,2012-03-07
2,GeoTraces,ISO,dataset_3484.xml,Abstract,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Cruise tracks from Athena data - 1 minu...
25,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,Biological and Chemical Oceanography Data Mana...
26,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,Woods Hole Oceanographic Institution
27,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,Woods Hole Oceanographic Institution
28,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:contentInfo/gmd:MD_Featur...,Biological and Chemical Oceanography Data Mana...
29,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:distributionInfo/gmd:MD_D...,Biological and Chemical Oceanography Data Mana...
30,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmd:metadataMaintenance/gmd:M...,Biological and Chemical Oceanography Data Mana...
31,GeoTraces,ISO,dataset_3484.xml,Organization Name,/gmi:MI_Metadata/gmi:acquisitionInformation/gm...,Woods Hole Oceanographic Institution


In [29]:
RecommendationConceptEvaluatedCSV = (
    './data/' + Organization + '/' + Collection +
    '_' + Dialect + '_RecommendationEvaluated.csv'
)

RecommendationConceptsDF.to_csv(
    RecommendationConceptEvaluatedCSV, index=False
)

Create a Google Sheets report on the collection containing the occurrence, counts, and content of Schema.org concepts and absolute content of the elements and attributes in the records
* Concept Occurrence Analysis
* Concept Counts Analysis
* Xpath Occurrence Analysis
* Xpath Counts Analysis
* Collection Spreadsheet

In [30]:
# create concept occurrence

ConceptOccurrenceCSV = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptOccurrence.csv'
)

ConceptOccurrenceDF = md.conceptOccurrence(
    RecommendationConceptsDF, Organization,
    Collection, Dialect, ConceptOccurrenceCSV
)
ConceptOccurrenceDF = pd.read_csv(ConceptOccurrenceCSV, index_col=0)
# change order of rows to be meaningful for recommendation
ConceptOccurrenceDF = ConceptOccurrenceDF.reindex(
    ['Number of Records'] + Recommendation
)
''' fill blank spaces with the collection columns value of the first concep
'''

values = {
    'Collection': Organization+'_'+Collection, 'ConceptCount': 0, 'RecordCount': 0,
    'AverageOccurrencePerRecord': 0.00, 'CollectionOccurrence%': 0.00
}

ConceptOccurrenceDF = ConceptOccurrenceDF.fillna(value=values)
ConceptOccurrenceDF.to_csv(ConceptOccurrenceCSV, mode='w')




In [31]:
ConceptCountsCSV = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptCounts.csv'
)
occurrenceMatrix = md.conceptCounts(
    RecommendationConceptsDF, Organization,
    Collection, Dialect, ConceptCountsCSV
)
# order columns to reflect recommendation order

occurrenceMatrix = (occurrenceMatrix[
    ['Collection', 'Record'] + Recommendation])
occurrenceMatrix.to_csv(ConceptCountsCSV, mode='w', index=False)

TypeError: concat() got an unexpected keyword argument 'sort'

In [None]:
XpathOccurrenceCSV = (
    './data/' + Organization + '/' + Collection +
    '_' + Dialect + '_ElementOccurrence.csv'
)

md.xpathOccurrence(
    ElementDF, Organization,
    Collection, Dialect, XpathOccurrenceCSV
)

In [None]:
XpathCountsCSV = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ElementCounts.csv'
)

md.XpathCounts(
    ElementDF, Organization,
    Collection, Dialect, XpathCountsCSV
)

In [None]:

ReportLocation = (
    './reports/' + Organization + '/' + Organization +
    '_' + Collection + '_' + Dialect + '_Report.xlsx'
)

md.collectionSpreadsheet(
    Organization, Collection, Dialect,
    RecommendationConceptEvaluatedCSV, ElementEvaluatedCSV,
    XpathOccurrenceCSV, XpathCountsCSV,
    ConceptOccurrenceCSV, ConceptCountsCSV, ReportLocation
)

In [None]:
md.WriteGoogleSheets(ReportLocation)


[Next Notebook: Create JSON-LD for Datasets Using the schema.org Vocabulary and Test the Results](./Create%20JSON-LD%20for%20Datasets%20Using%20the%20schema.org%20Vocabulary%20and%20Test%20the%20Results.ipynb)