### Notebook Goals
* Upload metadata from repo/website (10) (Sean)
* Evaluate for concepts and XPaths (10) (Sean)
* Create collection spreadsheet (10) (Sean)
* Create Google Sheet (10) (Sean)


In [1]:
import os
import pandas as pd
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display
import MDeval as md

#### Choose some Metadata
* A note on schema conformance: the transform identifies the dialect from the default or explicit schema location. This means that if I declare I'm using EML but do not use the schema location the dialect was built from, the conceptual content of the record will not be recognized unless the metadata is cleaned first (see the next item).
* Cleaning metadata (skip for first-time use): you decide whether to conform to the schema namespaces. (Perhaps this should be repurposed to use wildcard namespace prefixes so that any local name is accepted?)
* Upload your metadata to the server using a metadataEvaluation function 
* Choose metadata from the metadata directory

In [2]:
CollectionName = "MetadataAnalysis"
# Local folder that holds the downloaded records for this collection.
collection_dir = './metadata/' + CollectionName
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create gap of a separate os.path.exists() test
# (same form as the os.makedirs calls used later in this notebook).
os.makedirs(collection_dir, exist_ok=True)

# Source URLs and the local file each record is saved to.
# NOTE(review): the two lists are presumably paired by position inside
# md.get_records - keep them the same length and order; confirm in MDeval.
urls = [
    "https://data.datacite.org/application/vnd.datacite.datacite+xml/10.1016/j.ecoinf.2017.09.005"
]
xml_files = [
    collection_dir + '/10.1016.j.ecoinf.2017.09.005.xml'
]
md.get_records(urls, xml_files, well_formed=False)

#### Evaluate metadata for element content and Schema.org concepts using Metadata Evaluation Webservice
* Request element content of records
* Request concept content of records
* take concept results, filter for Schema.org concepts

First we need to set some variables that identify where the metadata is and create a place for the resulting analysis and reports.

In [3]:
# Identify the collection being analysed; these values are used to build
# the input and output paths in the cells that follow.
Organization = 'BCO-DMO'
Collection = 'GeoTraces'
Dialect = 'ISO'
# Directory holding the XML records to evaluate.
MetadataLocation = (
    './metadata/' + Organization + '/' +
    Collection + '/' + Dialect + '/xml'
)

# Create the input directory and the output directories for data and reports.
# The metadata directory is created from MetadataLocation itself: the earlier
# path ('./metadata/' + Organization + Collection) had no separator between
# its parts, so it produced a stray folder and never created MetadataLocation.
os.makedirs(MetadataLocation, exist_ok=True)
os.makedirs('./data/' + Organization, exist_ok=True)
os.makedirs('./reports/' + Organization, exist_ok=True)

In [4]:
# Evaluate the metadata found at MetadataLocation. The recorded run of this
# cell reports its results under the "./data/<Organization>" directory.
md.XMLeval(
    MetadataLocation,
    Organization, Collection, Dialect
)

Metadata evaluated. Results in the "./data/BCO-DMO" directory.


In [5]:
# Load the concept-level evaluation results for this collection/dialect.
ConceptDF = pd.read_csv(
    os.path.join(
        './data/', Organization,
        '{}_{}_ConceptEvaluated.csv'.format(Collection, Dialect)
    ),
    quotechar='"'
)
# Last expression of the cell, so the table is displayed.
ConceptDF

Unnamed: 0,Collection,Dialect,Record,Concept,XPath,Content
0,GeoTraces,ISO,dataset_3470.xml,Resource Title,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Nanomolar Nutrients - Surface from the ...
1,GeoTraces,ISO,dataset_3470.xml,Resource Creation/Revision Date,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,2013-02-27
2,GeoTraces,ISO,dataset_3470.xml,Abstract,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,"Nanomolar concentrations of PO4, NO3, NO2 (sur..."
3,GeoTraces,ISO,dataset_3470.xml,Topic Category,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,oceans
4,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,cruise_id
5,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,date
6,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,time
7,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,latitude
8,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,longitude
9,GeoTraces,ISO,dataset_3470.xml,Theme Keyword,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,sample


In [6]:
# Path of the element-level (XPath) evaluation results. It is kept in a
# variable because md.collectionSpreadsheet is given the same path later.
ElementEvaluatedCSV = os.path.join(
    './data/', Organization,
    '{}_{}_ElementEvaluated.csv'.format(Collection, Dialect)
)

ElementDF = pd.read_csv(ElementEvaluatedCSV, quotechar='"')
# Last expression of the cell, so the table is displayed.
ElementDF

Unnamed: 0,Collection,Record,XPath,Content
0,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/@xsi:schemaLocation,http://www.isotc211.org/2005/gmi http://www.ng...
1,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:fileIdentifier/gco:Charac...,http://lod.bco-dmo.org/id/dataset/3470
2,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:language/gco:CharacterString,eng; USA
3,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,utf8
4,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,http://www.isotc211.org/2005/resources/Codelis...
5,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,utf8
6,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,dataset
7,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,http://www.isotc211.org/2005/resources/Codelis...
8,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,dataset
9,GeoTraces,dataset_3470.xml,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,005


In [7]:
# Load the recommendation table (the "RecTag" file).
RecommendationsDF = pd.read_csv('./RecTag.csv')

# Names offered for selection: the 'Recommendation' column with its empty
# (NaN) cells removed.
RecommendationChoices = [
    name for name in RecommendationsDF['Recommendation'].tolist()
    if str(name) != 'nan'
]

def RecChoices(Rec):
    """Return the concept names belonging to the recommendation ``Rec``.

    Looks up the first row of ``RecommendationsDF`` whose 'Recommendation'
    column equals ``Rec``, drops the empty (NaN) cells, and removes the first
    remaining cell so that only the concept names are left.

    The result is also stored in the module-level ``Recommendation`` variable
    so that later cells can use the selection.
    """
    global Recommendation
    matching_rows = RecommendationsDF[RecommendationsDF['Recommendation'] == Rec]
    first_row = matching_rows.values.tolist()[0]
    concepts = [cell for cell in first_row if str(cell) != 'nan']
    # Drop the leading cell of the row; what remains is the concept list.
    del concepts[0]
    Recommendation = concepts
    return Recommendation

# Recommendation selector dropdown: each selection calls RecChoices, which
# stores the chosen concept list in the global `Recommendation`.
w = widgets.interactive(RecChoices, Rec=RecommendationChoices)
display(w)

In [8]:
# Show the concept list currently stored in the global `Recommendation`.
# Within this notebook it is only assigned inside RecChoices, so the
# selector has to have run before this cell.
Recommendation

['Resource Title',
 'Abstract',
 'Online Resource',
 'Keyword',
 'Author / Originator',
 'Distribution Format',
 'Resource Type',
 'Resource Version',
 'Temporal Extent',
 'Spatial Extent',
 'Resource Citation']

In [9]:
# Keep only the evaluated rows whose concept is part of the selected
# recommendation.
RecommendationConceptsDF = ConceptDF.loc[
    ConceptDF['Concept'].isin(Recommendation)
]
# Last expression of the cell, so the filtered table is displayed.
RecommendationConceptsDF

Unnamed: 0,Collection,Dialect,Record,Concept,XPath,Content
0,GeoTraces,ISO,dataset_3470.xml,Resource Title,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,GT10 - Nanomolar Nutrients - Surface from the ...
2,GeoTraces,ISO,dataset_3470.xml,Abstract,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,"Nanomolar concentrations of PO4, NO3, NO2 (sur..."
133,GeoTraces,ISO,dataset_3470.xml,Online Resource,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,http://www.geotraces.org/ U.S. GEOTRACES GEOTR...
134,GeoTraces,ISO,dataset_3470.xml,Online Resource,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,http://www.geotraces.org/ U.S. GEOTRACES North...
135,GeoTraces,ISO,dataset_3470.xml,Online Resource,/gmi:MI_Metadata/gmd:contentInfo/gmd:MD_Featur...,http://www.bco-dmo.org
136,GeoTraces,ISO,dataset_3470.xml,Online Resource,/gmi:MI_Metadata/gmi:acquisitionInformation/gm...,http://bcodata.whoi.edu/US_GEOTRACES/AtlanticS...
154,GeoTraces,ISO,dataset_3470.xml,Author / Originator,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,Dr Gregory Cutter Old Dominion University prin...
236,GeoTraces,ISO,dataset_3470.xml,Resource Type,/gmi:MI_Metadata/gmd:hierarchyLevel/gmd:MD_Sco...,dataset
237,GeoTraces,ISO,dataset_3470.xml,Temporal Extent,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,2013-02-27
245,GeoTraces,ISO,dataset_3470.xml,Spatial Extent,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...,-78.9949 -9.60547 16.8546 38.34679


In [10]:
# Save the recommendation-filtered concepts; this path is passed to
# md.collectionSpreadsheet when the report is built.
RecommendationConceptEvaluatedCSV = (
    './data/{}/{}_{}_RecommendationEvaluated.csv'
    .format(Organization, Collection, Dialect)
)

RecommendationConceptsDF.to_csv(RecommendationConceptEvaluatedCSV, index=False)

Create a Google Sheets report on the collection containing the occurrence, counts, and content of Schema.org concepts and absolute content of the elements and attributes in the records
* Concept Occurrence Analysis
* Concept Counts Analysis
* Xpath Occurrence Analysis
* Xpath Counts Analysis
* Collection Spreadsheet

In [11]:
# Create the concept occurrence table for the selected recommendation.

ConceptOccurrenceCSV = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptOccurrence.csv'
)

ConceptOccurrenceDF = md.conceptOccurrence(
    RecommendationConceptsDF, Organization,
    Collection, Dialect, ConceptOccurrenceCSV
)
# Read the table back from the destination given to md.conceptOccurrence so
# that its first column becomes the row index used by reindex() below.
ConceptOccurrenceDF = pd.read_csv(ConceptOccurrenceCSV, index_col=0)

# Put the rows in recommendation order. Recommended concepts with no row in
# the table come out of reindex() as all-NaN rows.
ConceptOccurrenceDF = ConceptOccurrenceDF.reindex(
    ['Number of Records'] + Recommendation
)

# Fill those blank rows: zeros for the counts and, for 'Collection', the label
# already present in the table, so added rows match the computed ones. Only
# when the table carries no label at all is Organization_Collection used.
collection_label = Organization + '_' + Collection
if 'Collection' in ConceptOccurrenceDF.columns:
    known_labels = ConceptOccurrenceDF['Collection'].dropna()
    if not known_labels.empty:
        collection_label = known_labels.iloc[0]

values = {
    'Collection': collection_label, 'ConceptCount': 0, 'RecordCount': 0,
    'AverageOccurrencePerRecord': 0.00, 'CollectionOccurrence%': 0.00
}

ConceptOccurrenceDF = ConceptOccurrenceDF.fillna(value=values)
ConceptOccurrenceDF.to_csv(ConceptOccurrenceCSV, mode='w')




In [12]:
# Destination for the per-record concept counts table.
ConceptCountsCSV = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptCounts.csv'
)
# NOTE(review): the saved output of this cell is
# "TypeError: concat() got an unexpected keyword argument 'sort'". That call
# is not in this notebook, so it presumably comes from inside
# md.conceptCounts running on an older pandas - confirm the installed version.
occurrenceMatrix = md.conceptCounts(
    RecommendationConceptsDF, Organization,
    Collection, Dialect, ConceptCountsCSV
)

# Order the columns to follow the recommendation. reindex() is used instead of
# occurrenceMatrix[[...]] because plain label selection raises KeyError when a
# recommended concept has no column; such concepts get a column of zero counts.
occurrenceMatrix = occurrenceMatrix.reindex(
    columns=['Collection', 'Record'] + Recommendation, fill_value=0
)
occurrenceMatrix.to_csv(ConceptCountsCSV, mode='w', index=False)

TypeError: concat() got an unexpected keyword argument 'sort'

In [13]:
# Destination for the XPath occurrence summary of the collection.
XpathOccurrenceCSV = (
    './data/{}/{}_{}_ElementOccurrence.csv'
    .format(Organization, Collection, Dialect)
)

# Last expression of the cell, so the returned table is displayed.
md.xpathOccurrence(
    ElementDF,
    Organization, Collection, Dialect,
    XpathOccurrenceCSV
)

Unnamed: 0,XPath,Collection,XPathCount,RecordCount,AverageOccurrencePerRecord,CollectionOccurrence%
0,Number of Records,BCO-DMO_GeoTraces_ISO,2,2,2,2.0
1,/gmi:MI_Metadata/@xsi:schemaLocation,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
2,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
3,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
4,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
5,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
6,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
7,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
8,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0
9,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,2,2,1.00,1.0


In [14]:
# Destination for the per-record XPath counts table.
XpathCountsCSV = (
    './data/{}/{}_{}_ElementCounts.csv'
    .format(Organization, Collection, Dialect)
)

# NOTE(review): the saved run of this cell ended in
# "NameError: name 'DataDestination4' is not defined". That name does not
# appear in this notebook, so it presumably comes from inside md.XpathCounts -
# confirm against MDeval.
md.XpathCounts(
    ElementDF,
    Organization, Collection, Dialect,
    XpathCountsCSV
)

NameError: name 'DataDestination4' is not defined

In [None]:

# Path of the .xlsx report for this organization/collection/dialect.
ReportLocation = (
    './reports/{0}/{0}_{1}_{2}_Report.xlsx'
    .format(Organization, Collection, Dialect)
)

# Hand every CSV produced in the earlier cells, plus the report path, to
# md.collectionSpreadsheet.
md.collectionSpreadsheet(
    Organization, Collection, Dialect,
    RecommendationConceptEvaluatedCSV, ElementEvaluatedCSV,
    XpathOccurrenceCSV, XpathCountsCSV,
    ConceptOccurrenceCSV, ConceptCountsCSV,
    ReportLocation
)

In [None]:
# Pass the report path to md.WriteGoogleSheets.
# NOTE(review): presumably this publishes the .xlsx report as a Google Sheet
# and may need Google credentials - confirm against MDeval.
md.WriteGoogleSheets(ReportLocation)


transform to Schema.org JSON-LD for required elements for datasets