### Notebook Goals
* upload metadata from repo/website(10)(Sean)
* evaluate for concepts and xpaths(10)(Sean)
* Create collection spreadsheet(10)(Sean)
* create Google sheet(10)(Sean)


In [1]:
import os
import pandas as pd
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display
import MDeval as md

#### Choose some Metadata
* a note on schema conformance: the transform identifies the dialect from the default or explicit schema location. This means that if I declare I'm using EML but do not give the schema location the dialect was built from, the conceptual content of the record will not be recognized unless the metadata is first cleaned to conform to the expected schema namespaces.
* cleaning metadata - you decide to conform to the schema namespaces(should perhaps repurpose using wildcard namespace prefixes to take any local name?)(skip for first time use)
* Upload your metadata to the server using a metadataEvaluation function 
* Choose metadata from the metadata directory

In [2]:
# Folder (under ./metadata/) that the downloaded records are stored in.
CollectionName = "MetadataAnalysis"
collection_dir = './metadata/' + CollectionName
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(collection_dir, exist_ok=True)

# Remote records to fetch and the local paths to save them under.
# NOTE(review): urls and xml_files are presumably paired by position -
# confirm against md.get_records.
urls = [
    "https://data.datacite.org/application/vnd.datacite.datacite+xml/10.1016/j.ecoinf.2017.09.005",
]
xml_files = [
    collection_dir + '/10.1016.j.ecoinf.2017.09.005.xml',
]
md.get_records(urls, xml_files, well_formed=False)

#### Evaluate metadata for element content and Schema.org concepts using Metadata Evaluation Webservice
* Request element content of records
* Request concept content of records
* take concept results, filter for Schema.org concepts

First we need to set some variables that identify where the metadata is and create a place for the resulting analysis and reports.

In [3]:
# Identify the collection being evaluated.
Organization = 'BCO-DMO'
Collection = 'GeoTraces'
Dialect = 'ISO'

# Folder the evaluation cells read the XML records from.
MetadataLocation = (
    './metadata/' + Organization + '/' +
    Collection + '/' + Dialect + '/xml'
)

# Create the input folder itself. Concatenating Organization and Collection
# without a separator would create './metadata/BCO-DMOGeoTraces' and leave
# MetadataLocation missing.
os.makedirs(MetadataLocation, exist_ok=True)
# Output folders for the evaluated data and the generated reports.
os.makedirs('./data/' + Organization, exist_ok=True)
os.makedirs('./reports/' + Organization, exist_ok=True)

In [4]:
# Run md.elementEval over the records in MetadataLocation (WebService=True)
# and keep the result as ElementDF; the bare name on the last line displays it.
ElementDF = md.elementEval(MetadataLocation, Organization, Collection, Dialect, WebService=True)
ElementDF

Unnamed: 0,Collection,Concept,Content,Dialect,Record,XPath
0,GeoTraces,Abstract,"Nanomolar concentrations of PO4, NO3, NO2 (sur...",ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
1,GeoTraces,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
2,GeoTraces,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
3,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...
4,GeoTraces,Address,"Department of Ocean, Earth, and Atmospheric Sc...",ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
5,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:contentInfo/gmd:MD_Featur...
6,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:distributionInfo/gmd:MD_D...
7,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:metadataMaintenance/gmd:M...
8,GeoTraces,AssociatedDIFs,U.S. GEOTRACES,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
9,GeoTraces,AssociatedDIFs,U.S. GEOTRACES NAT,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...


In [5]:
# Run md.conceptEval over the records in MetadataLocation (WebService=True)
# and keep the result as ConceptDF; the bare name on the last line displays it.
ConceptDF = md.conceptEval(MetadataLocation, Organization, Collection, Dialect, WebService=True)
ConceptDF

Unnamed: 0,Collection,Concept,Content,Dialect,Record,XPath
0,GeoTraces,Abstract,"Nanomolar concentrations of PO4, NO3, NO2 (sur...",ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
1,GeoTraces,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
2,GeoTraces,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
3,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...
4,GeoTraces,Address,"Department of Ocean, Earth, and Atmospheric Sc...",ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
5,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:contentInfo/gmd:MD_Featur...
6,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:distributionInfo/gmd:MD_D...
7,GeoTraces,Address,WHOI MS#36,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:metadataMaintenance/gmd:M...
8,GeoTraces,AssociatedDIFs,U.S. GEOTRACES,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
9,GeoTraces,AssociatedDIFs,U.S. GEOTRACES NAT,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...


In [6]:
# Load the recommendation table from the CSV that sits next to the notebook.
RecommendationsDF = pd.read_csv('./RecTag.csv')

# Names offered by the selector: every value of the 'Recommendation' column
# except blank cells (which stringify as 'nan').
RecommendationChoices = [
    name
    for name in RecommendationsDF['Recommendation'].tolist()
    if str(name) != 'nan'
]

def RecChoices(recommendation):
    """Return the entries listed for one recommendation.

    Looks up the first row of RecommendationsDF whose 'Recommendation'
    column equals `recommendation`, drops blank cells (those that stringify
    as 'nan'), and removes the first remaining cell. The result is also
    stored in the module-level name `Recommendation` so later cells can use
    the current selection.

    Raises IndexError when no row matches `recommendation`.
    """
    global Recommendation
    matching_rows = RecommendationsDF.loc[
        RecommendationsDF['Recommendation'] == recommendation
    ]
    first_row = matching_rows.values.tolist()[0]
    Recommendation = [cell for cell in first_row if str(cell) != 'nan']
    # Drop the leading cell so only the remaining entries are returned.
    del Recommendation[0]
    return Recommendation

# Recommendation selector dropdown: wraps RecChoices in an ipywidgets
# `interactive` control whose options are RecommendationChoices.
# RecChoices stores the current selection in the global `Recommendation`.
w=interactive(RecChoices, recommendation=RecommendationChoices) 
# The selected list is read from `Recommendation` in the cells below.


In [7]:
# Render the recommendation selector widget `w` in the notebook.
display(w)

In [8]:
# Show the list stored by RecChoices for the currently selected recommendation.
Recommendation

['Keyword',
 'Resource Title',
 'Abstract',
 'Resource Format',
 'Metadata Identifier',
 'Metadata Modified Date',
 'Resource Type',
 'Bounding Box',
 'Coordinate Reference System (CRS)',
 'Association',
 'Resource Title',
 'Author / Originator',
 'Keyword',
 'Abstract',
 'Publisher',
 'Contributor Name',
 'Metadata Modified Date',
 'Resource Type',
 'Resource Format',
 'Metadata Identifier',
 'Source Citation',
 'Metadata Language',
 'Related Resource Citation',
 'Bounding Box',
 'Rights',
 'Resource Revision Date',
 'Resource Creation/Revision Date',
 'Publication Date',
 'Organization Name',
 'Security Constraints',
 'Metadata Language',
 'Resource Identifier',
 'Parent Identifier',
 'Keyword Type']

In [9]:
# Keep only the rows of ConceptDF whose 'Concept' is named in the selected
# recommendation list, then display the filtered frame.
is_recommended = ConceptDF['Concept'].isin(Recommendation)
RecommendationDF = ConceptDF.loc[is_recommended]
RecommendationDF

Unnamed: 0,Collection,Concept,Content,Dialect,Record,XPath
0,GeoTraces,Abstract,"Nanomolar concentrations of PO4, NO3, NO2 (sur...",ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
10,GeoTraces,Association,U.S. GEOTRACES http://www.geotraces.org/ U.S. ...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
11,GeoTraces,Association,U.S. GEOTRACES North Atlantic Transect http://...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
46,GeoTraces,Author / Originator,Dr Gregory Cutter Old Dominion University prin...,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
47,GeoTraces,Bounding Box,-78.9949 -9.60547 16.8546 38.34679,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
140,GeoTraces,Keyword,cruise_id,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
141,GeoTraces,Keyword,date,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
142,GeoTraces,Keyword,time,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
143,GeoTraces,Keyword,latitude,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...
144,GeoTraces,Keyword,longitude,ISO,dataset_3470.xml,/gmi:MI_Metadata/gmd:identificationInfo/gmd:MD...


In [10]:
# Destination for the recommendation-filtered concept evaluation
# (gzip-compressed CSV under ./data/<Organization>/).
DataDestination6 = './data/{}/{}_{}_RecommendationEvaluated.csv.gz'.format(
    Organization, Collection, Dialect
)
md.writeCSV(DataDestination6, RecommendationDF, Concept=True)

In [11]:
# Destination for the element evaluation
# (gzip-compressed CSV under ./data/<Organization>/).
DataDestination5 = './data/{}/{}_{}_ElementEvaluated.csv.gz'.format(
    Organization, Collection, Dialect
)
md.writeCSV(DataDestination5, ElementDF, Concept=False)

Create a Google Sheets report on the collection containing the occurrence, counts, and content of Schema.org concepts and absolute content of the elements and attributes in the records
* Concept Occurrence Analysis
* Concept Counts Analysis
* Xpath Occurrence Analysis
* Xpath Counts Analysis
* Collection Spreadsheet

In [12]:
# Create the concept occurrence table for the selected recommendation.

DataDestination2 = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptOccurrence.csv'
)

# The table is re-read from DataDestination2 right after this call, so the
# call's return value is not needed here.
# NOTE(review): this relies on md.conceptOccurrence writing its result to
# DataDestination2 - confirm.
md.conceptOccurrence(
    RecommendationDF, Organization,
    Collection, Dialect, DataDestination2
)
ConceptOccurrenceDF = pd.read_csv(DataDestination2, index_col=0)

# Order the rows to follow the recommendation. Concepts that never occur in
# the collection become all-NaN rows, which are filled in below.
ConceptOccurrenceDF = ConceptOccurrenceDF.reindex(
    ['Number of Records'] + Recommendation
)

# Fill the blank rows with the collection label already used by the rows that
# do have data, so the whole column carries one consistent value. Fall back to
# Organization_Collection only when the table provides no label at all.
collection_label = Organization + '_' + Collection
if 'Collection' in ConceptOccurrenceDF.columns:
    known_labels = ConceptOccurrenceDF['Collection'].dropna()
    if not known_labels.empty:
        collection_label = known_labels.iloc[0]

values = {
    'Collection': collection_label, 'ConceptCount': 0, 'RecordCount': 0,
    'AverageOccurrencePerRecord': 0.00, 'CollectionOccurrence%': 0.00
}

ConceptOccurrenceDF = ConceptOccurrenceDF.fillna(value=values)
ConceptOccurrenceDF.to_csv(DataDestination2, mode='w')




In [13]:
# Create the per-record concept counts table for the selected recommendation.
DataDestination3 = (
    './data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptCounts.csv'
)
occurrenceMatrix = md.conceptCounts(
    RecommendationDF, Organization,
    Collection, Dialect, DataDestination3
)

# The identifying columns must exist; fail loudly if they do not.
missing_ids = [
    column for column in ('Collection', 'Record')
    if column not in occurrenceMatrix.columns
]
if missing_ids:
    raise KeyError(
        'conceptCounts result is missing column(s): ' + ', '.join(missing_ids)
    )

# Order columns to reflect the recommendation order. reindex is used instead
# of direct selection so that a recommended concept which never occurs in the
# collection becomes a column of zero counts (matching how the concept
# occurrence table pads absent concepts) instead of raising KeyError.
occurrenceMatrix = occurrenceMatrix.reindex(
    columns=['Collection', 'Record'] + Recommendation, fill_value=0
)
occurrenceMatrix.to_csv(DataDestination3, mode='w', index=False)

In [14]:
# Destination CSV for the XPath occurrence analysis of the element evaluation.
DataDestination = './data/{}/{}_{}_XpathOccurrence.csv'.format(
    Organization, Collection, Dialect
)

# Last expression of the cell, so whatever the call returns is displayed.
md.xpathOccurrence(ElementDF, Organization, Collection, Dialect, DataDestination)

Unnamed: 0,XPath,Collection,XPathCount,RecordCount,AverageOccurrencePerRecord,CollectionOccurrence%
0,Number of Records,BCO-DMO_GeoTraces_ISO,22,22,22,22.0
1,/gmi:MI_Metadata/@xsi:schemaLocation,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
2,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
3,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
4,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_Chara...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
5,/gmi:MI_Metadata/gmd:contact,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
6,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
7,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
8,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0
9,/gmi:MI_Metadata/gmd:contact/gmd:CI_Responsibl...,BCO-DMO_GeoTraces_ISO,22,22,1.00,1.0


In [15]:
# Destination CSV for the per-record XPath counts of the element evaluation.
DataDestination4 = './data/{}/{}_{}_XpathCounts.csv'.format(
    Organization, Collection, Dialect
)

# Last expression of the cell, so whatever the call returns is displayed.
md.XpathCounts(ElementDF, Organization, Collection, Dialect, DataDestination4)

XPath,Collection,Record,/gmi:MI_Metadata/@xsi:schemaLocation,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode/@codeList,/gmi:MI_Metadata/gmd:characterSet/gmd:MD_CharacterSetCode/@codeListValue,/gmi:MI_Metadata/gmd:contact,/gmi:MI_Metadata/gmd:contact/gmd:CI_ResponsibleParty,/gmi:MI_Metadata/gmd:contact/gmd:CI_ResponsibleParty/gmd:contactInfo,/gmi:MI_Metadata/gmd:contact/gmd:CI_ResponsibleParty/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/gmd:administrativeArea,...,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:identifier/gmd:MD_Identifier/gmd:code,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:identifier/gmd:MD_Identifier/gmd:code/gmx:Anchor,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:identifier/gmd:MD_Identifier/gmd:code/gmx:Anchor/@xlink:actuate,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:identifier/gmd:MD_Identifier/gmd:code/gmx:Anchor/@xlink:href,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:identifier/gmd:MD_Identifier/gmd:code/gmx:Anchor/@xlink:title,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:platform/gmi:MI_Platform/gmi:instrument/@gco:nilReason,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:status/gmd:MD_ProgressCode/@codeList,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:status/gmd:MD_ProgressCode/@codeListValue,/gmi:MI_Metadata/gmi:acqui
sitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:type/gmi:MI_OperationTypeCode/@codeList,/gmi:MI_Metadata/gmi:acquisitionInformation/gmi:MI_AcquisitionInformation/gmi:operation/gmi:MI_Operation/gmi:type/gmi:MI_OperationTypeCode/@codeListValue
0,GeoTraces,dataset_3470.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,GeoTraces,dataset_3484.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,GeoTraces,dataset_3485.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,GeoTraces,dataset_3486.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,GeoTraces,dataset_3508.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,GeoTraces,dataset_3510.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
6,GeoTraces,dataset_3513.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
7,GeoTraces,dataset_3514.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
8,GeoTraces,dataset_3515.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9,GeoTraces,dataset_3516.xml,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [16]:

# Workbook that gathers every analysis file produced above.
ReportLocation = './reports/{0}/{0}_{1}_{2}_Report.xlsx'.format(
    Organization, Collection, Dialect
)

# Analysis files, in the positional order md.collectionSpreadsheet is called with.
report_inputs = (
    DataDestination6,   # recommendation-filtered concept evaluation
    DataDestination5,   # element evaluation
    DataDestination,    # XPath occurrence
    DataDestination4,   # XPath counts
    DataDestination2,   # concept occurrence
    DataDestination3,   # concept counts
)

md.collectionSpreadsheet(
    Organization, Collection, Dialect, *report_inputs, ReportLocation
)

In [17]:
# Hand the report workbook at ReportLocation to md.WriteGoogleSheets.
# Per the recorded output, this opens a browser for Google authentication
# and then prints the URL of the resulting spreadsheet.
md.WriteGoogleSheets(ReportLocation)

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=1030561033399-udhkgkjhr7qc4ukd7nvv6d8ejm2ejtkf.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.file+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.install&access_type=offline&response_type=code&approval_prompt=force

Authentication successful.
https://docs.google.com/spreadsheets/d/1aQwCpMmyq6CZVOe-CQh70BLsholuRwDA6IKd6Eikfk8/edit?usp=drivesdk


transform to Schema.org JSON-LD for required elements for datasets