### Notebook Goals
* upload metadata from repo/website(10)(Sean)
* evaluate for concepts and xpaths(10)(Sean)
* Create collection spreadsheet(10)(Sean)
* create Google sheet(10)(Sean)


In [None]:
import os
import pandas as pd
import MDeval as md

#### Choose some Metadata
* A note on schema conformance: the transform identifies the dialect from the default or explicit schema location. This means that if I declare I'm using EML, but not the schema location the dialect was built from, the conceptual content of the record will not be recognized unless the metadata is cleaned first (see the next bullet).
* Cleaning metadata: you decide to conform to the schema namespaces (should perhaps repurpose using wildcard namespace prefixes to take any local name?) (skip for first-time use).
* Upload your metadata to the server using a metadataEvaluation function 
* Choose metadata from the metadata directory

In [None]:
# Folder under ./metadata that the downloaded records are stored in.
CollectionName = "MetadataAnalysis"
collection_dir = './metadata/' + CollectionName
if not os.path.exists(collection_dir):
    os.makedirs(collection_dir)

# Remote records to fetch and the local files to save them as.
# NOTE(review): the two lists are presumably paired by position -- confirm
# against md.get_records.
urls = [
    "https://data.datacite.org/application/vnd.datacite.datacite+xml/10.1016/j.ecoinf.2017.09.005",
]
xml_files = [
    collection_dir + '/10.1016.j.ecoinf.2017.09.005.xml',
]
md.get_records(urls, xml_files, well_formed=False)

#### Evaluate metadata for element content and Schema.org concepts using Metadata Evaluation Webservice
* Request element content of records
* Request concept content of records
* take concept results, filter for Schema.org concepts

In [None]:
# Identify the collection that the rest of the notebook evaluates.
Organization, Collection, Dialect = 'BCO-DMO', 'GeoTraces', 'ISO'

# Location of the XML records:
# ./metadata/<Organization>/<Collection>/<Dialect>/xml
MetadataLocation = '/'.join(
    ['./metadata', Organization, Collection, Dialect, 'xml']
)

# Create the per-collection data folder on first use.
collection_data_dir = './data/' + Collection
if not os.path.exists(collection_data_dir):
    os.makedirs(collection_data_dir)

In [None]:
# Request the element content of the records found at MetadataLocation.
# NOTE(review): WebService=True here, while the concept evaluation cell
# passes WebService=False -- confirm the difference is intentional.
ElementDF = md.elementEval(
    MetadataLocation,
    Organization,
    Collection,
    Dialect,
    WebService=True,
)
ElementDF

In [None]:
# Request the concept content of the records found at MetadataLocation.
# NOTE(review): WebService=False here, while the element evaluation cell
# passes WebService=True -- confirm the difference is intentional.
ConceptDF = md.conceptEval(
    MetadataLocation,
    Organization,
    Collection,
    Dialect,
    WebService=False,
)
ConceptDF

In [None]:
# Concept names that make up the recommendation being evaluated.
RecommendationList = [
    'Resource Identifier',
    'Resource Title',
    'Author / Originator',
    'Metadata Contact',
    'Contributor Name',
    'Publisher',
    'Publication Date',
    'Resource Contact',
    'Abstract',
    'Keyword',
    'Resource Distribution',
    'Spatial Extent',
    'Taxonomic Extent',
    'Temporal Extent',
    'Maintenance',
    'Resource Use Constraints',
    'Process Step',
    'Project Description',
    'Entity Type Definition',
    'Attribute Definition',
    'Resource Access Constraints',
    'Resource Format',
    'Attribute List',
    'Attribute Constraints',
    'Resource Quality Description',
]

# Keep only the evaluated rows whose 'Concept' belongs to the recommendation.
is_recommended = ConceptDF['Concept'].isin(RecommendationList)
RecommendationDF = ConceptDF.loc[is_recommended]
RecommendationDF

In [None]:

# Destination for the recommendation-filtered concept evaluation. pandas
# infers gzip compression from the '.gz' suffix by default.
DataDestination = (
    '../data/' + Organization + '/' + Collection +
    '_' + Dialect + '_RecommendationEvaluated.csv.gz'
)
# The setup cell only creates './data/<Collection>', which is not the folder
# written to here. Create the real parent folder so to_csv does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(DataDestination), exist_ok=True)
RecommendationDF.to_csv(DataDestination, mode='w')

Create a Google Sheets report on the collection containing the occurrence, counts, and content of Schema.org concepts and absolute content of the elements and attributes in the records
* Concept Occurrence Analysis
* Concept Counts Analysis
* Xpath Occurrence Analysis
* Xpath Counts Analysis
* Collection Spreadsheet

In [None]:
# Create the concept occurrence table for the collection.

DataDestination2 = (
    '../data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptOccurrence.csv'
)

ConceptOccurrenceDF = md.conceptOccurrence(
    ConceptDF, Organization,
    Collection, Dialect, DataDestination2
)

# The reorder below matches on row labels, so the labels must be concept names.
# NOTE(review): the index returned by md.conceptOccurrence is not visible from
# this notebook; if the names arrive in a 'Concept' column, promote it to the
# index. Confirm the resulting CSV layout is what md.collectionSpreadsheet
# expects.
if 'Concept' in ConceptOccurrenceDF.columns:
    ConceptOccurrenceDF = ConceptOccurrenceDF.set_index('Concept')

# Change order of rows to be meaningful for the recommendation. Concepts that
# never occur in the collection become all-NaN rows, filled in below.
ConceptOccurrenceDF = ConceptOccurrenceDF.reindex(
    ['Number of Records'] + RecommendationList
)

# Fill blank spaces with the 'Collection' value of a row that is present.
# The first populated row is used instead of a fixed position, because the
# rows are labelled by concept name after the reindex.
# NOTE(review): assumes every populated row carries the same 'Collection'
# value -- confirm.
populatedCollections = ConceptOccurrenceDF['Collection'].dropna()
if populatedCollections.empty:
    raise ValueError(
        'None of the recommended concepts were found in the concept '
        'occurrence table; check the row labels returned by '
        'md.conceptOccurrence.'
    )
collectionFill = populatedCollections.iloc[0]
values = {
    'Collection': collectionFill, 'ConceptCount': 0, 'RecordCount': 0,
    'AverageOccurrencePerRecord': 0.00, 'CollectionOccurrence%': 0.00
}

ConceptOccurrenceDF = ConceptOccurrenceDF.fillna(value=values)
ConceptOccurrenceDF.to_csv(DataDestination2, mode='w')




In [None]:
# Destination CSV for the per-record concept counts.
DataDestination3 = (
    '../data/' + Organization + '/' +
    Collection + '_' + Dialect + '_ConceptCounts.csv'
)
# conceptCounts is called on the MDeval module (md), like the other analysis
# helpers in this notebook; no ConceptCountsDF object is defined anywhere.
occurrenceMatrix = md.conceptCounts(
    RecommendationDF, Organization,
    Collection, Dialect, DataDestination3
)

# Order columns to reflect recommendation order. The identifying columns are
# required (set_index raises KeyError if they are missing). A recommended
# concept that never occurs in the collection is added as a column of zeros
# instead of aborting the cell with a KeyError.
occurrenceMatrix = (
    occurrenceMatrix
    .set_index(['Collection', 'Record'])
    .reindex(columns=RecommendationList, fill_value=0)
    .reset_index()
)
occurrenceMatrix.to_csv(DataDestination3, mode='w', index=False)

In [None]:
# Destination CSV for the xpath occurrence analysis. This rebinds
# DataDestination (previously the recommendation CSV path); the collection
# spreadsheet cell further down reads the value assigned here.
DataDestination = '../data/{}/{}_{}_XpathOccurrence.csv'.format(
    Organization, Collection, Dialect
)

md.xpathOccurrence(
    ElementDF,
    Organization,
    Collection,
    Dialect,
    DataDestination,
)

In [None]:
# Destination CSV for the xpath counts analysis.
DataDestination4 = '../data/{}/{}_{}_XpathCounts.csv'.format(
    Organization, Collection, Dialect
)

# NOTE(review): the sibling helper is spelled md.xpathOccurrence (lowercase
# x); confirm the capitalisation of XpathCounts against MDeval.
md.XpathCounts(
    ElementDF,
    Organization,
    Collection,
    Dialect,
    DataDestination4,
)

In [None]:
# Path of the Excel workbook that collects all of the analyses:
# ../reports/<Organization>/<Organization>_<Collection>_<Dialect>_Report.xlsx
report_file_name = '_'.join(
    [Organization, Collection, Dialect, 'Report.xlsx']
)
ReportLocation = '/'.join(['../reports', Organization, report_file_name])

# NOTE(review): EvaluatedConcepts and EvaluatedXpaths are not assigned in any
# cell of this notebook, so this call raises NameError on a fresh kernel.
# Define them (presumably the evaluated concept/xpath data -- confirm against
# md.collectionSpreadsheet) before running this cell.
# DataDestination is the xpath occurrence CSV at this point, DataDestination4
# the xpath counts CSV, DataDestination2 the concept occurrence CSV and
# DataDestination3 the concept counts CSV.
md.collectionSpreadsheet(
    Organization,
    Collection,
    Dialect,
    EvaluatedConcepts,
    EvaluatedXpaths,
    DataDestination,
    DataDestination4,
    DataDestination2,
    DataDestination3,
    ReportLocation,
)

In [None]:
# Hand the report workbook path to MDeval's Google Sheets writer.
# NOTE(review): presumably uploads the .xlsx report as a Google Sheet and
# needs Google credentials -- confirm against MDeval.
md.WriteGoogleSheets(ReportLocation)

transform to Schema.org JSON-LD for required elements for datasets