### This notebook allows the user to create a directory structure for locally stored metadata and upload that metadata to the MILE2 repository.

In [2]:
import pandas as pd
import os
from os import walk
import shutil
from ipywidgets import *
import ipywidgets as widgets
import requests
from contextlib import closing
import csv
import io
from IPython.display import display

### Now let's select some metadata.

Create a list of subdirectories in the collection directory of MILE2 to select metadata for evaluation

In [3]:
def OrganizationChoices(organization):
    """Store the curating organization and echo it back to the user.

    Used as the callback of an ``interactive`` widget in this cell; the value
    is published through the module-level ``Organization`` variable so that
    later cells can build paths and file names from it.

    Parameters
    ----------
    organization : str
        Name of the curating organization as typed by the user.
    """
    # Only ``Organization`` is assigned here; the previously declared
    # ``OrganizationChoice`` global was never set or read, so it was dropped.
    global Organization
    Organization = organization
    print("Curating Organization", Organization)
    
def CollectionName(collection):
    """Store the collection name and echo it back to the user.

    Used as the callback of an ``interactive`` widget in this cell; the value
    is published through the module-level ``Collection`` variable so that
    later cells can build paths and file names from it.

    Parameters
    ----------
    collection : str
        Name of the metadata collection as typed by the user.
    """
    # Only ``Collection`` is assigned here; the previously declared
    # ``CollectionChoice`` global was never set or read, so it was dropped.
    global Collection
    Collection = collection
    print("Collection is named", Collection)
    
def DialectName(dialect):
    """Store the metadata dialect and echo it back to the user.

    Used as the callback of an ``interactive`` widget in this cell; the value
    is published through the module-level ``Dialect`` variable so that later
    cells can build paths and file names from it.

    Parameters
    ----------
    dialect : str
        Name of the metadata dialect as typed by the user.
    """
    # Only ``Dialect`` is assigned here; the previously declared
    # ``DialectChoice`` global was never set or read, so it was dropped.
    global Dialect
    Dialect = dialect
    print("Collection is written in the", Dialect,"dialect")
    
def metadataLocation(directory):
    """Store the local metadata directory and echo it back to the user.

    Used as the callback of an ``interactive`` widget in this cell; the value
    is published through the module-level ``MetadataLocation`` variable, which
    the copy cell later lists to find the files to evaluate.

    Parameters
    ----------
    directory : str
        Path of the local directory holding the metadata, as typed by the user.
    """
    # Only ``MetadataLocation`` is assigned here; the previously declared
    # ``LocationChoice`` global was never set or read, so it was dropped.
    global MetadataLocation
    MetadataLocation = directory
    print("The local directory", MetadataLocation,"contains the metadata for evaluation")
    
# Bind each setter function to its own interactive control, starting from an
# empty string, and keep the controls in w..w4 as before.
w = interactive(OrganizationChoices, organization='')
w2 = interactive(CollectionName, collection='')
w3 = interactive(DialectName, dialect='')
w4 = interactive(metadataLocation, directory='')

# Show the controls in the order the values are needed:
# organization, collection, dialect, metadata directory.
for selector in (w, w2, w3, w4):
    display(selector)

Copy the metadata to the new directory structure.

In [5]:
%cd ../zip
MetadataDestination=os.path.join(Organization,Collection,Dialect,'xml')
os.makedirs(MetadataDestination, exist_ok=True)

src_files = os.listdir(MetadataLocation)
for file_name in src_files:
    full_file_name = os.path.join(MetadataLocation, file_name)
    if (os.path.isfile(full_file_name)):
        shutil.copy(full_file_name, MetadataDestination)
shutil.make_archive('../upload/metadata', 'zip', os.getcwd())

/Users/scgordon/Evaluator/zip


'/Users/scgordon/Evaluator/upload/metadata.zip'

In [6]:
%cd ../upload 
# Send metadata package, read the response into a dataframe
url = 'http://metadig.nceas.ucsb.edu/metadata/evaluator'
files = {'zipxml': open('metadata.zip', 'rb')}
r = requests.post(url, files=files, headers={"Accept-Encoding": "gzip"})
r.raise_for_status()
EvaluatedMetadataDF = pd.read_csv(io.StringIO(r.text), quotechar='"')

#build filepaths and file names
Filedirectory=os.path.join('../data/',Organization)
Filename='/'+Collection+'_'+Dialect+'_Evaluated.csv.gz'
SimplfiedFilename='/'+Collection+'_'+Dialect+'_EvaluatedSimplified.csv.gz'
FilePath=Filedirectory+Filename
SimplifiedFilePath=Filedirectory+SimplfiedFilename
EvaluatedMetadataDF.insert(3, 'Collection', Organization+'_'+Collection+'_'+Dialect)

EvaluatedMetadataDF.to_csv(FilePath, mode = 'w', compression='gzip', index=False)

#Change directories, delete upload directory and zip. Delete copied metadata.
%cd ../
shutil.rmtree('upload')
%cd zip
shutil.rmtree(Organization)
%cd ../data

#Create a simplified XPath output
EvaluatedSimplifiedMetadataDF = EvaluatedMetadataDF.copy()
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/gco:CharacterString', '')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[a-z]+:+?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[A-Z]+_[A-Za-z]+/?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('//', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.rstrip('//')
EvaluatedSimplifiedMetadataDF.to_csv(SimplifiedFilePath, mode = 'w', compression='gzip', index=False)

/Users/scgordon/Evaluator/upload
/Users/scgordon/Evaluator
/Users/scgordon/Evaluator/zip
/Users/scgordon/Evaluator/data


Unnamed: 0,Concept,Content,Record,Collection,XPath
0,Abstract,"Nanomolar concentrations of PO4, NO3, NO2 (sur...",dataset_3470.xml,a_d_c,/identificationInfo/abstract
1,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,dataset_3470.xml,a_d_c,/identificationInfo/credit
2,Acknowledgement,Funding provided by NSF Ocean Sciences (NSF OC...,dataset_3470.xml,a_d_c,/identificationInfo/credit
3,Address,WHOI MS#36,dataset_3470.xml,a_d_c,/contact/contactInfo/address/deliveryPoint
4,Address,"Department of Ocean, Earth, and Atmospheric Sc...",dataset_3470.xml,a_d_c,/identificationInfo/pointOfContact/contactInfo...
5,Address,WHOI MS#36,dataset_3470.xml,a_d_c,/contentInfo/featureCatalogueCitation/citedRes...
6,Address,WHOI MS#36,dataset_3470.xml,a_d_c,/distributionInfo/distributor/distributorConta...
7,Address,WHOI MS#36,dataset_3470.xml,a_d_c,/metadataMaintenance/contact/contactInfo/addre...
8,AssociatedDIFs,U.S. GEOTRACES,dataset_3470.xml,a_d_c,/identificationInfo/aggregationInfo/aggregateD...
9,AssociatedDIFs,U.S. GEOTRACES NAT,dataset_3470.xml,a_d_c,/identificationInfo/aggregationInfo/aggregateD...


In [28]:
# Build the per-record concept occurrence matrix for the collection and save
# it as <Collection>_<Dialect>_RAD.csv in the organization's data directory.
FiledirectoryRAD=os.path.join('../data/',Organization)
FilenameRAD='/'+Collection+'_'+Dialect+'_RAD.csv'
FilePathRAD=FiledirectoryRAD+FilenameRAD
# Group with the default as_index=True: since pandas 1.1, size() on an
# as_index=False groupby returns a DataFrame, and unstack() on that no longer
# pivots Concept into columns. With the default, size() is a Series indexed by
# (Collection, Record, Concept) and unstack() spreads Concept across columns.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Collection','Record', 'Concept'])
occuranceMatrix=group_name.size().unstack().reset_index()
# Concepts absent from a record come out of unstack() as NaN; report them as 0.
occuranceMatrix=occuranceMatrix.fillna(0)
# Notebook-wide display option: show the float counts without decimals.
pd.options.display.float_format = '{:,.0f}'.format
occuranceMatrix.to_csv(FilePathRAD, mode = 'w', index=False)
occuranceMatrix

Concept,Collection,Record,Abstract,Acknowledgement,Address,AssociatedDIFs,Association,Attribute Definition,Attribute Label,Author / Originator,...,Start Time,Temporal Extent,Theme Keyword,Topic Category,URL,Units,Unknown,VariableType,Web Page,Westernmost Longitude
0,a_d_c,dataset_3470.xml,1,2,5,2,2,17,17,1,...,0,1,15,1,1,17,477,1,7,1
1,a_d_c,dataset_3484.xml,1,2,5,2,2,5,5,1,...,0,1,6,1,1,5,352,1,7,1
2,a_d_c,dataset_3485.xml,1,2,5,2,2,14,14,1,...,0,1,14,1,1,14,429,1,7,1
3,a_d_c,dataset_3486.xml,1,2,5,2,2,16,16,1,...,0,1,15,1,1,16,455,1,7,1
4,a_d_c,dataset_3508.xml,1,2,5,2,2,10,10,1,...,0,1,10,1,1,10,398,1,7,1
5,a_d_c,dataset_3510.xml,1,2,5,2,2,11,11,1,...,0,1,11,1,1,11,408,1,7,1
6,a_d_c,dataset_3513.xml,1,2,5,2,2,11,11,1,...,0,1,10,1,1,11,395,1,7,1
7,a_d_c,dataset_3514.xml,1,2,5,2,2,10,10,1,...,0,1,9,1,1,10,385,1,7,1
8,a_d_c,dataset_3515.xml,1,2,5,2,2,11,11,1,...,0,1,10,1,1,11,395,1,7,1
9,a_d_c,dataset_3516.xml,1,2,5,2,2,27,27,1,...,0,1,24,1,1,27,584,1,7,1


In [26]:
# Build the XPath-by-record occurrence matrix and save it as
# <Collection>_<Dialect>_QuickE.csv in the organization's data directory.
FiledirectoryQuickE=os.path.join('../data/',Organization)
FilenameQuickE='/'+Collection+'_'+Dialect+'_QuickE.csv'
FilePathQuickE=FiledirectoryQuickE+FilenameQuickE
# Group with the default as_index=True: since pandas 1.1, size() on an
# as_index=False groupby returns a DataFrame, and unstack() on that no longer
# pivots Record into columns. With the default, size() is a Series indexed by
# (XPath, Record) and unstack() spreads Record across columns.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['XPath', 'Record'])
QuickEdf=group_name.size().unstack().reset_index()
# XPaths absent from a record come out of unstack() as NaN; report them as 0.
QuickEdf=QuickEdf.fillna(0)
# Notebook-wide display option: show the float counts without decimals.
pd.options.display.float_format = '{:,.0f}'.format
QuickEdf.to_csv(FilePathQuickE, mode = 'w', index=False)
QuickEdf

Record,XPath,dataset_3470.xml,dataset_3484.xml,dataset_3485.xml,dataset_3486.xml,dataset_3508.xml,dataset_3510.xml,dataset_3513.xml,dataset_3514.xml,dataset_3515.xml,...,dataset_647580.xml,dataset_647606.xml,dataset_647909.xml,dataset_648030.xml,dataset_648543.xml,dataset_648753.xml,dataset_650087.xml,dataset_650135.xml,dataset_650340.xml,dataset_651138.xml
0,/@xsi:schemaLocation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,/acquisitionInformation/instrument,1,1,0,0,1,1,0,0,0,...,4,6,3,3,3,3,2,0,2,3
2,/acquisitionInformation/instrument/description,1,1,0,0,1,1,0,0,0,...,4,6,3,3,3,3,2,0,2,3
3,/acquisitionInformation/instrument/identifier/...,2,2,0,0,2,2,0,0,0,...,8,12,6,6,6,2,4,0,4,6
4,/acquisitionInformation/instrument/identifier/...,1,1,0,0,1,1,0,0,0,...,4,5,3,3,3,0,2,0,2,3
5,/acquisitionInformation/instrument/identifier/...,1,1,0,0,1,1,0,0,0,...,1,1,1,1,1,1,1,0,1,1
6,/acquisitionInformation/instrument/identifier/...,1,1,0,0,1,1,0,0,0,...,4,6,3,3,3,3,2,0,2,3
7,/acquisitionInformation/instrument/identifier/...,1,1,0,0,1,1,0,0,0,...,4,6,3,3,3,3,2,0,2,3
8,/acquisitionInformation/instrument/type,2,2,0,0,2,2,0,0,0,...,8,11,6,6,6,1,4,0,4,6
9,/acquisitionInformation/operation/description,2,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,3


In [29]:
# Summarize, per concept, how often it occurs in the collection and in how
# many records, and save the table as
# <Organization>_<Collection>_<Dialect>_Occurance.csv.
FiledirectoryOccurance=os.path.join('../data/',Organization)
FilenameOccurance='/'+Organization+'_'+Collection+'_'+Dialect+'_Occurance.csv'
FilePathOccurance=FiledirectoryOccurance+FilenameOccurance

# Group with the default as_index=True: since pandas 1.1, size() on an
# as_index=False groupby returns a DataFrame, and unstack() on that no longer
# pivots Concept into columns.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Record', 'Concept'])
occuranceMatrix=group_name.size().unstack().reset_index()
# Concepts absent from a record come out of unstack() as NaN; count them as 0.
occuranceMatrix=occuranceMatrix.fillna(0)

# One row per record, so the row count is the number of records. This replaces
# string-summing the file names and counting '.xml' substrings, which
# miscounted whenever a name did not contain '.xml' exactly once.
NumberOfRecords = len(occuranceMatrix)

# Aggregate over the concept columns only; 'Record' holds file names.
concept_counts = occuranceMatrix.drop(columns='Record')
occuranceSum = concept_counts.sum()          # total occurrences per concept
occuranceCount = concept_counts.ne(0).sum()  # records containing the concept

# Label written to the 'Collection' column. Kept under its own name so the
# CollectionName() widget callback defined earlier is no longer overwritten.
CollectionLabel = Organization+'_'+Collection+'_'+Dialect

# The first row reports the number of records; the remaining rows follow the
# concept column order of the matrix.
result = pd.DataFrame({
    'Concept': ['Number of Records'] + occuranceSum.index.tolist(),
    'Collection': CollectionLabel,
    'ConceptCount': [NumberOfRecords] + occuranceSum.astype(int).tolist(),
    'RecordCount': [NumberOfRecords] + occuranceCount.astype(int).tolist(),
})
# Both derived columns are stored as formatted strings, as in the saved CSV.
result['AverageOccurancePerRecord'] = [
    "{0:.2f}".format(val) for val in result['ConceptCount'] / NumberOfRecords
]
result['CollectionOccurance%'] = [
    "{0:.2f}%".format(val * 100) for val in result['RecordCount'] / NumberOfRecords
]
result.to_csv(FilePathOccurance, mode = 'w', index=False)
result

Unnamed: 0,Concept,Collection,ConceptCount,RecordCount,AverageOccurancePerRecord,CollectionOccurance%
0,Number of Records,a_d_c,117,117,1.00,100.00%
1,Abstract,a_d_c,117,117,1.00,100.00%
2,Acknowledgement,a_d_c,191,115,1.63,98.29%
3,Address,a_d_c,592,117,5.06,100.00%
4,AssociatedDIFs,a_d_c,261,117,2.23,100.00%
5,Association,a_d_c,261,117,2.23,100.00%
6,Attribute Definition,a_d_c,4132,117,35.32,100.00%
7,Attribute Label,a_d_c,4385,117,37.48,100.00%
8,Author / Originator,a_d_c,125,117,1.07,100.00%
9,Bounding Box,a_d_c,88,88,0.75,75.21%


### Go back to the Metadata2Data Notebook

* [Metadata To Data](Metadata2Data.ipynb)
