### This notebook allows the user to create a directory structure and locally upload the metadata to the MILE2 repository.

In [13]:
import pandas as pd
import os
from os import walk
import shutil
from ipywidgets import *
import ipywidgets as widgets
import requests
from contextlib import closing
import csv
import io
from IPython.display import display

### Now let's select some metadata.

Create a list of subdirectories in the collection directory of MILE2 to select metadata for evaluation

In [14]:
def OrganizationChoices(organization):
    """Record the curating organization and echo it back to the user.

    The value is stored in the module-level name ``Organization`` so that
    other cells can read it.
    """
    global Organization
    Organization = organization
    print("Curating Organization", organization)
    
def CollectionName(collection):
    """Record the collection name and echo it back to the user.

    The value is stored in the module-level name ``Collection`` so that
    other cells can read it.
    """
    global Collection
    Collection = collection
    print("Collection is named", collection)
    
def DialectName(dialect):
    """Record the metadata dialect and echo it back to the user.

    The value is stored in the module-level name ``Dialect`` so that
    other cells can read it.
    """
    global Dialect
    Dialect = dialect
    print("Collection is written in the", dialect, "dialect")
    
def metadataLocation(directory):
    """Record the local directory holding the metadata and echo it back.

    The value is stored in the module-level name ``MetadataLocation`` so that
    other cells can read it.
    """
    global MetadataLocation
    MetadataLocation = directory
    print("The local directory", directory, "contains the metadata for evaluation")
    
# Wrap each callback in an interactive widget. Every input starts out as an
# empty string; the callbacks store whatever the user enters in module-level
# names (Organization, Collection, Dialect, MetadataLocation).
w = interactive(OrganizationChoices, organization='')
w2 = interactive(CollectionName, collection='')
w3 = interactive(DialectName, dialect='')
w4 = interactive(metadataLocation, directory='')

# Show the four inputs in the order the values are used to build paths.
display(w, w2, w3, w4)

Copy the metadata to the new directory structure.

In [50]:
%cd ../zip
MetadataDestination=os.path.join(Organization,Collection,Dialect,'xml')
os.makedirs(MetadataDestination, exist_ok=True)
os.makedirs(os.path.join('../data',Organization), exist_ok=True)
src_files = os.listdir(MetadataLocation)
for file_name in src_files:
    full_file_name = os.path.join(MetadataLocation, file_name)
    if (os.path.isfile(full_file_name)):
        shutil.copy(full_file_name, MetadataDestination)
shutil.make_archive('../upload/metadata', 'zip', os.getcwd())

/Users/scgordon/MetadataEvaluation/zip


'/Users/scgordon/MetadataEvaluation/upload/metadata.zip'

In [51]:
%cd ../upload 
# Send metadata package, read the response into a dataframe
url = 'http://metadig.nceas.ucsb.edu/metadata/evaluator'
files = {'zipxml': open('metadata.zip', 'rb')}
r = requests.post(url, files=files, headers={"Accept-Encoding": "gzip"})
r.raise_for_status()
EvaluatedMetadataDF = pd.read_csv(io.StringIO(r.text), quotechar='"')

#build filepaths and file names
Filedirectory=os.path.join('../data/',Organization)

Filename='/'+Collection+'_'+Dialect+'_Evaluated.csv.gz'
SimplfiedFilename='/'+Collection+'_'+Dialect+'_EvaluatedSimplified.csv.gz'
FilePath=Filedirectory+Filename
SimplifiedFilePath=Filedirectory+SimplfiedFilename
EvaluatedMetadataDF.insert(3, 'Collection', Collection+'_'+Dialect)

EvaluatedMetadataDF.to_csv(FilePath, mode = 'w', compression='gzip', index=False)

#Change directories, delete upload directory and zip. Delete copied metadata.
%cd ../
shutil.rmtree('upload')
%cd zip
shutil.rmtree(Organization)
%cd ../data

#Create a simplified XPath output
EvaluatedSimplifiedMetadataDF = EvaluatedMetadataDF.copy()
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/gco:CharacterString', '')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[a-z]+:+?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[A-Z]+_[A-Za-z]+/?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('//', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.rstrip('//')
EvaluatedSimplifiedMetadataDF.to_csv(SimplifiedFilePath, mode = 'w', compression='gzip', index=False)

/Users/scgordon/MetadataEvaluation/upload
/Users/scgordon/MetadataEvaluation
/Users/scgordon/MetadataEvaluation/zip
/Users/scgordon/MetadataEvaluation/data


In [52]:
# RAD product: one row per (Collection, Record), one column per Concept,
# each cell holding how many times that concept occurs in that record.
FiledirectoryRAD=os.path.join('../data/',Organization)
FilenameRAD='/'+Collection+'_'+Dialect+'_RAD.csv'
FilePathRAD=FiledirectoryRAD+FilenameRAD
# as_index=False is deliberately not passed: since pandas 1.1 it makes size()
# return a DataFrame, and unstack() would then no longer spread Concept across
# the columns. With the default, size() is a Series keyed by the group columns
# on old and new pandas alike.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Collection','Record', 'Concept'])
occuranceMatrix=group_name.size().unstack().reset_index()
# Concepts that never occur in a record come out of unstack() as NaN.
occuranceMatrix=occuranceMatrix.fillna(0)
occuranceMatrix.columns.names = ['']
pd.options.display.float_format = '{:,.0f}'.format
occuranceMatrix.to_csv(FilePathRAD, mode = 'w', index=False)
occuranceMatrix

Unnamed: 0,Collection,Record,Abstract,Address,Bounding Box,Browse Description,Browse File Name,Browse Format,Browse URL,Cited Resource Identifier,...,Standard Name Vocabulary,Start Time,Supplemental Information,Temporal Extent,Theme Keyword,URL,Unknown,VariableType,Web Page,Westernmost Longitude
0,Sample2_Sample,C1242104955.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,2,183,1,4,1
1,Sample2_Sample,C1242476856.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,2,174,1,4,1
2,Sample2_Sample,C1242560486.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,2,177,1,4,1
3,Sample2_Sample,C1251066968.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,5,183,1,4,1
4,Sample2_Sample,C1251117718.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,5,182,1,4,1
5,Sample2_Sample,C1254640879.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,4,267,1,4,1
6,Sample2_Sample,C1257704009.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,4,160,1,4,1
7,Sample2_Sample,C1257710580.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,4,163,1,4,1
8,Sample2_Sample,C1257843632.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,4,160,1,4,1
9,Sample2_Sample,C1268959235.xml,1,2,1,0,0,0,0,1,...,5,1,1,1,1,4,212,1,4,1


In [53]:
# QuickE product: one row per simplified XPath, one column per Record,
# each cell holding how many times that XPath occurs in that record.
FiledirectoryQuickE=os.path.join('../data/',Organization)
FilenameQuickE='/'+Collection+'_'+Dialect+'_QuickE.csv'
FilePathQuickE=FiledirectoryQuickE+FilenameQuickE
# as_index=False is deliberately not passed: since pandas 1.1 it makes size()
# return a DataFrame, and unstack() would then no longer spread Record across
# the columns. With the default, size() is a Series keyed by the group columns
# on old and new pandas alike.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['XPath', 'Record'])
QuickEdf=group_name.size().unstack().reset_index()
# XPaths that never occur in a record come out of unstack() as NaN.
QuickEdf=QuickEdf.fillna(0)
pd.options.display.float_format = '{:,.0f}'.format
QuickEdf.to_csv(FilePathQuickE, mode = 'w', index=False)
QuickEdf

Record,XPath,C1242104955.xml,C1242476856.xml,C1242560486.xml,C1251066968.xml,C1251117718.xml,C1254640879.xml,C1257704009.xml,C1257710580.xml,C1257843632.xml,...,C1285672056.xml,C1287440846.xml,C1287673549.xml,C1289641362.xml,C1289642534.xml,C1289643454.xml,C1289648148.xml,C1289648318.xml,C1289653609.xml,C1289656046.xml
0,/acquisitionInformation/instrument,2,1,1,1,1,3,1,1,1,...,5,1,4,5,1,1,3,3,1,1
1,/acquisitionInformation/instrument/@id,2,1,1,1,1,8,1,1,1,...,7,1,5,7,1,1,3,3,1,1
2,/acquisitionInformation/instrument/citation/da...,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,/acquisitionInformation/instrument/citation/title,4,2,2,2,2,6,2,2,2,...,10,2,8,10,2,2,6,6,2,2
4,/acquisitionInformation/instrument/description...,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,/acquisitionInformation/instrument/identifier/...,6,3,3,3,3,9,3,3,3,...,15,3,12,15,3,3,9,9,3,3
6,/acquisitionInformation/instrument/identifier/...,2,1,1,1,1,3,1,1,1,...,5,1,4,5,1,1,3,3,1,1
7,/acquisitionInformation/instrument/mountedOn/@...,1,1,1,1,1,8,1,1,1,...,7,1,4,7,1,1,1,1,1,1
8,/acquisitionInformation/instrument/sensor/@id,2,1,1,1,1,8,1,1,1,...,7,1,5,7,1,1,3,3,1,1
9,/acquisitionInformation/instrument/sensor/cita...,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [54]:
# Occurance product: per concept, the total number of occurrences, the number
# of records containing it, and both expressed relative to the record count.
FiledirectoryOccurance=os.path.join('../data/',Organization)
FilenameOccurance='/'+Collection+'_'+Dialect+'_Occurance.csv'
FilePathOccurance=FiledirectoryOccurance+FilenameOccurance

# One row per Record, one column per Concept. as_index=False is deliberately
# not passed: since pandas 1.1 it makes size() return a DataFrame, which breaks
# the unstack() that follows.
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Record', 'Concept'])
occuranceMatrix=group_name.size().unstack().reset_index()
occuranceMatrix=occuranceMatrix.fillna(0)
# Column-wise totals and non-zero counts. The entry for the 'Record' column
# (row 0 of `result`) is not a concept; it is overwritten further down.
occuranceSum=occuranceMatrix.sum()
occuranceCount=occuranceMatrix[occuranceMatrix!=0].count()
# Same label as the 'Collection' column of the evaluated data. Kept under its
# own name so the widget callback function CollectionName is not overwritten.
CollectionLabel=Collection+'_'+Dialect
result = pd.concat([occuranceSum, occuranceCount], axis=1).reset_index()
result.insert(1, 'Collection', CollectionLabel)
# The next two columns are created with placeholder values and filled in below.
result.insert(4, 'CollectionOccurance%', CollectionLabel)
result.insert(4, 'AverageOccurancePerRecord', CollectionLabel)
result.columns = ['Concept', 'Collection', 'ConceptCount', 'RecordCount', 'AverageOccurancePerRecord', 'CollectionOccurance%' ]
# occuranceMatrix has exactly one row per record, so its length is the record
# count whatever the record file names look like.
NumberOfRecords = len(occuranceMatrix)
result['CollectionOccurance%'] = result['RecordCount']/NumberOfRecords
result['CollectionOccurance%'] = pd.Series(["{0:.2f}%".format(val * 100) for val in result['CollectionOccurance%']], index = result.index)
# Turn row 0 (the 'Record' column) into the "Number of Records" summary row.
result.at[0, 'ConceptCount'] = NumberOfRecords
result.at[0, 'Concept'] = 'Number of Records'
result['AverageOccurancePerRecord'] = result['ConceptCount']/NumberOfRecords
result['AverageOccurancePerRecord'] = result['AverageOccurancePerRecord'].astype(float)
result[["ConceptCount","RecordCount"]] = result[["ConceptCount","RecordCount"]].astype(int)
result['AverageOccurancePerRecord'] = pd.Series(["{0:.2f}".format(val) for val in result['AverageOccurancePerRecord']], index = result.index)
result.to_csv(FilePathOccurance, mode = 'w', index=False)
result

Unnamed: 0,Concept,Collection,ConceptCount,RecordCount,AverageOccurancePerRecord,CollectionOccurance%
0,Number of Records,Sample2_Sample,101,101,1.00,100.00%
1,Abstract,Sample2_Sample,101,101,1.00,100.00%
2,Address,Sample2_Sample,201,101,1.99,100.00%
3,Bounding Box,Sample2_Sample,103,101,1.02,100.00%
4,Browse Description,Sample2_Sample,15,15,0.15,14.85%
5,Browse File Name,Sample2_Sample,15,15,0.15,14.85%
6,Browse Format,Sample2_Sample,15,15,0.15,14.85%
7,Browse URL,Sample2_Sample,15,15,0.15,14.85%
8,Cited Resource Identifier,Sample2_Sample,101,101,1.00,100.00%
9,Cited Resource Title,Sample2_Sample,1054,101,10.44,100.00%


In [56]:
# Split the evaluated elements by whether their content is a variant of
# "Not provided", and write a RAD, QuickE and Occurance product for each half.

# Content values that count as "not provided". Membership is tested with isin():
# the expression ("Not provided" or "Not%20provided") evaluates to
# "Not provided" alone and would let the URL-encoded variant through.
NotProvidedVariants = ["Not provided", "Not%20provided"]


def _product_path(suffix):
    """Return '../data/<Organization>/<Collection>_<Dialect>_<suffix>.csv'."""
    return os.path.join('../data/', Organization) + '/' + Collection + '_' + Dialect + '_' + suffix + '.csv'


def _count_matrix(frame, keys):
    """Count the rows of ``frame`` for every combination of ``keys``.

    The last key is spread across the columns, the remaining keys become
    ordinary columns, and combinations that never occur are filled with 0.
    """
    # as_index=False is deliberately not passed: since pandas 1.1 it makes
    # size() return a DataFrame, which breaks the unstack() that follows.
    return frame.groupby(keys).size().unstack().reset_index().fillna(0)


def _occurance_summary(frame, collection_label):
    """Build the per-concept occurrence table for ``frame``.

    Returns a DataFrame with the columns Concept, Collection, ConceptCount,
    RecordCount, AverageOccurancePerRecord and CollectionOccurance%. Row 0 is
    a "Number of Records" summary row; every other row describes one concept.
    """
    matrix = _count_matrix(frame, ['Record', 'Concept'])
    # Column-wise totals and non-zero counts. The entry for the 'Record'
    # column (row 0 of the summary) is not a concept; it is overwritten below.
    totals = matrix.sum()
    records_with_concept = matrix[matrix != 0].count()
    summary = pd.concat([totals, records_with_concept], axis=1).reset_index()
    summary.insert(1, 'Collection', collection_label)
    # The next two columns are created with placeholder values and filled in below.
    summary.insert(4, 'CollectionOccurance%', collection_label)
    summary.insert(4, 'AverageOccurancePerRecord', collection_label)
    summary.columns = ['Concept', 'Collection', 'ConceptCount', 'RecordCount', 'AverageOccurancePerRecord', 'CollectionOccurance%' ]
    # The matrix has exactly one row per record, so its length is the record
    # count whatever the record file names look like.
    number_of_records = len(matrix)
    share_of_records = summary['RecordCount']/number_of_records
    summary['CollectionOccurance%'] = pd.Series(["{0:.2f}%".format(val * 100) for val in share_of_records], index = summary.index)
    summary.at[0, 'ConceptCount'] = number_of_records
    summary.at[0, 'Concept'] = 'Number of Records'
    average_per_record = (summary['ConceptCount']/number_of_records).astype(float)
    summary[["ConceptCount","RecordCount"]] = summary[["ConceptCount","RecordCount"]].astype(int)
    summary['AverageOccurancePerRecord'] = pd.Series(["{0:.2f}".format(val) for val in average_per_record], index = summary.index)
    return summary


# Rows with missing (NaN) content are not a "Not provided" variant, so they
# stay on the provided side.
IsNotProvided = EvaluatedSimplifiedMetadataDF.Content.isin(NotProvidedVariants)
ContentProvidedDF = EvaluatedSimplifiedMetadataDF[~IsNotProvided]
ContentNotProvidedDF = EvaluatedSimplifiedMetadataDF[IsNotProvided]

if len(ContentProvidedDF)==len(EvaluatedSimplifiedMetadataDF):
    print("No elements contain a variant of 'Not provided' in their content for this collection")

else:
    print("Secondary data products, RAD, QuickE, Occurance, being created for collection for all elements that contain a variant of 'Not provided' in their content and a set of products for the elements that do not contain a variant of 'Not provided' in their content")

    pd.options.display.float_format = '{:,.0f}'.format
    # Same label as the 'Collection' column of the evaluated data.
    CollectionLabel = Collection+'_'+Dialect

    for SubsetName, SubsetDF in (('NotProvided', ContentNotProvidedDF), ('Provided', ContentProvidedDF)):
        # Only the provided half can be empty here (when every element is
        # "not provided"); there is nothing to count for it then.
        if SubsetDF.empty:
            print("No elements in the", SubsetName, "subset; skipping its products")
            continue
        _count_matrix(SubsetDF, ['Collection','Record', 'Concept']).to_csv(_product_path(SubsetName+'_RAD'), mode = 'w', index=False)
        _count_matrix(SubsetDF, ['XPath', 'Record']).to_csv(_product_path(SubsetName+'_QuickE'), mode = 'w', index=False)
        _occurance_summary(SubsetDF, CollectionLabel).to_csv(_product_path(SubsetName+'_Occurance'), mode = 'w', index=False)
print("Good bye!")




No elements contain a variant of 'Not provided' in their content for this collection
Good bye!


### Combine different types of evaluated data to compare between collections or observe trends over time

* [Combine_Data](Combine_Data.ipynb)
