### This notebook allows the user to create a directory structure and locally upload the metadata to the MILE2 repository.

In [1]:
import pandas as pd
import os
from os import walk
import shutil
from ipywidgets import *
import ipywidgets as widgets
import requests
from contextlib import closing
import csv
import io
from IPython.display import display

### Now let's select some metadata.

Use the text boxes below to enter the curating organization, the collection name, the metadata dialect, and the local directory that contains the metadata to evaluate.

In [2]:
def OrganizationChoices(organization):
    """Record the curating organization and echo it back.

    The value is stored in the module-level ``Organization`` variable so that
    later cells can read it.
    """
    global Organization
    Organization = organization
    print("Curating Organization", organization)
    
def CollectionName(collection):
    """Record the collection name and echo it back.

    The value is stored in the module-level ``Collection`` variable so that
    later cells can read it.
    """
    global Collection
    Collection = collection
    print("Collection is named", collection)
    
def DialectName(dialect):
    """Record the metadata dialect and echo it back.

    The value is stored in the module-level ``Dialect`` variable so that
    later cells can read it.
    """
    global Dialect
    Dialect = dialect
    print("Collection is written in the", dialect, "dialect")
    
def metadataLocation(directory):
    """Record the local directory holding the metadata and echo it back.

    The value is stored in the module-level ``MetadataLocation`` variable so
    that later cells can read it.
    """
    global MetadataLocation
    MetadataLocation = directory
    print("The local directory", directory, "contains the metadata for evaluation")
    
# One free-text widget per setting; each widget calls its callback with the
# current text, and the callback stores the value in a module-level variable.
w = interactive(OrganizationChoices, organization='')
w2 = interactive(CollectionName, collection='')
w3 = interactive(DialectName, dialect='')
w4 = interactive(metadataLocation, directory='')

# Render the four widgets in order.
display(w, w2, w3, w4)

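Copy the metadata to the new directory structure, zip it, send it to the evaluation service, and write the evaluated data products (Evaluated, EvaluatedSimplified, RAD, QuickE and Occurance) to the organization's data directory.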

In [16]:
%cd ../zip
MetadataDestination=os.path.join(Organization,Collection,Dialect,'xml')
os.makedirs(MetadataDestination, exist_ok=True)
os.makedirs(os.path.join('../data',Organization), exist_ok=True)
src_files = os.listdir(MetadataLocation)
for file_name in src_files:
    full_file_name = os.path.join(MetadataLocation, file_name)
    if (os.path.isfile(full_file_name)):
        shutil.copy(full_file_name, MetadataDestination)
shutil.make_archive('../upload/metadata', 'zip', os.getcwd())

%cd ../upload 
# Send metadata package, read the response into a dataframe
url = 'http://metadig.nceas.ucsb.edu/metadata/evaluator'
files = {'zipxml': open('metadata.zip', 'rb')}
r = requests.post(url, files=files, headers={"Accept-Encoding": "gzip"})
r.raise_for_status()
EvaluatedMetadataDF = pd.read_csv(io.StringIO(r.text), quotechar='"')

#build filepaths and file names
Filedirectory=os.path.join('../data/',Organization)

Filename='/'+Collection+'_'+Dialect+'_Evaluated.csv.gz'
SimplfiedFilename='/'+Collection+'_'+Dialect+'_EvaluatedSimplified.csv.gz'
FilePath=Filedirectory+Filename
SimplifiedFilePath=Filedirectory+SimplfiedFilename
EvaluatedMetadataDF.insert(3, 'Collection', Collection+'_'+Dialect)

EvaluatedMetadataDF.to_csv(FilePath, mode = 'w', compression='gzip', index=False)

#Change directories, delete upload directory and zip. Delete copied metadata.
%cd ../
shutil.rmtree('upload')
%cd zip
shutil.rmtree(Organization)
%cd ../data

#Create a simplified XPath output
EvaluatedSimplifiedMetadataDF = EvaluatedMetadataDF.copy()
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/gco:CharacterString', '')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[a-z]+:+?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('/[A-Z]+_[A-Za-z]+/?', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.replace('//', '/')
EvaluatedSimplifiedMetadataDF['XPath']=EvaluatedSimplifiedMetadataDF['XPath'].str.rstrip('//')
EvaluatedSimplifiedMetadataDF.to_csv(SimplifiedFilePath, mode = 'w', compression='gzip', index=False)

FiledirectoryRAD=os.path.join('../data/',Organization)
FilenameRAD='/'+Collection+'_'+Dialect+'_RAD.csv'
FilePathRAD=FiledirectoryRAD+FilenameRAD
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Collection','Record', 'Concept'], as_index=False)
occuranceMatrix=group_name.size().unstack().reset_index()
occuranceMatrix=occuranceMatrix.fillna(0)
occuranceMatrix.columns.names = ['']
pd.options.display.float_format = '{:,.0f}'.format
occuranceMatrix.to_csv(FilePathRAD, mode = 'w', index=False)
occuranceMatrix

FiledirectoryQuickE=os.path.join('../data/',Organization)
FilenameQuickE='/'+Collection+'_'+Dialect+'_QuickE.csv'
FilePathQuickE=FiledirectoryQuickE+FilenameQuickE
group_name = EvaluatedSimplifiedMetadataDF.groupby(['XPath', 'Record'], as_index=False)
QuickEdf=group_name.size().unstack().reset_index()
QuickEdf=QuickEdf.fillna(0)
pd.options.display.float_format = '{:,.0f}'.format
QuickEdf.to_csv(FilePathQuickE, mode = 'w', index=False)
QuickEdf

FiledirectoryOccurance=os.path.join('../data/',Organization)
FilenameOccurance='/'+Collection+'_'+Dialect+'_Occurance.csv'
FilePathOccurance=FiledirectoryOccurance+FilenameOccurance
group_name = EvaluatedSimplifiedMetadataDF.groupby(['Record', 'Concept'], as_index=False)
occuranceMatrix=group_name.size().unstack().reset_index()
occuranceMatrix=occuranceMatrix.fillna(0)
occuranceSum=occuranceMatrix.sum()
occuranceCount=occuranceMatrix[occuranceMatrix!=0].count()
CollectionName=FilenameOccurance.partition("/")[2].partition("_Occurance.csv")[0]
result = pd.concat([occuranceSum, occuranceCount], axis=1).reset_index()
result.insert(1, 'Collection', CollectionName)
result.insert(4, 'CollectionOccurance%', CollectionName)
result.insert(4, 'AverageOccurancePerRecord', CollectionName)
result.columns = ['Concept', 'Collection', 'ConceptCount', 'RecordCount', 'AverageOccurancePerRecord', 'CollectionOccurance%' ]
NumberOfRecords = result.at[0, 'ConceptCount'].count('.xml')
result['CollectionOccurance%'] = result['RecordCount']/NumberOfRecords
result['CollectionOccurance%'] = pd.Series(["{0:.2f}%".format(val * 100) for val in result['CollectionOccurance%']], index = result.index)
result.at[0, 'ConceptCount'] = NumberOfRecords
result.at[0, 'Concept'] = 'Number of Records'
result['AverageOccurancePerRecord'] = result['ConceptCount']/NumberOfRecords
result['AverageOccurancePerRecord'] = result['AverageOccurancePerRecord'].astype(float)
result[["ConceptCount","RecordCount"]] = result[["ConceptCount","RecordCount"]].astype(int)
result['AverageOccurancePerRecord'] = pd.Series(["{0:.2f}".format(val) for val in result['AverageOccurancePerRecord']], index = result.index)
result.to_csv(FilePathOccurance, mode = 'w', index=False)
result

/Users/scgordon/MetadataEvaluation/zip
/Users/scgordon/MetadataEvaluation/upload
/Users/scgordon/MetadataEvaluation
/Users/scgordon/MetadataEvaluation/zip
/Users/scgordon/MetadataEvaluation/data


Unnamed: 0,Concept,Collection,ConceptCount,RecordCount,AverageOccurancePerRecord,CollectionOccurance%
0,Number of Records,2016_EML,250,250,1.00,100.00%
1,Abstract,2016_EML,250,250,1.00,100.00%
2,Attribute Constraints,2016_EML,42,8,0.17,3.20%
3,Attribute Definition,2016_EML,5594,231,22.38,92.40%
4,Attribute List,2016_EML,415,231,1.66,92.40%
5,Author,2016_EML,462,250,1.85,100.00%
6,Author / Originator,2016_EML,462,250,1.85,100.00%
7,Author / Originator Email Address,2016_EML,91,57,0.36,22.80%
8,Author / Originator Identifier,2016_EML,75,46,0.30,18.40%
9,Author / Originator World Wide Web Address,2016_EML,271,176,1.08,70.40%


In [56]:
# Split the simplified evaluation into elements whose content is a variant of
# "Not provided" and elements that carry other content.
# isin() tests every variant; `!= ("Not provided" or "Not%20provided")` only ever
# compared against the first string.
NotProvidedVariants = ["Not provided", "Not%20provided"]
NotProvidedMask = EvaluatedSimplifiedMetadataDF['Content'].isin(NotProvidedVariants)
ContentProvidedDF = EvaluatedSimplifiedMetadataDF[~NotProvidedMask]
ContentNotProvidedDF = EvaluatedSimplifiedMetadataDF[NotProvidedMask]


def _write_count_matrix(evaluated, group_columns, file_path):
    """Write a count matrix of ``evaluated`` to ``file_path`` as CSV.

    Rows of ``evaluated`` are counted per combination of ``group_columns``; the
    last grouping column is pivoted into columns and missing combinations are
    written as 0. Returns the matrix.
    """
    # Default as_index is required: size() must return a MultiIndex Series for unstack().
    matrix = evaluated.groupby(group_columns).size().unstack().reset_index()
    matrix = matrix.fillna(0)
    matrix.to_csv(file_path, mode = 'w', index=False)
    return matrix


def _write_occurrence_summary(evaluated, collection_label, file_path):
    """Write the per-concept occurrence summary of ``evaluated`` to ``file_path``.

    The first row is 'Number of Records'; every following row gives, for one
    concept, its total occurrences, the number of records that contain it, the
    average occurrences per record and the percentage of records containing it.
    Returns the summary frame.
    """
    matrix = evaluated.groupby(['Record', 'Concept']).size().unstack().fillna(0)
    # One matrix row per record, so the row count is the number of records.
    number_of_records = len(matrix)
    concept_counts = matrix.sum().astype(int)
    record_counts = (matrix != 0).sum().astype(int)
    summary = pd.DataFrame(
        {
            'Concept': ['Number of Records'] + list(concept_counts.index),
            'Collection': collection_label,
            'ConceptCount': [number_of_records] + list(concept_counts),
            'RecordCount': [number_of_records] + list(record_counts),
        },
        columns=['Concept', 'Collection', 'ConceptCount', 'RecordCount'],
    )
    summary['AverageOccurancePerRecord'] = ["{0:.2f}".format(val / number_of_records) for val in summary['ConceptCount']]
    summary['CollectionOccurance%'] = ["{0:.2f}%".format(val / number_of_records * 100) for val in summary['RecordCount']]
    summary.to_csv(file_path, mode = 'w', index=False)
    return summary


if len(ContentProvidedDF)==len(EvaluatedSimplifiedMetadataDF):
    print("No elements contain a variant of 'Not provided' in their content for this collection")

else:
    print("Secondary data products, RAD, QuickE, Occurance, being created for collection for all elements that contain a variant of 'Not provided' in their content and a set of products for the elements that do not contain a variant of 'Not provided' in their content")

    # Create secondary data products: RAD, QuickE, Occurance for both provided and not provided content.
    OutputDirectory = os.path.join('../data/',Organization)
    # Kept in its own variable so the CollectionName() widget callback is not shadowed.
    CollectionLabel = Collection+'_'+Dialect
    pd.options.display.float_format = '{:,.0f}'.format

    for SubsetLabel, SubsetDF in (('NotProvided', ContentNotProvidedDF), ('Provided', ContentProvidedDF)):
        # A subset with no rows (e.g. every element is 'Not provided') has nothing to pivot.
        if SubsetDF.empty:
            print("No elements in the", SubsetLabel, "subset; skipping its data products")
            continue
        ProductPrefix = OutputDirectory+'/'+CollectionLabel+'_'+SubsetLabel
        _write_count_matrix(SubsetDF, ['Collection','Record', 'Concept'], ProductPrefix+'_RAD.csv')
        _write_count_matrix(SubsetDF, ['XPath', 'Record'], ProductPrefix+'_QuickE.csv')
        _write_occurrence_summary(SubsetDF, CollectionLabel, ProductPrefix+'_Occurance.csv')
print("Good bye!")




No elements contain a variant of 'Not provided' in their content for this collection
Good bye!


### Combine different types of evaluated data to compare between collections or observe trends over time

* [Combine_Data](Combine_Data.ipynb)
