## Choose the collections you want to compare


This notebook lets you select multiple data tables and combine them into a single table for processing.

In [1]:
# pandas reads the selected tables and builds the combined and pivoted tables.
import pandas as pd
# Let displayed text columns show up to 200 characters before being truncated.
pd.options.display.max_colwidth=200
import os
# NOTE(review): `walk` is not referenced by name in the cells visible here (they call os.walk).
from os import walk
import ipywidgets as widgets
from IPython.display import display
# NOTE(review): `itemgetter` is not referenced in the cells visible here.
from operator import itemgetter
# Make sure the output directory for combined tables exists; exist_ok avoids an error on re-run.
os.makedirs('../data/Combined', exist_ok=True)

Create selection lists that allow multiple choices. Hold Command (or Ctrl) while clicking, or use the arrow keys, to add individual data tables to the selection; hold Shift while clicking to select every table between the two clicked.

In [2]:
def _find_data_files(suffix, root="../data"):
    """Return the sorted paths of all files under ``root`` whose name ends with ``suffix``.

    Parameters:
        suffix: file-name ending to match, e.g. 'QuickE.csv'.
        root: directory that is searched recursively.

    Returns:
        A sorted list of paths, each built with os.path.join from the
        directory os.walk reported and the matching file name.
    """
    return sorted(
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(suffix)
    )


def _collection_selector(paths, default, description):
    """Build a multiple-selection widget listing ``paths``.

    Parameters:
        paths: the options offered by the widget.
        default: path to pre-select. It is only pre-selected when it is one
            of ``paths``; otherwise the widget starts with nothing selected,
            because the widget rejects a value that is not among its options.
        description: label shown next to the widget.

    Returns:
        The widgets.SelectMultiple instance (not yet displayed).
    """
    return widgets.SelectMultiple(
        options=paths,
        value=[default] if default in paths else [],
        description=description,
        disabled=False
    )


#Paths to all evaluated collection data files.
EvaluatedMetadata=_find_data_files('Evaluated.csv.gz')
#widget for selecting multiple evaluated collections
w1=_collection_selector(
    EvaluatedMetadata,
    '../data/Sample/Sample_Sample_Evaluated.csv.gz',
    'Evaluated Data'
)
#Show the widget
display(w1)

#Paths to all QuickE data files.
QuickE=_find_data_files('QuickE.csv')
#widget for selecting multiple QuickE tables
w2=_collection_selector(
    QuickE,
    '../data/Sample/Sample_Sample_QuickE.csv',
    'QuickE Data'
)
#Show the widget
display(w2)

#Paths to all occurrence data files. Kept in its own list so that
#EvaluatedMetadata still refers to the evaluated files after this cell has run.
Occurance=_find_data_files('Occurance.csv')
#widget for selecting multiple occurrence tables
w3=_collection_selector(
    Occurance,
    '../data/Sample/Sample_Sample_Occurance.csv',
    'Occurance Data'
)
#Show the widget
display(w3)

#widget for naming the file
w4=widgets.Text(
    value='',
    placeholder='Your file name (no spaces)',
    description='Name your File:',
    disabled=False
)
display(w4)

Concatenate the selected files into one table and save it in the `../data/Combined` directory.

In [20]:
#The three selectors are alternatives: only the one with the most files selected is combined.
#On a tie the earliest in this list wins; the selections in the other widgets are ignored.
tupleList=[w1.value,w2.value,w3.value]
CollectionComparisons=max(tupleList,key=len)
#Fail early with a clear message instead of a generic pandas error from concatenating nothing.
if not CollectionComparisons:
    raise ValueError('Select at least one data table before combining.')
#An empty name would otherwise be written as a hidden file called ".csv.gz".
FileName=str(w4.value).strip()
if not FileName:
    raise ValueError('Enter a name for the combined file before combining.')
DataDestination=os.path.join('../data/Combined', FileName+'.csv.gz')
#Read every selected table and stack them row-wise into a single table.
CombinedDF = pd.concat([pd.read_csv(f) for f in CollectionComparisons])
#Write the combined table as a gzip-compressed CSV without the row index.
CombinedDF.to_csv(DataDestination, mode = 'w', compression='gzip', index=False)
#Show the "Number of Records" row contributed by each combined table.
CombinedDF.loc[CombinedDF['Concept'] == "Number of Records"]

Unnamed: 0,Concept,Collection,ConceptCount,RecordCount,AverageOccurancePerRecord,CollectionOccurance%
0,Number of Records,2006_EML,250,250,1,100.00%
0,Number of Records,2007_EML,250,250,1,100.00%
0,Number of Records,2008_EML,250,250,1,100.00%
0,Number of Records,2009_EML,250,250,1,100.00%
0,Number of Records,2010_EML,250,250,1,100.00%
0,Number of Records,2011_EML,250,250,1,100.00%
0,Number of Records,2012_EML,250,250,1,100.00%
0,Number of Records,2013_EML,250,250,1,100.00%
0,Number of Records,2014_EML,250,250,1,100.00%
0,Number of Records,2015_EML,250,250,1,100.00%


In [19]:
# Reshape to one row per Concept and one column per Collection; cells hold ConceptCount.
CombinedPivotDF = CombinedDF.pivot(
    index='Concept',
    columns='Collection',
    values='ConceptCount',
)
# Render floats with thousands separators and no decimal places.
pd.set_option('display.float_format', '{:,.0f}'.format)
# Display missing Concept/Collection combinations as 0; the stored pivot itself is not modified.
CombinedPivotDF.fillna(0)

Collection,2006_EML,2007_EML,2008_EML,2009_EML,2010_EML,2011_EML,2012_EML,2013_EML,2014_EML,2015_EML,2016_EML
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abstract,250,235,250,247,244,241,244,250,249,250,250
Attribute Constraints,10,63,8,0,0,7,7,0,112,0,42
Attribute Definition,3456,5019,2867,4405,2068,3189,3064,2916,6009,2078,5594
Attribute List,351,315,212,350,164,250,255,238,233,150,415
Author,390,519,521,535,392,441,370,387,473,636,462
Author / Originator,390,519,521,535,392,441,370,387,473,636,462
Author / Originator Email Address,104,236,257,109,136,39,75,107,146,237,91
Author / Originator Identifier,90,137,163,285,215,25,114,199,184,136,75
Author / Originator Identifier Type,0,0,0,0,0,1,5,1,0,0,0
Author / Originator World Wide Web Address,233,368,404,404,173,240,189,177,302,298,271


In [21]:
# Reshape to one row per Concept and one column per Collection; cells hold RecordCount.
CombinedPivotDF = CombinedDF.pivot(
    index='Concept',
    columns='Collection',
    values='RecordCount',
)
# Render floats with thousands separators and no decimal places.
pd.set_option('display.float_format', '{:,.0f}'.format)
# Display missing Concept/Collection combinations as 0; the stored pivot itself is not modified.
CombinedPivotDF.fillna(0)

Collection,2006_EML,2007_EML,2008_EML,2009_EML,2010_EML,2011_EML,2012_EML,2013_EML,2014_EML,2015_EML,2016_EML
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abstract,250,235,250,247,244,241,244,250,249,250,250
Attribute Constraints,1,15,2,0,0,2,4,0,7,0,8
Attribute Definition,193,225,179,205,131,146,217,223,149,130,231
Attribute List,193,225,179,205,131,146,217,223,149,130,231
Author,250,250,250,250,250,250,250,250,250,250,250
Author / Originator,250,250,250,250,250,250,250,250,250,250,250
Author / Originator Email Address,98,128,177,77,105,29,53,81,84,100,57
Author / Originator Identifier,49,95,92,99,119,12,80,129,83,135,46
Author / Originator Identifier Type,0,0,0,0,0,1,5,1,0,0,0
Author / Originator World Wide Web Address,189,168,228,186,127,165,104,112,174,108,176
