## Choose the collections you want to compare


This notebook lets you select multiple data tables and combine them into one table for processing.

In [1]:
import pandas as pd
pd.options.display.max_colwidth=200
import os
from os import walk
import ipywidgets as widgets
from IPython.display import display
from operator import itemgetter

import matplotlib

# Ensure the output directory for combined tables exists; exist_ok=True makes
# this a no-op when the directory is already present, so the cell can be re-run.
os.makedirs('../data/Combined', exist_ok=True)

Create selection lists that allow multiple selections. Hold Command (or Ctrl) while clicking, or use the arrow keys, to add individual data tables to the selection; hold Shift while clicking to select every table between the two clicked entries.

In [2]:
# Collect the path of every evaluated-collection file (*Evaluated.csv.gz) found
# anywhere under ../data.
EvaluatedMetadata=[]
for dirpath, dirnames, filenames in os.walk("../data"):
    for filename in filenames:
        if filename.endswith('Evaluated.csv.gz'):
            EvaluatedMetadata.append(os.path.join(dirpath, filename))

# Preselect the sample file only when it was actually discovered: SelectMultiple
# raises a TraitError if `value` contains an entry that is not in `options`.
# Comparing normalized paths keeps the match working when os.walk returns
# backslash-separated paths (e.g. on Windows).
sampleEvaluated=os.path.normpath('../data/Sample/Sample_Sample_Evaluated.csv.gz')

# Widget for selecting multiple collections
w1=widgets.SelectMultiple(
    options=sorted(EvaluatedMetadata),
    value=[path for path in EvaluatedMetadata if os.path.normpath(path)==sampleEvaluated],
    description='Evaluated Data',
    disabled=False
)
# Show the widget
display(w1)

# Collect the path of every QuickE file (*QuickE.csv) found anywhere under ../data.
QuickE=[]
for dirpath, dirnames, filenames in os.walk("../data"):
    for filename in filenames:
        if filename.endswith('QuickE.csv'):
            QuickE.append(os.path.join(dirpath, filename))

# Preselect the sample file only when it was actually discovered: SelectMultiple
# raises a TraitError if `value` contains an entry that is not in `options`.
# Comparing normalized paths keeps the match working when os.walk returns
# backslash-separated paths (e.g. on Windows).
sampleQuickE=os.path.normpath('../data/Sample/Sample_Sample_QuickE.csv')

# Widget for selecting multiple collections
w2=widgets.SelectMultiple(
    options=sorted(QuickE),
    value=[path for path in QuickE if os.path.normpath(path)==sampleQuickE],
    description='QuickE Data',
    disabled=False
)
# Show the widget
display(w2)

# Collect the path of every occurrence file (*Occurance.csv) found anywhere
# under ../data. A dedicated list is used so the evaluated-metadata paths
# gathered earlier in this cell are not overwritten.
OccuranceData=[]
for dirpath, dirnames, filenames in os.walk("../data"):
    for filename in filenames:
        if filename.endswith('Occurance.csv'):
            OccuranceData.append(os.path.join(dirpath, filename))

# Preselect the sample file only when it was actually discovered: SelectMultiple
# raises a TraitError if `value` contains an entry that is not in `options`.
# Comparing normalized paths keeps the match working when os.walk returns
# backslash-separated paths (e.g. on Windows).
sampleOccurance=os.path.normpath('../data/Sample/Sample_Sample_Occurance.csv')

# Widget for selecting multiple collections
w3=widgets.SelectMultiple(
    options=sorted(OccuranceData),
    value=[path for path in OccuranceData if os.path.normpath(path)==sampleOccurance],
    description='Occurance Data',
    disabled=False
)
# Show the widget
display(w3)

# Free-text box whose value is later used as the base name of the combined file.
w4 = widgets.Text(
    description='Name your File:',
    placeholder='Your file name (no spaces)',
    value='',
    disabled=False,
)
# Show the widget
display(w4)

Concatenate the selected files into one table and save it in the `../data/Combined` directory.

In [7]:
# Use the selector that holds the most selected files. On a tie max() keeps the
# earliest entry of the list, i.e. w1 before w2 before w3.
# NOTE(review): the pivot below needs 'Concept', 'Collection' and 'ConceptCount'
# columns - confirm which of the three file types actually provides them.
tupleList=[w1.value,w2.value,w3.value]
CollectionComparisons=max(tupleList,key=len)
if not CollectionComparisons:
    # pd.concat would otherwise fail with a far less helpful message.
    raise ValueError('Select at least one data table before combining.')

# An empty name would silently write a hidden "../data/Combined/.csv.gz" file,
# so insist on a real name (surrounding whitespace is ignored).
CombinedName=str(w4.value).strip()
if not CombinedName:
    raise ValueError('Enter a file name in the "Name your File:" box before combining.')
DataDestination=os.path.join('../data/Combined', CombinedName+'.csv.gz')

# Stack the selected tables and save the result as a gzip-compressed CSV.
CombinedDF = pd.concat([pd.read_csv(f) for f in CollectionComparisons])
CombinedDF.to_csv(DataDestination, mode = 'w', compression='gzip', index=False)

# One row per concept, one column per collection; cells hold ConceptCount.
CombinedPivotDF = CombinedDF.pivot(index='Concept', columns='Collection', values='ConceptCount')
# Display floats with thousands separators and no decimal places.
pd.options.display.float_format = '{:,.0f}'.format
# A concept missing from a collection is shown as 0 rather than NaN.
ConceptCountsDF=CombinedPivotDF.fillna(0)
# Blank out the column-axis label so no 'Collection' header is rendered.
ConceptCountsDF.columns.names = ['']
ConceptCountsDF=ConceptCountsDF.reset_index()
ConceptCountsDF

Unnamed: 0,Concept,2006_EML,2007_EML,2008_EML,2009_EML,2010_EML,2011_EML,2012_EML,2013_EML,2014_EML,2015_EML,2016_EML
0,Abstract,250,235,250,247,244,241,244,250,249,250,250
1,Attribute Constraints,10,63,8,0,0,7,7,0,112,0,42
2,Attribute Definition,3456,5019,2867,4405,2068,3189,3064,2916,6009,2078,5594
3,Attribute List,351,315,212,350,164,250,255,238,233,150,415
4,Author,390,519,521,535,392,441,370,387,473,636,462
5,Author / Originator,390,519,521,535,392,441,370,387,473,636,462
6,Author / Originator Email Address,104,236,257,109,136,39,75,107,146,237,91
7,Author / Originator Identifier,90,137,163,285,215,25,114,199,184,136,75
8,Author / Originator Identifier Type,0,0,0,0,0,1,5,1,0,0,0
9,Author / Originator World Wide Web Address,233,368,404,404,173,240,189,177,302,298,271


In [8]:
# Reshape to one row per concept and one column per collection, with the
# ConceptCount value in each cell.
CombinedPivotDF = CombinedDF.pivot(index='Concept', columns='Collection', values='ConceptCount')

# Display floats with thousands separators and no decimal places.
pd.set_option('display.float_format', '{:,.0f}'.format)

# A concept missing from a collection is shown as 0 rather than NaN.
ConceptCountsDF = CombinedPivotDF.fillna(0)
# Blank out the column-axis label so no 'Collection' header is rendered.
ConceptCountsDF.columns.name = ''
# Turn the Concept index back into an ordinary column.
ConceptCountsDF = ConceptCountsDF.reset_index()
ConceptCountsDF


Unnamed: 0,Concept,2006_EML,2007_EML,2008_EML,2009_EML,2010_EML,2011_EML,2012_EML,2013_EML,2014_EML,2015_EML,2016_EML
0,Abstract,250,235,250,247,244,241,244,250,249,250,250
1,Attribute Constraints,10,63,8,0,0,7,7,0,112,0,42
2,Attribute Definition,3456,5019,2867,4405,2068,3189,3064,2916,6009,2078,5594
3,Attribute List,351,315,212,350,164,250,255,238,233,150,415
4,Author,390,519,521,535,392,441,370,387,473,636,462
5,Author / Originator,390,519,521,535,392,441,370,387,473,636,462
6,Author / Originator Email Address,104,236,257,109,136,39,75,107,146,237,91
7,Author / Originator Identifier,90,137,163,285,215,25,114,199,184,136,75
8,Author / Originator Identifier Type,0,0,0,0,0,1,5,1,0,0,0
9,Author / Originator World Wide Web Address,233,368,404,404,173,240,189,177,302,298,271


In [12]:
# Reshape to one row per concept and one column per collection, this time with
# the AverageOccurancePerRecord value in each cell.
CombinedPivotDF = CombinedDF.pivot(index='Concept', columns='Collection', values='AverageOccurancePerRecord')

# Display floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# A concept missing from a collection is shown as 0 rather than NaN.
RecordCountsDF = CombinedPivotDF.fillna(0)
# Blank out the column-axis label so no 'Collection' header is rendered.
RecordCountsDF.columns.name = ''
# Only the displayed copy gets Concept as a column; RecordCountsDF itself keeps
# Concept as its index because the result is not assigned back.
RecordCountsDF.reset_index()

Unnamed: 0,Concept,2006_EML,2007_EML,2008_EML,2009_EML,2010_EML,2011_EML,2012_EML,2013_EML,2014_EML,2015_EML,2016_EML
0,Abstract,1.00,0.94,1.00,0.99,0.98,0.96,0.98,1.00,1.00,1.00,1.00
1,Attribute Constraints,0.04,0.25,0.03,0.00,0.00,0.03,0.03,0.00,0.45,0.00,0.17
2,Attribute Definition,13.82,20.08,11.47,17.62,8.27,12.76,12.26,11.66,24.04,8.31,22.38
3,Attribute List,1.40,1.26,0.85,1.40,0.66,1.00,1.02,0.95,0.93,0.60,1.66
4,Author,1.56,2.08,2.08,2.14,1.57,1.76,1.48,1.55,1.89,2.54,1.85
5,Author / Originator,1.56,2.08,2.08,2.14,1.57,1.76,1.48,1.55,1.89,2.54,1.85
6,Author / Originator Email Address,0.42,0.94,1.03,0.44,0.54,0.16,0.30,0.43,0.58,0.95,0.36
7,Author / Originator Identifier,0.36,0.55,0.65,1.14,0.86,0.10,0.46,0.80,0.74,0.54,0.30
8,Author / Originator Identifier Type,0.00,0.00,0.00,0.00,0.00,0.00,0.02,0.00,0.00,0.00,0.00
9,Author / Originator World Wide Web Address,0.93,1.47,1.62,1.62,0.69,0.96,0.76,0.71,1.21,1.19,1.08
