## This notebook lets the user examine all of the content recorded for a specific concept across an entire collection.

### Read in the CSV that the user selects by choosing an organization and a collection.

In [168]:
import pandas as pd
pd.options.display.max_colwidth=2000
import os
from os import walk
import numpy as np
from ipykernel import kernelapp as app
from __future__ import print_function
from ipywidgets import *
import ipywidgets as widgets
from bokeh.charts import output_notebook, output_file, show, Bar, Scatter, Histogram, TimeSeries
from bokeh.plotting import figure
from bokeh.models import Range1d, HoverTool, ResizeTool
from bokeh.charts import defaults
from bokeh.models import Legend
# Set the width/height attributes on the bokeh.charts `defaults` object
# (presumably pixel dimensions applied to charts created later — TODO confirm).
defaults.width = 1200
defaults.height = 800
# Configure Bokeh to display its output in the notebook.
output_notebook()
#from ipywidgets import Button, Layout
#from glob import glob

### Query the data directory for its subdirectory names and return them in a list.

In [169]:
# Organizations are the immediate sub-directories of the data root, so only the
# first (top-level) tuple produced by walk() is needed. walk() yields nothing
# when the root does not exist, hence the empty default tuple.
# The directory order reported by the filesystem is arbitrary; sorting keeps
# the organization dropdown stable from run to run and machine to machine.
Organizations = sorted(next(walk('../../ConceptMining/data/'), ('', [], []))[1])
Organizations

['BCO-DMO',
 'DataOne',
 'IEDA',
 'LTERthroughTime',
 'NASA',
 'NCAR',
 'ORNL',
 'USGS']

### Create a function that stores the chosen organization's directory and lists its files; that list populates the collection dropdown.

In [170]:
def OrganizationChoices(Organization):
    """Record the selected organization and list the collections it contains.

    Intended as the callback for the organization dropdown. Results are
    published through module-level globals so that later cells can use them:

    * ``OrganizationChoice`` -- path of the organization's data directory.
    * ``Collections`` -- sorted names of the entries in that directory.

    Parameters
    ----------
    Organization : str
        Name of a sub-directory of ``../../ConceptMining/data``.

    Returns
    -------
    None
        Nothing is returned, so the widget displays no output of its own.

    Raises
    ------
    OSError
        If the organization directory cannot be listed.
    """
    global OrganizationChoice
    global Collections
    OrganizationChoice = os.path.join('../../ConceptMining/data', Organization)
    # os.listdir returns entries in arbitrary order; sort them so the
    # collection dropdown is deterministic.
    Collections = sorted(os.listdir(OrganizationChoice))


### Choose the organization you want to look at by creating a dropdown with the function that identifies the organizations that have data.

In [171]:
# Build the organization selector; every selection calls OrganizationChoices
# with the chosen entry of Organizations.
widgets.interactive(OrganizationChoices, Organization=Organizations)

### Function that reads the selected csv into a dataframe

In [172]:
def CollectionChoices(Collection):
    """Load the selected collection CSV into a DataFrame.

    The file is looked up inside the directory held by the global
    ``OrganizationChoice``. The loaded frame is stored in the global
    ``CollectionConceptsDF`` and also returned.

    Parameters
    ----------
    Collection : str
        File name of the CSV within the selected organization's directory.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file.
    """
    global CollectionConceptsDF
    csv_path = os.path.join(OrganizationChoice, Collection)
    CollectionConceptsDF = pd.read_csv(csv_path)
    return CollectionConceptsDF

### Choose the CSV you want to examine

In [173]:
# Build the collection selector; every selection calls CollectionChoices.
# NOTE: Collections is read when this cell runs, so re-run the cell after
# picking a different organization to refresh the options.
widgets.interactive(CollectionChoices, Collection=Collections)

Unnamed: 0,Collection,Dialect,Record,Concept,XPath,Content
0,ECL,DCITE,100004.xml,Abstract,/resource/descriptions/description[@descriptionType='Abstract'],"Analyses of volatile, major, and trace elements for a suite of glasses and melt inclusions from the 85 degrees E segment of the ultra-slow spreading Gakkel Ridge."
1,ECL,DCITE,100004.xml,Bounding Box,/resource/geoLocations/geoLocation/geoLocationPlace,Gakkel Ridge Arctic Ocean
2,ECL,DCITE,100004.xml,Contributor Name,/resource/contributors/contributor/contributorName,EarthChem Library (ECL)
3,ECL,DCITE,100004.xml,Contributor Role,/resource/contributors/contributor/@contributorType,Editor
4,ECL,DCITE,100004.xml,Distribution Format,/resource/formats/format,application/vnd.ms-excel
5,ECL,DCITE,100004.xml,First Name,/resource/creators/creator/creatorName,"Shaw, Alison"
6,ECL,DCITE,100004.xml,Keyword,/resource/subjects/subject,Geochemistry
7,ECL,DCITE,100004.xml,Keyword Vocabulary,/resource/subjects/subject/@subjectScheme,ECL
8,ECL,DCITE,100004.xml,Last Name,/resource/creators/creator/creatorName,"Shaw, Alison"
9,ECL,DCITE,100004.xml,Middle Name,/resource/creators/creator/creatorName,"Shaw, Alison"


### Let's get rid of the columns we won't use for this analysis. The collection and dialect are already known once the organization and collection have been selected. The XPaths aren't needed either: they are the XPaths that were searched for, and in many cases they are relative rather than the actual location within the record itself. (This could now be changed to record the actual path, which I believe would be more useful.)

In [174]:
# Drop the columns that are redundant for this analysis.
# The name is re-bound instead of using inplace=True, and errors='ignore'
# skips columns that are already absent, so re-running this cell no longer
# raises a KeyError once the columns have been removed.
CollectionConceptsDF = CollectionConceptsDF.drop(
    ['Collection', 'Dialect', 'XPath'], axis=1, errors='ignore'
)
CollectionConceptsDF

Unnamed: 0,Record,Concept,Content
0,100004.xml,Abstract,"Analyses of volatile, major, and trace elements for a suite of glasses and melt inclusions from the 85 degrees E segment of the ultra-slow spreading Gakkel Ridge."
1,100004.xml,Bounding Box,Gakkel Ridge Arctic Ocean
2,100004.xml,Contributor Name,EarthChem Library (ECL)
3,100004.xml,Contributor Role,Editor
4,100004.xml,Distribution Format,application/vnd.ms-excel
5,100004.xml,First Name,"Shaw, Alison"
6,100004.xml,Keyword,Geochemistry
7,100004.xml,Keyword Vocabulary,ECL
8,100004.xml,Last Name,"Shaw, Alison"
9,100004.xml,Middle Name,"Shaw, Alison"


### Now that we have the data we want, what are the understood concepts that exist in the collection?

In [175]:
# Distinct concept names found in the Concept column of the collection.
ConceptVerticals = CollectionConceptsDF['Concept'].unique()
# Converted to a plain Python list for use as the concept dropdown options.
Verticals = ConceptVerticals.tolist()
Verticals

['Abstract',
 'Bounding Box',
 'Contributor Name',
 'Contributor Role',
 'Distribution Format',
 'First Name',
 'Keyword',
 'Keyword Vocabulary',
 'Last Name',
 'Middle Name',
 'Publication Date',
 'Publisher',
 'Related Resource Identifier',
 'Resource Creation/Revision Date',
 'Resource Format',
 'Resource Identifier',
 'Resource Identifier Type',
 'Resource Language',
 'Resource Title',
 'Resource Type',
 'Resource Version',
 'Responsible Party Identifier',
 'Responsible Party Identifier Type',
 'Rights',
 'Spatial Extent',
 'Theme Keyword',
 'Author / Originator Identifier']

### Create function that allows us to call up metadata vertical content for a concept

In [176]:
def ConceptVerticalTable(Concept):
    """Select every row of the collection that belongs to one concept.

    Reads the global ``CollectionConceptsDF``, keeps the rows whose
    ``Concept`` column equals ``Concept``, stores the result in the global
    ``VerticalTable`` and returns it.

    Parameters
    ----------
    Concept : str
        Concept name to match against the ``Concept`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    global VerticalTable
    matches_concept = CollectionConceptsDF['Concept'] == Concept
    VerticalTable = CollectionConceptsDF.loc[matches_concept]
    return VerticalTable

### Create a dropdown using the function that allows us to create a dataframe of the concept you want as a metadata vertical.

In [177]:
# Build the concept selector; every selection calls ConceptVerticalTable
# with the chosen entry of Verticals.
widgets.interact(ConceptVerticalTable, Concept=Verticals)

Unnamed: 0,Record,Concept,Content
6,100004.xml,Keyword,Geochemistry
37,100031.xml,Keyword,Chemistry:Fluid
68,100032.xml,Keyword,Geochemistry
97,100033.xml,Keyword,Geochemistry
125,100034.xml,Keyword,Geochemistry
153,100035.xml,Keyword,Geochemistry
182,100036.xml,Keyword,Geochemistry
222,100037.xml,Keyword,Geochemistry
270,100038.xml,Keyword,Geochemistry
297,100039.xml,Keyword,Geochemistry


### Let's group the unique values in the content column and count them up.

In [180]:
# Count how many rows of the selected vertical share each distinct Content value.
VerticalTable.groupby('Content').size()

Content
Chemistry                 10
Chemistry:Fluid           47
Chemistry:Gas             33
Chemistry:Rock            65
Chemistry:Sediment        20
Geochemistry              69
Geochronology              8
Kinetics                   2
Other                     38
Petrography                4
Petrology                  7
Petrology:Experimental     2
Petrology:Mineral          3
SampleInfo                13
SocialScience              1
dtype: int64

### Replace the colons in the values with periods so there are no Bokeh label errors, then use Bokeh to plot a bar chart of the unique values.

In [181]:
# Colons in the category values trigger Bokeh label errors, so replace them
# with periods before plotting.
data = VerticalTable.Content.str.replace(':','.')

# Bar chart keyed on the Content values (presumably one bar per distinct value
# showing its occurrence count — confirm against the bokeh.charts Bar defaults).
p = Bar(data, 'Content', title="Vertical Value Occurrence Count", legend=False)

# Also save the chart to bar.html; an existing file of that name is overwritten.
output_file("bar.html")

show(p)

INFO:bokeh.core.state:Session output file 'bar.html' already exists, will be overwritten.


