Skip to content

Commit

Permalink
Merge pull request #15 from anusharanganathan/feature_index_data
Browse files Browse the repository at this point in the history
Feature index data
  • Loading branch information
Jessie Keck committed Jan 7, 2016
2 parents 441dc08 + 51b9637 commit 3d901db
Show file tree
Hide file tree
Showing 31 changed files with 26,467 additions and 351 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ config/environments/*.local.yml

# Ignore coverage directory
coverage

#Ignore all data files
data/*
!data/.keep
4 changes: 4 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ group :development do
gem 'spring'
end

group :test do
gem 'webmock'
end

gem 'blacklight', '>= 5.3.0'
gem 'jettywrapper', '>= 2.0'
gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw]
Expand Down
10 changes: 10 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ GEM
minitest (~> 5.1)
thread_safe (~> 0.3, >= 0.3.4)
tzinfo (~> 1.1)
addressable (2.4.0)
arel (6.0.3)
autoprefixer-rails (6.1.0.1)
execjs
Expand Down Expand Up @@ -110,6 +111,8 @@ GEM
term-ansicolor (~> 1.3)
thor (~> 0.19.1)
tins (~> 1.6.0)
crack (0.4.3)
safe_yaml (~> 1.0.0)
debug_inspector (0.0.2)
deep_merge (1.0.1)
deprecation (0.2.2)
Expand All @@ -134,6 +137,7 @@ GEM
ffi (1.9.10)
globalid (0.3.6)
activesupport (>= 4.1.0)
hashdiff (0.2.3)
http-cookie (1.0.2)
domain_name (~> 0.5)
i18n (0.7.0)
Expand Down Expand Up @@ -248,6 +252,7 @@ GEM
rspec-expectations (~> 2.14.0)
rspec-mocks (~> 2.14.0)
rubyzip (1.1.7)
safe_yaml (1.0.4)
sass (3.4.19)
sass-rails (5.0.4)
railties (>= 4.0.0, < 5.0)
Expand Down Expand Up @@ -304,6 +309,10 @@ GEM
binding_of_caller (>= 0.7.2)
railties (>= 4.0)
sprockets-rails (>= 2.0, < 4.0)
webmock (1.22.3)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff
websocket-driver (0.6.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.2)
Expand Down Expand Up @@ -346,6 +355,7 @@ DEPENDENCIES
tzinfo-data
uglifier (>= 1.3.0)
web-console (~> 2.0)
webmock

BUNDLED WITH
1.10.6
27 changes: 16 additions & 11 deletions app/models/concerns/annotation_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@ module AnnotationData
include JsonReader
extend ActiveSupport::Concern

# attr_accessor :annotation_url, :annotation_list

# Fetch the annotation list found at +url+ through JsonReader::Reader#from_url
# and remember it in @annotation_list.
# Params:
# +url+:: location of the annotation list; when nil nothing is fetched
# Returns the value handed back by the reader, or nil when no url is given.
def read_annotation(url = nil)
  @annotation_list = JsonReader::Reader.new.from_url(url) if url
end

def motivation_for_annotations
Expand All @@ -18,14 +15,21 @@ def motivation_for_transcriptions
'sc:painting'
end

# Pull the 'resources' entry out of a parsed annotation list.
# Params:
# +annotation_list+:: hash-like annotation list, or nil
# Returns an empty array when the list is missing or has no 'resources' key,
# otherwise whatever is stored under 'resources'.
def resources(annotation_list = nil)
  has_resources = annotation_list && annotation_list.key?('resources')
  has_resources ? annotation_list['resources'] : []
end

def annotations(annotation_list = nil)
# the motivation for annotations will be: "oa:commenting"
# return [] unless self[:annotation_url]
# self.read_annotation unless self[:annotation_list]
# return [] unless self[:annotation_list]
# return self[:annotation_list][:resources].select {|anno| anno["motivation"] == "oa:commenting" }
return [] unless annotation_list
annotation_list['resources'].select { |anno| anno['motivation'] == motivation_for_annotations }
al = resources(annotation_list)
return [] unless al
al.select { |anno| anno['motivation'] == motivation_for_annotations }
end

def transcriptions(annotation_list = nil)
Expand All @@ -34,8 +38,9 @@ def transcriptions(annotation_list = nil)
# self.read_annotation unless self[:annotation_list]
# return [] unless self[:annotation_list]
# return self[:annotation_list][:resources].select {|anno| anno["motivation"] == "sc:painting" }
return [] unless annotation_list
annotation_list['resources'].select { |anno| anno['motivation'] == motivation_for_transcriptions }
al = resources(annotation_list)
return [] unless al
al.select { |anno| anno['motivation'] == motivation_for_transcriptions }
end

def map_annotation(annotation = nil)
Expand All @@ -61,17 +66,17 @@ def map_annotation(annotation = nil)

def annotation_to_solr(data = {})
# data.keys = [:annotation, :manuscript, :folio, :url]
return {} unless data['annotation']
return {} unless data.key?('annotation') || data['annotation']
anno = map_annotation(data['annotation'])
return {} unless anno['id']
solr_doc = {}
solr_doc['id'] = anno['id']
solr_doc['druid'] = self['druid']
solr_doc['url_sfx'] = data['url']
solr_doc['manifest_urls'] = self['iiif_manifest']
solr_doc['collection'] = self['collection']
solr_doc['folio'] = data['folio']
solr_doc['manuscript_search'] = data['manuscript']
solr_doc['url_sfx'] = data['url'] if data.key?('url')
solr_doc['folio'] = data['folio'] if data.key?('folio')
solr_doc['manuscript_search'] = data['manuscript'] if data.key?('manuscript')
solr_doc['model'] = anno['model']
solr_doc['motivation'] = anno['motivation']
solr_doc['target_url'] = anno['target_url']
Expand Down
8 changes: 7 additions & 1 deletion app/models/concerns/iiif_manifest_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def fetch_modsxml
uri = URI.parse(url)
uri.scheme = 'https'
require 'open-uri'
self.modsxml = open(uri.to_s).read
begin
self.modsxml = open(uri.to_s).read
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, OpenURI::HTTPError => the_error
puts "\nOpen URI error for #{uri}\n\t#{the_error.message}" # TODO: Add to log
return nil
end
end
end
3 changes: 2 additions & 1 deletion app/models/concerns/json_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def from_url(url, _encoding = nil)
require 'open-uri'
begin
JSON.parse(open(url).read)
rescue OpenURI::HTTPError => the_error
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT, EOFError,
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError, OpenURI::HTTPError => the_error
puts "\nOpen URI error for #{url}\n\t#{the_error.message}" # TODO: Add to log
return nil
end
Expand Down
2 changes: 2 additions & 0 deletions config/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,7 @@ class Application < Rails::Application

# Do not swallow errors in after_commit/after_rollback callbacks.
config.active_record.raise_in_transactional_callbacks = true

WebMock.disable! if Rails.env.test?
end
end
4 changes: 4 additions & 0 deletions data/test_manifest_urls.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
http://dms-data.stanford.edu/data/manifests/Stanford/kq131cs7229/manifest.json
http://dms-data.stanford.edu/data/manifests/BnF/jr903ng8662/manifest.json
http://dms-data.stanford.edu/data/manifests/Parker/fh878gz0315/manifest.json
http://dms-data.stanford.edu/data/manifests/Parker/ft757ht3699/manifest.json
113 changes: 113 additions & 0 deletions lib/data_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
require 'csv'
class DataIndexer
  # Index mods and annotations data from a list of manifest urls or from one url
  # Params:
  # +collection+:: Name of collection the manifest(s) belong to
  # +csv_file+:: string containing the path to the csv file.
  #              File to have one url per line and no header
  # +manifest_url+:: url to the manifest file
  # Usage:
  #   DataIndexer.new('collection_name', 'file_path').run
  #     to index csv file
  #   DataIndexer.new('collection_name', nil, 'url').run
  #     to index one manifest at url
  def initialize(collection = nil, csv_file = nil, manifest_url = nil)
    @collection = collection
    @csv_file = csv_file
    @url = manifest_url
    @manifest = nil
    @title = nil
    @doc = SolrDocument.new
    @solr = Blacklight.default_index.connection
  end

  # Index and commit mods and annotations data.
  # An existing csv file takes precedence; otherwise the single manifest url
  # is used. Nothing is indexed or committed when neither is available.
  def run
    if @csv_file.present? && File.exist?(@csv_file)
      index_csv
      commit
    elsif @url
      index
      commit
    end
  end

  # Index mods and annotations data from a list of manifest urls
  # csv file to contain one url per line and no header
  def index_csv
    return unless @csv_file.present? && File.exist?(@csv_file)
    CSV.foreach(@csv_file) do |row|
      # Only the first column is read; @url is overwritten for every row
      @url = row[0]
      index
    end
  end

  # Index MODS and annotation lists fetched from the IIIF manifest url.
  # The manifest is skipped when define_doc reports it as unusable.
  def index
    return unless @url.present?
    fetch_manifest
    if define_doc
      index_mods
      index_annotations
    end
  end

  # Commit the data indexed in solr
  def commit
    @solr.commit
  end

  protected

  # Get the manifest data for the current url
  def fetch_manifest
    @manifest = IiifManifest.new(@url)
    @manifest.read_manifest
  end

  # Copy the collection and manifest details onto the working document.
  # Returns true when the manifest has both a title and a druid,
  # false otherwise (in which case the document is left untouched).
  def define_doc
    unless @manifest.title.blank? || @manifest.druid.blank?
      @doc[:collection] = @collection
      @doc[:druid] = @manifest.druid
      @doc[:iiif_manifest] = @url
      @doc[:mods_url] = @manifest.mods_url
      @doc[:modsxml] = @manifest.fetch_modsxml
      return true
    end
    false
  end

  # index mods data in solr
  # Also records the title used as the manuscript name of the annotations.
  def index_mods
    # Forget the title of the previously indexed manifest: when this manifest
    # yields no mods document (or one without a title) its annotations must
    # not be filed under another manuscript's name.
    @title = nil
    solr_doc = @doc.mods_to_solr
    unless solr_doc.blank?
      @title = solr_doc['title_search'] if solr_doc.key?('title_search')
      @solr.add solr_doc
    end
  end

  # index all of the annotations data in solr
  # Returns [annotation lists read, annotations seen, annotations added to solr]
  def index_annotations
    list_count = 0
    doc_count = 0
    add_count = 0
    # Array() guards against a manifest that reports no annotation lists as nil
    # NOTE(review): IiifManifest#annotation_lists is not visible here - confirm
    Array(@manifest.annotation_lists).each do |al|
      annotation_list = @doc.read_annotation(al['@id'])
      @doc.resources(annotation_list).each do |a|
        data = { "annotation" => a, "manuscript" => @title, "folio" => al['label'], "url" => al['@id'] }
        solr_doc = @doc.annotation_to_solr(data)
        unless solr_doc.blank?
          @solr.add solr_doc
          add_count += 1
        end
        doc_count += 1
      end
      list_count += 1
    end
    [list_count, doc_count, add_count]
  end
end
6 changes: 6 additions & 0 deletions lib/tasks/colligo.rake
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
require 'jettywrapper'
require 'data_indexer'
namespace :colligo do
desc 'Run Colligo local installation steps'
task install: [:environment] do
  # Setup steps, invoked one after another in this order
  install_steps = [
    'db:migrate',
    'colligo:download_and_unzip_jetty',
    'colligo:copy_solr_configs',
    'colligo:fixtures'
  ]
  install_steps.each { |step| Rake::Task[step].invoke }
end
desc 'Download and unzip jetty'
task :download_and_unzip_jetty do
Expand All @@ -21,4 +23,8 @@ namespace :colligo do
cp "#{Rails.root}/config/solr_configs/#{file}.xml", "#{Rails.root}/jetty/solr/blacklight-core/conf/"
end
end
# Needs the Rails environment: DataIndexer builds a SolrDocument and a
# Blacklight solr connection, which are only loaded with the application.
# The csv path is anchored at Rails.root so the task does not depend on the
# directory rake is started from (DataIndexer#run does nothing when the file
# cannot be found).
desc 'Index test fixtures'
task fixtures: [:environment] do
  DataIndexer.new('Test collection', Rails.root.join('data', 'test_manifest_urls.csv').to_s).run
end
end
Loading

0 comments on commit 3d901db

Please sign in to comment.