-
Notifications
You must be signed in to change notification settings - Fork 2
/
mjf_transform.rb
171 lines (132 loc) · 7.25 KB
/
mjf_transform.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Transforms Monterey Jazz Festival METS and MODS metadata into contentMetadata.xml and descMetadata.xml files in preparation for pre-assembly
# August 2012
# Peter Mangiafico
# run with 'ruby devel/mjf_transform.rb'
current_path = File.dirname(File.expand_path(__FILE__))
require 'rubygems'
require 'csv'
require 'nokogiri'
require 'assembly-objectfile'
require 'csv-mapper'
# ---- Operator-editable settings -------------------------------------------
# **** ADD THE INPUT FOLDER PATH HERE
content_path = '/sdr-ingest/mjf-tape-all'
# **** ADD THE MANIFEST FILE SUPPLIED BY SMPL STAFF HERE
csv_filename = File.join(current_path, 'smpl_csv', 'mjf_manifest.csv')
# where to write the new files (same as input)
output_path = content_path

# Names of the two metadata files generated per object; both names are also
# the list of stale files deleted before regeneration.
content_xml_filename = 'contentMetadata.xml'
descriptive_xml_filename = 'descMetadata.xml'
existing_files_to_remove = [content_xml_filename, descriptive_xml_filename]

# "type" attribute values used in the output content metadata
content_type_description = 'file'
resource_type_description = 'file'

# Echo the run configuration, followed by one blank separator line
puts "Input spreadsheet: #{csv_filename}",
     "Input folder: #{content_path}",
     "Output folder: #{output_path}",
     ''
# Deletes each named file from +folder+ when it is present, logging every deletion.
# Names that do not exist in +folder+ are skipped silently.
#
# @param folder [String] directory to clean
# @param existing_files_to_remove [Array<String>] bare filenames to look for inside +folder+
# @return [Array<String>] the list that was passed in (the result of +each+)
def file_cleanup(folder, existing_files_to_remove)
  existing_files_to_remove.each do |name|
    target = File.join(folder, name)
    next unless File.exist?(target)

    puts "...Deleting existing file #{target}"
    File.delete(target)
  end
end
# Main driver: for every row of the manifest CSV, regenerate contentMetadata.xml and
# descMetadata.xml inside that object's content folder, using the single XML file found there.
#
# Layout expected for each row, relative to content_path:
#   <row.folder>/data/library_stanford_edu*/<exactly one>.xml
#
# NOTE: the Dir.glob calls below use relative patterns, so each Dir.chdir is load-bearing.
Dir.chdir(content_path)

# read CSV; each row must respond to #druid and #folder
# (presumably derived from the manifest's header row by read_attributes_from_file -- confirm against csv-mapper docs)
@items = CsvMapper.import(csv_filename) do
  read_attributes_from_file
end

@items.each do |row|
  pid = row.druid
  folder = row.folder
  folder_path = File.join(content_path, folder)
  puts "Operating on #{folder}"
  file_cleanup(folder_path, existing_files_to_remove)

  # dig into data subfolder
  data_folder = File.join(folder_path, 'data')
  Dir.chdir(data_folder)

  # there should be exactly one folder in there starting with "library_stanford_edu"
  sub_folder = Dir.glob('library_stanford_edu*')
  unless sub_folder.size == 1
    puts "**** ERROR: Could not locate content folder within the '#{folder}/data' folder"
    next
  end

  content_sub_folder = File.join(data_folder, sub_folder.first)
  Dir.chdir(content_sub_folder)
  puts "...Found #{content_sub_folder}"
  # stale output must be removed BEFORE globbing for *.xml, otherwise a previous run's
  # contentMetadata.xml/descMetadata.xml would be counted as input candidates
  file_cleanup(content_sub_folder, existing_files_to_remove)

  # now find the XML file; anything other than exactly one is reported instead of silently skipped
  xml_file = Dir.glob('*.xml')
  unless xml_file.size == 1
    puts "**** ERROR: Expected exactly one XML file in '#{content_sub_folder}' but found #{xml_file.size}"
    next
  end

  input_xml_file = File.join(content_sub_folder, xml_file.first)
  puts "...Reading #{input_xml_file}"
  # block form guarantees the handle is closed even if parsing raises
  doc = File.open(input_xml_file) { |fh| Nokogiri::XML(fh) }

  # get mods element
  mods = doc.search('//mods:mods')

  # Clean up table of contents and notes that contain line breaks.
  # The newline must be written in double quotes: '\n' in single quotes is a literal
  # backslash followed by "n" and never matches a real line break.
  mods.search('//mods:tableOfContents').each { |node| node.content = node.content.tr("\n", ' ') }
  mods.search('//mods:note').each { |node| node.content = node.content.tr("\n", ' ') }

  # get start date and end date; if both exist and are the same, just remove the end date
  start_date = mods.search('//mods:dateCreated[@point="start"]')
  end_date = mods.search('//mods:dateCreated[@point="end"]')
  if !start_date.empty? && !end_date.empty? &&
     start_date.first.content.strip == end_date.first.content.strip
    end_date.each { |node| node.remove }
  end

  # lop off any <mods:relatedItem type="constituent"> nodes
  mods.search('//mods:relatedItem[@type="constituent"]').each { |related_item_node| related_item_node.remove }

  # find file elements we need to create contentMetadata
  filesec = doc.search('fileSec')
  filegroups = filesec.search('fileGrp')
  # the first fileSec's ID attribute, with underscores turned into spaces, becomes the resource label
  resource_label = filesec[0].attributes['ID'].value.tr('_', ' ')

  builder = Nokogiri::XML::Builder.new do |xml|
    xml.contentMetadata(:objectId => pid, :type => content_type_description) {
      xml.resource(:id => "#{pid}-1", :sequence => '1', :type => resource_type_description) {
        xml.label resource_label
        filegroups.each do |filegroup|
          file_type = filegroup.attributes['USE'].value.strip.downcase
          filegroup.search('file').each do |file|
            file_location = file.search('FLocat')
            # this is the relative path to the file in the original folder structure
            relative_path_to_file = file_location[0].attributes['href'].value.gsub('file:', '')
            # only files that actually exist on the original file system get a node in the contentMetadata
            next unless File.exist?(File.join(content_sub_folder, relative_path_to_file))

            # just the filename is used as the id, since the folder structure is flattened
            # when staging during pre-assembly
            xml_file_params = { :id => File.basename(relative_path_to_file) }
            mimetype = file.attributes['MIMETYPE']
            size = file.attributes['SIZE']
            xml_file_params[:mimetype] = mimetype.value unless mimetype.nil?
            xml_file_params[:size] = size.value unless size.nil?

            # all recognised file group uses get the same preserve-only treatment;
            # unrecognised uses get no preserve/publish/shelve attributes at all
            case file_type
            when 'archive masters', 'auxiliary application', 'service high'
              xml_file_params.merge!({ :preserve => 'yes', :publish => 'no', :shelve => 'no' })
            end

            xml.file(xml_file_params) {
              xml.checksum(file.attributes['CHECKSUM'].value, :type => 'md5')
            }
          end # loop over all files in a filegroup
        end # loop over all filegroups in a filesec node
      } # end builder resource node
      # add METs resource node pointing at the source XML file itself
      xml.resource(:id => "#{pid}-2", :sequence => '2', :type => 'object') {
        xml.label 'Descriptive Metadata'
        xml.file(:mimetype => 'application/tei+xml', :shelve => 'yes', :format => 'XML', :publish => 'yes', :preserve => 'yes', :id => xml_file.first)
      }
    } # end builder contentMetadata node
  end # end builder for output content metadata

  # output is written next to the source XML (output_path is currently only reported, not used)
  output_xml_directory = content_sub_folder

  # write output contentMetadata
  output_cm = File.join(output_xml_directory, content_xml_filename)
  File.open(output_cm, 'w') { |fh| fh.puts builder.to_xml }
  # core File.chmod is used so the script does not depend on 'fileutils' having been loaded
  File.chmod(0644, output_cm)
  puts "...Writing #{content_xml_filename}"

  # write output descMetadata, removing blank lines
  output_dm = File.join(output_xml_directory, descriptive_xml_filename)
  File.open(output_dm, 'w') { |fh| fh.puts mods.to_xml.gsub(/^\s*\n/, "") }
  File.chmod(0644, output_dm)
  puts "...Writing #{descriptive_xml_filename}"
end # end loop over all input rows in spreadsheet