Skip to content

Commit

Permalink
MarcToXML - use MARC21 XML Schema to validate result
Browse files Browse the repository at this point in the history
  • Loading branch information
dazza-codes committed Jan 31, 2017
1 parent 3bface1 commit 07a7345
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 19 deletions.
19 changes: 11 additions & 8 deletions conversiontracerbullet/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -73,26 +73,28 @@

<!--COMPILE/RUNTIME-->

<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.marc4j/marc4j -->
<dependency>
<groupId>org.marc4j</groupId>
<artifactId>marc4j</artifactId>
<version>2.8.0</version>
</dependency>
<!-- Symphony is running Oracle 11.2 -->
<!-- http://www.oracle.com/technetwork/apps-tech/jdbc-112010-090769.html -->
<!-- https://mvnrepository.com/artifact/com.oracle/ojdbc -->
<dependency>
<groupId>com.oracle.jdbc</groupId>
<artifactId>ojdbc6</artifactId>
<version>11.2.0.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sf.saxon/Saxon-HE -->
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
Expand All @@ -101,29 +103,30 @@

<!--TESTS-->

<!-- https://mvnrepository.com/artifact/junit/junit -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.powermock/powermock-module-junit4 -->
<dependency>
<groupId>org.xmlunit</groupId>
<artifactId>xmlunit-core</artifactId>
<version>2.3.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.powermock</groupId>
<artifactId>powermock-module-junit4</artifactId>
<version>${powermock.version}</version>
<scope>test</scope>
</dependency>

<!-- https://mvnrepository.com/artifact/org.powermock/powermock-api-easymock -->
<dependency>
<groupId>org.powermock</groupId>
<artifactId>powermock-api-easymock</artifactId>
<version>${powermock.version}</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.easymock/easymock -->
<dependency>
<groupId>org.easymock</groupId>
<artifactId>easymock</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.stanford;

import org.apache.commons.io.FileUtils;
import org.junit.*;
import static org.junit.Assert.*;

Expand All @@ -8,10 +9,19 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

import org.marc4j.*;
import org.marc4j.marc.Record;
import org.xml.sax.SAXException;

import javax.xml.XMLConstants;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;
import javax.xml.validation.Validator;

/**
*
Expand All @@ -23,22 +33,27 @@ public class MarcToXMLTest {

// private String marcFileResource = "/sample_marc.mrc";
private String marcFileResource = "/one_record.mrc";

// MARC21 XML schema
// https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd
private String marcSchemaResource = "/MARC21slim.xsd";

private MarcStreamReader marcReader = null;
private Record marcRecord = null;
private String xmlOutputDir = null;
private Path xmlOutputPath = null;

@Before
public void setUp() throws IOException {
String marcFilePath = getClass().getResource(marcFileResource).getFile();
marcReader = new MarcStreamReader(new FileInputStream(marcFilePath));
assertTrue(marcReader.hasNext());
xmlOutputDir = Files.createTempDirectory("MarcToXMLTest").toString();
MarcToXML.setXmlOutputPath(xmlOutputDir);
xmlOutputPath = Files.createTempDirectory("MarcToXMLTest");
MarcToXML.setXmlOutputPath(xmlOutputPath.toString());
}

/**
 * Deletes the temporary output directory that setUp created for this test.
 *
 * @throws IOException if the directory cannot be deleted
 */
@After
public void tearDown() throws IOException {
    // JUnit runs @After methods even when @Before throws, so the path may
    // never have been assigned; skip cleanup rather than mask the real failure.
    if (xmlOutputPath != null) {
        FileUtils.deleteDirectory(xmlOutputPath.toFile());
    }
}

@Ignore("This does not yet test anything in MarcToXML")
Expand All @@ -53,20 +68,23 @@ public void convertRecordTest() throws FileNotFoundException {
assertTrue(marcReader.hasNext());
marcRecord = marcReader.next();
MarcToXML.convertMarcRecord(marcRecord);
String outputFile = MarcToXML.marcRecordFileName(marcRecord);
File file = new File(outputFile);
String marcXmlFilePath = MarcToXML.marcRecordFileName(marcRecord);
File file = new File(marcXmlFilePath);
assertTrue(file.exists());

// TODO: read in the one_record.xml file
// TODO: use XMLUnit to check the output file has the same content
// TODO: see http://www.xmlunit.org/
assertTrue(marcXmlValid(marcXmlFilePath));
}

// TODO: read in the one_record.xml file
// TODO: use XMLUnit to check the output file has the same content
// TODO: see http://www.xmlunit.org/

// TODO: use a test MARC record that requires AuthDB access to resolve URIs?

@Test
public void marcRecordFileNameTest() {
marcRecord = marcReader.next();
String result = MarcToXML.marcRecordFileName(marcRecord);
assertTrue(result.contains(xmlOutputDir));
assertTrue(result.contains(xmlOutputPath.toString()));
String cn = marcRecord.getControlNumber();
assertTrue(result.contains(cn));
String fmt = ".xml";
Expand All @@ -78,4 +96,29 @@ public void marcRecordFileNameTest() {
// List cFields = marcRecord.getControlFields();
// List dFields = marcRecord.getDataFields();
// }

/**
 * Validates a MARC XML file with the validator returned by
 * {@link #marcXmlValidator()}.
 *
 * @param marcXmlFilePath path of the MARC XML file to validate
 * @return {@code true} when validation succeeds; {@code false} when the
 *         validator rejects the document, the schema cannot be parsed, or
 *         the file cannot be read
 */
private boolean marcXmlValid(String marcXmlFilePath) {
    try {
        Source xmlSource = new StreamSource(new File(marcXmlFilePath));
        marcXmlValidator().validate(xmlSource);
        return true;
    } catch (SAXException | IOException e) {
        // Only validation and I/O failures mean "invalid"; unexpected runtime
        // errors propagate so they are not mistaken for an invalid document.
        // Report the cause, otherwise the failing assertion gives no diagnosis.
        System.err.println("MARC XML validation failed for " + marcXmlFilePath + ": " + e.getMessage());
        return false;
    }
}

private Validator marcXmlValidator = null;

/**
 * Returns the schema validator, building it on first use from the classpath
 * resource named by {@code marcSchemaResource} and reusing the cached
 * instance on later calls.
 *
 * @return validator created from the schema resource
 * @throws SAXException if the schema cannot be parsed
 * @throws IllegalStateException if the schema resource is not found
 */
private Validator marcXmlValidator() throws SAXException {
    if (marcXmlValidator == null) {
        java.net.URL schemaUrl = getClass().getResource(marcSchemaResource);
        if (schemaUrl == null) {
            throw new IllegalStateException("Schema resource not found: " + marcSchemaResource);
        }
        SchemaFactory schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI);
        // Load from the URL itself: URL.getFile() keeps percent-encoding, so a
        // File built from it fails when the path contains spaces or similar.
        Schema schema = schemaFactory.newSchema(schemaUrl);
        marcXmlValidator = schema.newValidator();
    }
    return marcXmlValidator;
}
}
150 changes: 150 additions & 0 deletions conversiontracerbullet/src/test/resources/MARC21slim.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
<?xml version="1.0"?>
<xsd:schema targetNamespace="http://www.loc.gov/MARC21/slim" xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsd="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="unqualified" version="1.1" xml:lang="en">
<xsd:annotation>
<xsd:documentation>
MARCXML: The MARC 21 XML Schema
Prepared by Corey Keith

May 21, 2002 - Version 1.0 - Initial Release

**********************************************
Changes.

August 4, 2003 - Version 1.1 -
Removed import of xml namespace and the use of xml:space="preserve" attributes on the leader and controlfields.
Whitespace preservation in these subfields is accomplished by the use of xsd:whiteSpace value="preserve"

May 21, 2009 - Version 1.2 -
in subfieldcodeDataType the pattern
"[\da-z!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"
changed to:
"[\dA-Za-z!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"
i.e "A-Z" added after "[\d" before "a-z" to allow upper case. This change is for consistency with the documentation.

************************************************************
This schema supports XML markup of MARC21 records as specified in the MARC documentation (see www.loc.gov). It allows tags with
alphabetics and subfield codes that are symbols, neither of which are as yet used in the MARC 21 communications formats, but are
allowed by MARC 21 for local data. The schema accommodates all types of MARC 21 records: bibliographic, holdings, bibliographic
with embedded holdings, authority, classification, and community information.
</xsd:documentation>
</xsd:annotation>
<xsd:element name="record" type="recordType" nillable="true" id="record.e">
<xsd:annotation>
<xsd:documentation>record is a top level container element for all of the field elements which compose the record</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="collection" type="collectionType" nillable="true" id="collection.e">
<xsd:annotation>
<xsd:documentation>collection is a top level container element for 0 or many records</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:complexType name="collectionType" id="collection.ct">
<xsd:sequence minOccurs="0" maxOccurs="unbounded">
<xsd:element ref="record"/>
</xsd:sequence>
<xsd:attribute name="id" type="idDataType" use="optional"/>
</xsd:complexType>
<xsd:complexType name="recordType" id="record.ct">
<xsd:sequence minOccurs="0">
<xsd:element name="leader" type="leaderFieldType"/>
<xsd:element name="controlfield" type="controlFieldType" minOccurs="0" maxOccurs="unbounded"/>
<xsd:element name="datafield" type="dataFieldType" minOccurs="0" maxOccurs="unbounded"/>
</xsd:sequence>
<xsd:attribute name="type" type="recordTypeType" use="optional"/>
<xsd:attribute name="id" type="idDataType" use="optional"/>
</xsd:complexType>
<xsd:simpleType name="recordTypeType" id="type.st">
<xsd:restriction base="xsd:NMTOKEN">
<xsd:enumeration value="Bibliographic"/>
<xsd:enumeration value="Authority"/>
<xsd:enumeration value="Holdings"/>
<xsd:enumeration value="Classification"/>
<xsd:enumeration value="Community"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:complexType name="leaderFieldType" id="leader.ct">
<xsd:annotation>
<xsd:documentation>MARC21 Leader, 24 bytes</xsd:documentation>
</xsd:annotation>
<xsd:simpleContent>
<xsd:extension base="leaderDataType">
<xsd:attribute name="id" type="idDataType" use="optional"/>
</xsd:extension>
</xsd:simpleContent>
</xsd:complexType>
<xsd:simpleType name="leaderDataType" id="leader.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
<xsd:pattern value="[\d ]{5}[\dA-Za-z ]{1}[\dA-Za-z]{1}[\dA-Za-z ]{3}(2| )(2| )[\d ]{5}[\dA-Za-z ]{3}(4500| )"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:complexType name="controlFieldType" id="controlfield.ct">
<xsd:annotation>
<xsd:documentation>MARC21 Fields 001-009</xsd:documentation>
</xsd:annotation>
<xsd:simpleContent>
<xsd:extension base="controlDataType">
<xsd:attribute name="id" type="idDataType" use="optional"/>
<xsd:attribute name="tag" type="controltagDataType" use="required"/>
</xsd:extension>
</xsd:simpleContent>
</xsd:complexType>
<xsd:simpleType name="controlDataType" id="controlfield.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:simpleType name="controltagDataType" id="controltag.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
<xsd:pattern value="00[1-9A-Za-z]{1}"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:complexType name="dataFieldType" id="datafield.ct">
<xsd:annotation>
<xsd:documentation>MARC21 Variable Data Fields 010-999</xsd:documentation>
</xsd:annotation>
<xsd:sequence maxOccurs="unbounded">
<xsd:element name="subfield" type="subfieldatafieldType"/>
</xsd:sequence>
<xsd:attribute name="id" type="idDataType" use="optional"/>
<xsd:attribute name="tag" type="tagDataType" use="required"/>
<xsd:attribute name="ind1" type="indicatorDataType" use="required"/>
<xsd:attribute name="ind2" type="indicatorDataType" use="required"/>
</xsd:complexType>
<xsd:simpleType name="tagDataType" id="tag.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
<xsd:pattern value="(0([1-9A-Z][0-9A-Z])|0([1-9a-z][0-9a-z]))|(([1-9A-Z][0-9A-Z]{2})|([1-9a-z][0-9a-z]{2}))"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:simpleType name="indicatorDataType" id="ind.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
<xsd:pattern value="[\da-z ]{1}"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:complexType name="subfieldatafieldType" id="subfield.ct">
<xsd:simpleContent>
<xsd:extension base="subfieldDataType">
<xsd:attribute name="id" type="idDataType" use="optional"/>
<xsd:attribute name="code" type="subfieldcodeDataType" use="required"/>
</xsd:extension>
</xsd:simpleContent>
</xsd:complexType>
<xsd:simpleType name="subfieldDataType" id="subfield.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
</xsd:restriction>
</xsd:simpleType>
<xsd:simpleType name="subfieldcodeDataType" id="code.st">
<xsd:restriction base="xsd:string">
<xsd:whiteSpace value="preserve"/>
<xsd:pattern value="[\dA-Za-z!&quot;#$%&amp;'()*+,-./:;&lt;=&gt;?{}_^`~\[\]\\]{1}"/>
<!-- "A-Z" added after "\d" May 21, 2009 -->
</xsd:restriction>
</xsd:simpleType>
<xsd:simpleType name="idDataType" id="id.st">
<xsd:restriction base="xsd:ID"/>
</xsd:simpleType>
</xsd:schema>

0 comments on commit 07a7345

Please sign in to comment.