Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
#4 progress. Added code to convert MASC to CoNLL NER format. Still ne…
Browse files Browse the repository at this point in the history
…ed to round it out for tokenization, etc.
  • Loading branch information
jasonbaldridge committed Dec 31, 2012
1 parent 1b262da commit eb2c56b
Show file tree
Hide file tree
Showing 3 changed files with 227 additions and 5 deletions.
9 changes: 5 additions & 4 deletions bin/chalk
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ then
JAVA_MEM_FLAG=-Xmx4g
fi

#JARS="`find $CHALK_DIR/lib_managed/jars -name '*.jar' -print | tr '\n' ':'`"
#JARS="`find $HOME/.ivy2/ -name '*.jar' -print | tr '\n' ':'`"
MANAGED_JARS="`find $CHALK_DIR/lib_managed -name '*.jar' -print | tr '\n' ':'`"

SCALA_LIB="$HOME/.sbt/boot/scala-2.9.2/lib/scala-library.jar"

CP="$CHALK_DIR/target/classes:$SCALA_LIB:$CHALK_DIR/src/main/resources:$JARS:$CLASSPATH"
CP="$CHALK_DIR/target/classes:$SCALA_LIB:$CHALK_DIR/src/main/resources:$MANAGED_JARS"

JAVA="$JAVA_HOME/bin/java"
JAVA_COMMAND="$JAVA -classpath $CP -Dchalk.dir=$CHALK_DIR"
Expand All @@ -27,7 +28,7 @@ shift
help()
{
cat <<EOF
chalk 0.1 commands:
chalk 1.1.0 commands:
cli run the chalk command-line interface
run run the main method of a given class
Expand All @@ -47,7 +48,7 @@ case $CMD in
*) echo "Unrecognized command: $CMD"; help; exit 1;;
esac

$JAVA_COMMAND $CLASS ${1+"$@"}
$JAVA_COMMAND $CLASS ${1+"$@"}
(( EXIT_CODE += $? ))

exit $EXIT_CODE
Expand Down
10 changes: 9 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@ version := "1.1.0-SNAPSHOT"

organization := "com.jasonbaldridge"

scalaVersion := "2.10.0"
scalaVersion := "2.9.2"

crossPaths := false

retrieveManaged := true

resolvers ++= Seq(
"opennlp sourceforge repo" at "http://opennlp.sourceforge.net/maven2"
)

// Original OpenNLP dependencies
libraryDependencies ++= Seq(
"com.novocode" % "junit-interface" % "0.8" % "test->default",
"jwnl" % "jwnl" % "1.3.3" % "compile",
Expand All @@ -20,6 +23,11 @@ libraryDependencies ++= Seq(
"org.apache.uima" % "uimaj-core" % "2.3.1" % "provided"
)

// New dependencies
libraryDependencies ++= Seq(
"com.codecommit" % "anti-xml_2.9.1" % "0.3"
)

publishTo <<= version { v: String =>
val nexus = "https://oss.sonatype.org/"
if (v.trim.endsWith("SNAPSHOT"))
Expand Down
213 changes: 213 additions & 0 deletions src/main/scala/chalk/corpora/MascUtil.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package chalk.corpora

import com.codecommit.antixml._
import io.Codec

/**
* Convert native MASC xml into CONLL format for named entity recognition.
*
* @author jasonbaldridge
*/
object MascNer {

  import io.Source
  import java.io._
  import MascUtil._

  // Stand-in annotation used for every token that is not part of a named entity.
  lazy val outsideNe = MAnnotation("outside", "outside", "none", Map[String,String]())

  // MASC entity label -> CoNLL tag. Any label not listed here falls back to "O".
  lazy val nerLabelStandardizer = Map(
    "location" -> "LOC",
    "person" -> "PER",
    "org" -> "ORG",
    //"date" -> "DAT"
    "date" -> "MISC"
  ).withDefault(x=>"O")

  /**
   * Collects every MASC document under the directory given as the first
   * argument and writes the files "train", "dev" and "test" into the current
   * working directory.
   *
   * @param args args(0) is the root directory of the MASC data
   */
  def main(args: Array[String]) {
    if (args.isEmpty) {
      System.err.println("Usage: MascNer <masc-directory>")
      sys.exit(1)
    }
    val targets = collectTargets(new File(args(0)))

    // Get 3/5 for train, 1/5 for dev, and 1/5 for test
    val targetsAndIndices = targets.zipWithIndex
    def bucket(keep: Int => Boolean) =
      targetsAndIndices.filter(pair => keep(pair._2 % 5)).map(_._1)

    processSet("train", bucket(_ < 3))
    processSet("dev", bucket(_ == 3))
    processSet("test", bucket(_ == 4))
  }

  /**
   * Recursively finds all ".txt" files below `dir`.
   *
   * @param dir directory to search
   * @return one (containing directory, file name without ".txt") pair per file
   */
  def collectTargets(dir: File): Seq[(File,String)] = {
    // listFiles returns null when `dir` cannot be listed and makes no promise
    // about ordering; sort by name so the train/dev/test split is reproducible.
    val files = Option(dir.listFiles).map(_.toSeq).getOrElse(Seq.empty[File]).sortBy(_.getName)
    val filesInDir = files
      .filter(_.getName.endsWith(".txt"))
      .map(file => (dir, file.getName.dropRight(4)))

    filesInDir ++ files.filter(_.isDirectory).flatMap(collectTargets)
  }

  /**
   * Converts every target document and writes the result to the file
   * `outputName`: one "token pos pos ner" line per token and a blank line
   * after each sentence. Documents that fail to convert are reported on
   * stderr and skipped.
   *
   * @param outputName path of the file to create
   * @param targets    (directory, file prefix) pairs as produced by collectTargets
   */
  def processSet(outputName: String, targets: Seq[(File, String)]) {
    System.err.println("Creating " + outputName)
    // The input is decoded as UTF-8, so encode the output the same way rather
    // than relying on the platform default charset.
    val output = new BufferedWriter(
      new OutputStreamWriter(new FileOutputStream(outputName), "UTF-8"))
    try {
      for ((file, prefix) <- targets) {
        try {
          // Buffer the whole document first so that a failure part-way through
          // does not leave a truncated sentence in the output file.
          val document = new StringBuilder
          for ((toks, tags, ners, regions) <- processTarget(file, prefix)) {
            for (i <- 0 until toks.length) {
              val (tok, pos, ner, region) = (toks(i), tags(i), ners(i), regions(i))
              if (tok.exists(_.isSpaceChar)) {
                println("Weird token! '" + tok + "' " + file + "/" + prefix + ".txt:" + region.start + "-" + region.end)
              }
              document.append(tok + " " + pos + " " + pos + " " + ner + "\n")
            }
            document.append("\n")
          }
          output.write(document.toString)
          System.err.println("Success: " + file + "," + prefix)
        }
        catch {
          // Keep going on a bad document, but report why it failed. Errors
          // such as OutOfMemoryError are deliberately allowed to propagate.
          case e: Exception =>
            System.err.println("Failure: " + file + "," + prefix + " (" + e + ")")
        }
      }
    }
    finally {
      output.close()
    }
    System.err.println()
  }

  // Reads a whole file as UTF-8 text, closing the underlying source afterwards.
  private def readText(file: File): String = {
    val source = Source.fromFile(file)(Codec.UTF8)
    try source.mkString finally source.close()
  }

  // Parses a UTF-8 XML file, closing the underlying source afterwards.
  private def parseXml(file: File): Elem = {
    val source = Source.fromFile(file)(Codec.UTF8)
    try XML.fromSource(source) finally source.close()
  }

  /**
   * Reads the files `prefix`.txt, `prefix`-s.xml, `prefix`-seg.xml,
   * `prefix`-penn.xml and `prefix`-ne.xml from `dir` and groups the tokens
   * into sentences.
   *
   * @param dir    directory holding the document's files
   * @param prefix file name of the document without its ".txt" extension
   * @return one (tokens, POS tags, BIO entity tags, token regions) tuple per
   *         sentence; the four sequences of a tuple have the same length
   * @throws NoSuchElementException if the document has no sentence regions or
   *         a token lacks a POS annotation
   */
  def processTarget(dir: File, prefix: String): Seq[(Seq[String], Seq[String], Seq[String], Seq[MRegion])] = {

    def dirFile(suffix: String) = new File(dir, prefix + suffix)

    // Raw text
    val rawtext = readText(dirFile(".txt"))

    // Sentence information
    val sentenceRegions = getRegions(parseXml(dirFile("-s.xml"))).sorted
    if (sentenceRegions.isEmpty)
      throw new NoSuchElementException("no sentence regions found for " + prefix)

    // Basic segment information
    val segmentRegions = getRegions(parseXml(dirFile("-seg.xml"))).map(r => (r.id -> r)).toMap

    // POS information
    val pennXml = parseXml(dirFile("-penn.xml"))

    // A token spans from the start of its first segment to the end of its last.
    val tokenRegions = getNodes(pennXml).map { n =>
      val regions = n.targets.map(segmentRegions).sorted
      (n.id -> MRegion(n.id, regions.head.start, regions.last.end))
    }.toMap

    val tokens = tokenRegions.map { case (id, region) =>
      (id, rawtext.slice(region.start, region.end))
    }
    val posAnnotations = getAnnotations(pennXml).map(anno => (anno.ref -> anno)).toMap

    // NER information
    val neXml = parseXml(dirFile("-ne.xml"))
    val neAnnotations =
      getAnnotations(neXml).map(anno => (anno.ref -> anno)).toMap.withDefault(x=>outsideNe)

    val neEdges =
      getEdges(neXml).map(edge => (edge.to -> edge.from)).toMap.withDefault(x=>"outside")

    // A helper function for pulling out the information associated with a
    // subsequence of the tokens in the document.
    def orderedTokPosNer(
      orderedRegions: Seq[MRegion]
    ): Option[(Seq[String], Seq[String], Seq[String], Seq[MRegion])] = {
      if (orderedRegions.isEmpty) None
      else {
        val orderedTokens = orderedRegions.map(reg => tokens(reg.id))

        val (orderedPos, orderedNe) = orderedRegions.map { region =>
          val posAnno = posAnnotations(region.id)
          val neAnno = neAnnotations(neEdges(posAnno.ref))
          (getPos(posAnno), neAnno)
        }.unzip

        // Pair every annotation with its predecessor (the first token is
        // preceded by "outside") to decide between a B- and an I- prefix.
        val bioLabels = (outsideNe +: orderedNe).zip(orderedNe).map { case (prev, curr) =>
          val tag = nerLabelStandardizer(curr.label)
          // Labels unknown to the standardizer map to "O" and must stay a
          // bare "O" rather than becoming the invalid tags "B-O" / "I-O".
          if (tag == "O") tag
          else (if (prev.id != curr.id) "B-" else "I-") + tag
        }
        Some((orderedTokens, orderedPos, bioLabels, orderedRegions))
      }
    }

    // Insert the "missing" sentences. (Content not marked as a sentence,
    // but containing tokens.) Pairing the list with its own tail also works
    // for a document with exactly one sentence region.
    val paddedSentenceRegions: Seq[MRegion] =
      sentenceRegions.head +: sentenceRegions.zip(sentenceRegions.tail).flatMap {
        case (prev, curr) =>
          if (prev.end + 1 < curr.start)
            Seq(MRegion("", prev.end + 1, curr.start - 1), curr)
          else
            Seq(curr)
      }

    // Pull out the sequence of token, pos, and NE for each sentence.
    val allOrderedTokRegions = tokenRegions.values.toIndexedSeq.sorted
    val noSentences = Vector.empty[(Seq[String], Seq[String], Seq[String], Seq[MRegion])]

    val (_, allDataBySentence) = paddedSentenceRegions.foldLeft((0, noSentences)) {
      case ((index, collected), region) =>
        val found = allOrderedTokRegions.indexWhere(t => t.end > region.end, index)
        // -1 means no remaining token ends beyond this region, i.e. all of
        // them belong to it; without this the final sentence would be lost.
        val endIndex = if (found == -1) allOrderedTokRegions.length else found
        if (index == endIndex) (index, collected)
        else {
          val sentence = allOrderedTokRegions.slice(index, endIndex)
          (endIndex, collected ++ orderedTokPosNer(sentence).toList)
        }
    }

    allDataBySentence

  }

}

/**
* Simple objects and functions for working with MASC data.
*
* @author jasonbaldridge
*/
object MascUtil {

  // A graph node and the ids of the regions it links to.
  case class MNode(id: String, targets: Seq[String])

  // An annotation: its own id, its label, the id of the element it refers to,
  // and its features as name -> value.
  case class MAnnotation(id: String, label: String, ref: String, features: Map[String,String])

  // A directed edge between two element ids.
  case class MEdge(id: String, from: String, to: String)

  // A span of the raw text given by its two anchors; ordered by start offset.
  case class MRegion(id: String, start: Int, end: Int) extends Ordered[MRegion] {
    // Compare explicitly instead of subtracting: a difference can overflow
    // Int and flip the sign, which would invert the ordering.
    def compare(that: MRegion): Int =
      if (this.start < that.start) -1
      else if (this.start > that.start) 1
      else 0
  }

  // Qualified name of the xml:id attribute carried by the elements read below.
  val idQname = QName(Some("xml"),"id")

  /**
   * Extracts every "region" element of the document.
   *
   * The "anchors" attribute must hold exactly two space-separated integers;
   * anything else fails with a MatchError or NumberFormatException.
   */
  def getRegions(doc: Elem): Seq[MRegion] = (doc \\ "region").toSeq.map { rxml =>
    val Array(start, end) = rxml.attrs("anchors").split(" ")
    MRegion(rxml.attrs(idQname), start.toInt, end.toInt)
  }

  /**
   * Extracts every "node" element of the document together with the
   * space-separated "targets" of its first "link" child. A node without a
   * "link" child fails with a NoSuchElementException.
   */
  def getNodes(doc: Elem): Seq[MNode] = (doc \\ "node").toSeq.map { nxml =>
    val targets = (nxml \ "link").head.attrs("targets").split(" ").toSeq
    MNode(nxml.attrs(idQname), targets)
  }

  /** Extracts every "edge" element of the document. */
  def getEdges(doc: Elem): Seq[MEdge] = (doc \\ "edge").toSeq
    .map(exml => MEdge(exml.attrs(idQname), exml.attrs("from"), exml.attrs("to")))

  /**
   * Extracts every "a" (annotation) element of the document. Each nested "f"
   * element contributes one feature: its "name" attribute mapped to the
   * string form of its children.
   */
  def getAnnotations(doc: Elem): Seq[MAnnotation] = (doc \\ "a").toSeq.map { axml =>
    val features = (axml \\ "f").toSeq
      .map(fnode => (fnode.attrs("name") -> fnode.children.toString)).toMap
    MAnnotation(axml.attrs(idQname),axml.attrs("label"),axml.attrs("ref"), features)
  }

  // Have to go through some pains to make sure we get a POS for every token.
  // Precedence: the "msd" feature, then "URL" when "kind" is "urlAddress",
  // then the "categor" feature, and finally "UNK".
  def getPos(anno: MAnnotation): String = {
    val feats = anno.features
    feats.get("msd")
      .orElse(feats.get("kind").filter(_ == "urlAddress").map(kind => "URL"))
      .orElse(feats.get("categor"))
      .getOrElse("UNK")
  }

}

0 comments on commit eb2c56b

Please sign in to comment.