Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
#4 resolved. Updated to extract training data better from MASC 3.0.0 final version.
Browse files (browse the repository at this point in the history)
  • Loading branch information
jasonbaldridge committed Mar 13, 2013
1 parent 5804edc commit 8e21355
Showing 1 changed file with 21 additions and 14 deletions.
35 changes: 21 additions & 14 deletions src/main/scala/chalk/corpora/MascUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -199,27 +199,34 @@ object MascFile {

// Insert the "missing" sentences. (Content not marked as a sentence,
// but containing tokens.)
val paddedSentenceRegionBuffer =
collection.mutable.ListBuffer[MRegion](sentenceRegions.head)

sentenceRegions.sliding(2).foreach {
case Seq(prev, curr) => {
if (prev.end + 1 < curr.start)
paddedSentenceRegionBuffer.append(MRegion("", prev.end + 1, curr.start - 1))
paddedSentenceRegionBuffer.append(curr)
}
}

val paddedSentenceRegions = paddedSentenceRegionBuffer.toSeq
//val paddedSentenceRegionBuffer =
// collection.mutable.ListBuffer[MRegion](sentenceRegions.head)
//
//sentenceRegions.sliding(2).foreach {
// case Seq(prev, curr) => {
// if (prev.end + 1 < curr.start)
// paddedSentenceRegionBuffer.append(MRegion("", prev.end + 1, curr.start - 1))
// paddedSentenceRegionBuffer.append(curr)
// }
//}
//
//val paddedSentenceRegions = paddedSentenceRegionBuffer.toSeq
val paddedSentenceRegions = sentenceRegions

// Pull out the sequence of token, pos, and NE for each sentence.
val allOrderedTokRegions = tokenRegions.values.toIndexedSeq.sorted
var index = 0
val allDataBySentence = paddedSentenceRegions.flatMap { region => {
val endIndex = allOrderedTokRegions.indexWhere(t=>t.end>region.end,index)
if (index == endIndex) None
//val startIndex = math.max(index, region.start)
val startIndex = math.max(index, allOrderedTokRegions.indexWhere(t=>t.start>=region.start,index))
//val startIndex = index
val endIndex = allOrderedTokRegions.indexWhere(t=>t.end>region.end,startIndex)
//println(region.start + " -- " + region.end)
//println(index + ": " + startIndex + " , " + endIndex)
if (startIndex == endIndex) None
else {
val sentence = allOrderedTokRegions.slice(index,endIndex)
val sentence = allOrderedTokRegions.slice(startIndex,endIndex)
index = endIndex
orderedTokPosNer(sentence)
}
Expand Down

0 comments on commit 8e21355

Please sign in to comment.