Skip to content

Commit

Permalink
changed for sg
Browse files Browse the repository at this point in the history
  • Loading branch information
sagnik committed Jul 8, 2017
1 parent fa9d23f commit b1e1e43
Show file tree
Hide file tree
Showing 17 changed files with 158 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package edu.psu.sagnik.research.pdwriters.impl

import java.io.{ BufferedWriter, FileWriter }

import edu.psu.sagnik.research.pdsimplify.model.PDDocumentSimple
import edu.psu.sagnik.research.pdwriters.writers.json.CreateTextLinesJSON

/**
* Created by szr163 on 8/26/16.
Expand Down Expand Up @@ -40,21 +43,29 @@ object CreateResultsBatch {

var doc = document

doc = getExtractionResult(doc, pageNum, chars.map(_.bb), Color.BLUE)
//doc = getExtractionResult(doc, pageNum, chars.map(_.bb), Color.BLUE)

doc = getExtractionResult(doc, pageNum, words.map(_.bb), Color.GREEN)
//doc = getExtractionResult(doc, pageNum, words.map(_.bb), Color.GREEN)

doc = getExtractionResult(doc, pageNum, lines.map(_.bb), Color.RED)

doc = getExtractionResult(doc, pageNum, paragraphs.map(_.bb), Color.CYAN)
//doc = getExtractionResult(doc, pageNum, paragraphs.map(_.bb), Color.CYAN)

doc = getExtractionResult(doc, pageNum, rasters.map(_.bb), Color.MAGENTA)
//doc = getExtractionResult(doc, pageNum, rasters.map(_.bb), Color.MAGENTA)

doc = getExtractionResult(doc, pageNum, segments.map(_.bb), Color.PINK)
//doc = getExtractionResult(doc, pageNum, segments.map(_.bb), Color.PINK)

doc
}

/**
 * Renders `simplifiedDocument` with `CreateTextLinesJSON` and writes the result to `fileName`.
 *
 * The JSON is written as UTF-8 regardless of the platform default charset, and the writer is
 * always closed, even when the write fails.
 *
 * @param fileName path of the JSON file to create (an existing file is overwritten)
 * @param simplifiedDocument document whose text lines are serialized
 */
def writeTextLinesJson(fileName: String, simplifiedDocument: PDDocumentSimple): Unit = {
  import java.nio.charset.StandardCharsets
  import java.nio.file.{ Files, Paths }

  // Render before opening the file so a serialization failure does not leave a truncated file behind.
  val json = CreateTextLinesJSON(simplifiedDocument)
  val bw = Files.newBufferedWriter(Paths.get(fileName), StandardCharsets.UTF_8)
  try bw.write(json)
  finally bw.close()
  // Report success only after the content has been flushed and the file closed.
  println(s"JSON file written at: ${fileName}")
}

def getExtractionResult(document: PDDocument, pageNum: Int, bbs: List[RectangleOTL], c: Color): PDDocument = {
val page = document.getPage(pageNum)
CreateMarkedPDF.rectMarkedContent(document, page, bbs, c)
Expand All @@ -71,7 +82,8 @@ object CreateResultsBatch {
}

def main(args: Array[String]): Unit = {
val DEFAULT_LOC = "/home/szr163/Downloads/f706gsd1.pdf"
val DEFAULT_LOC = "/media/sagnik/OS_Install/data/pdfsamples/nuance/nuancepdf/img04192017_0002.pdf"
//"/home/szr163/Downloads/f706gsd1.pdf"
//"/Users/schoudhury/hassan/C10-2042.pdf"
val pdLoc = if (args.length > 1) args(0) else DEFAULT_LOC

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package edu.psu.sagnik.research.pdwriters.impl

import java.io.{ BufferedWriter, File, FileWriter }

import edu.psu.sagnik.research.pdsimplify.impl.ProcessDocument
import edu.psu.sagnik.research.pdsimplify.model.PDDocumentSimple
import edu.psu.sagnik.research.pdwriters.writers.json.CreateTextLinesJSON
import org.apache.pdfbox.pdmodel.PDDocument

/**
* Created by sagnik on 7/7/17.
*/

/**
 * Batch driver: walks a directory tree, and for every file whose name matches "pdf" writes a
 * copy with the extraction results marked on each page (`<name>-marked.pdf`).
 */
object CreateResultsBatchFiles {

  /**
   * Renders `simplifiedDocument` with `CreateTextLinesJSON` and writes the result to `fileName`.
   *
   * The JSON is written as UTF-8 regardless of the platform default charset, and the writer is
   * always closed, even when the write fails.
   *
   * @param fileName path of the JSON file to create (an existing file is overwritten)
   * @param simplifiedDocument document whose text lines are serialized
   */
  def writeTextLinesJson(fileName: String, simplifiedDocument: PDDocumentSimple): Unit = {
    import java.nio.charset.StandardCharsets
    import java.nio.file.{ Files, Paths }

    // Render before opening the file so a serialization failure does not leave a truncated file behind.
    val json = CreateTextLinesJSON(simplifiedDocument)
    val bw = Files.newBufferedWriter(Paths.get(fileName), StandardCharsets.UTF_8)
    try bw.write(json)
    finally bw.close()
    // Report success only after the content has been flushed and the file closed.
    println(s"JSON file written at: ${fileName}")
  }

  import scala.util.matching.Regex

  /**
   * Recursively collects every entry below `f` whose name contains a match for `r`.
   *
   * Matching directories are returned as well as matching files; callers that only want
   * regular files must filter the result.
   *
   * @param f directory to start from
   * @param r pattern searched for in each entry's name (not the full path)
   * @return matching entries of `f` followed by the matches found in its sub-directories;
   *         empty when `f` is not a directory or cannot be read
   */
  def recursiveListFiles(f: File, r: Regex): Array[File] = {
    // File.listFiles returns null for non-directories and on I/O errors.
    val entries = Option(f.listFiles).getOrElse(Array.empty[File])
    val matching = entries.filter(entry => r.findFirstIn(entry.getName).isDefined)
    matching ++ entries.filter(_.isDirectory).flatMap(recursiveListFiles(_, r))
  }

  /*
  Disabled alternative entry point: writes the text-line JSON next to each PDF instead of a
  marked PDF.

  def main(args: Array[String]): Unit = {
    val DEFAULT_DIR = "/media/sagnik/OS_Install/data/pdfsamples/nuance/nuancepdf/"
    //"/home/szr163/Downloads/f706gsd1.pdf"
    //"/Users/schoudhury/hassan/C10-2042.pdf"
    val pdDir = if (args.length > 1) args(0) else DEFAULT_DIR
    val docs = recursiveListFiles(new File(pdDir), "pdf".r).map(_.getAbsolutePath)
    docs.foreach { x =>
      val document = PDDocument.load(new File(x))
      val jsonLoc = s"${x.substring(0, x.length - 3)}json"
      val simplifiedDocument = ProcessDocument(document)
      writeTextLinesJson(jsonLoc, simplifiedDocument)
      document.close()
    }
  }
  */

  /**
   * Marks the extraction results on every page of the PDF at `pdfPath` and saves the outcome
   * next to it as `<pdfPath without its last four characters>-marked.pdf`.
   */
  private def writeMarkedDocument(pdfPath: String): Unit = {
    // Tracked in a var because createMarkedResult hands back the document to continue with.
    var document = PDDocument.load(new File(pdfPath))
    try {
      val simplifiedDocument = ProcessDocument(document)
      (0 until simplifiedDocument.pages.size).foreach { pageNum =>
        document = CreateResultsBatch.createMarkedResult(
          document,
          simplifiedDocument,
          pageNum = pageNum,
          pdLoc = pdfPath
        )
      }
      val markedLoc = s"${pdfPath.dropRight(4)}-marked.pdf"
      document.save(markedLoc)
      println(s"[marked document written to]: ${markedLoc}")
    } finally {
      // Close exactly once, also when processing or saving fails.
      document.close()
    }
  }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` is the directory to scan, otherwise a built-in default is used
   */
  def main(args: Array[String]): Unit = {
    val defaultDir = "/media/sagnik/OS_Install/data/pdfsamples/nuance/nuancepdf/"
    // A single argument is enough to override the default directory.
    val pdDir = args.headOption.getOrElse(defaultDir)

    recursiveListFiles(new File(pdDir), "pdf".r)
      .filter(_.isFile) // a directory whose name contains "pdf" is not a document
      .map(_.getAbsolutePath)
      .foreach(writeMarkedDocument)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ object ShowResults {
/**
 * Loads the PDF at `pdLoc`, looks up page `pageNum`, and passes the page together with the
 * bounding boxes `bbs`, the color `c` and `qualifier` to the marked-output writers.
 *
 * NOTE(review): both a CreateMarkedPNG and a CreateMarkedPDF call appear below; this looks like
 * a diff view showing the removed and the added line together -- confirm which call is current.
 * NOTE(review): `document` is not closed in this method; verify whether the writers close it.
 */
def printExtractionResult(pdLoc: String, pageNum: Int, bbs: List[RectangleOTL], c: Color, qualifier: String) = {
val document = PDDocument.load(new File(pdLoc))
val page = document.getPage(pageNum)
CreateMarkedPNG(pdLoc, document, pageNum, page, bbs, c, qualifier)
CreateMarkedPDF(pdLoc, document, pageNum, page, bbs, c, qualifier)
// The log message drops the last character of `qualifier` (presumably a plural "s" -- TODO confirm);
// an empty `qualifier` would make `substring` throw here.
logger.fine(s"created ${qualifier.substring(0, qualifier.length - 1)} marked PDF")
}

def main(args: Array[String]): Unit = {
val DEFAULT_LOC = "src/test/resources/test1.pdf"
val DEFAULT_LOC =
"src/test/resources/test1.pdf"
//"/Users/schoudhury/hassan/C10-2042.pdf"
val DEFAULT_PAGE_NUM = 5

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ object CreateJSON {
("content" -> pp.content)
)
})

)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package edu.psu.sagnik.research.pdwriters.writers.json

import java.awt.geom.Point2D

import edu.psu.sagnik.research.data.RectangleOTL
import edu.psu.sagnik.research.pdsimplify.model.PDDocumentSimple
import edu.psu.sagnik.research.pdsimplify.path.model.PDLine
import org.json4s.JsonDSL._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

/**
* Created by schoudhury on 7/6/16.
*/
/**
 * Serializes the pages, text paragraphs and text lines of a simplified PDF document as
 * pretty-printed JSON.
 */
object CreateTextLinesJSON {

  /**
   * Rounds `d` to two decimal places using HALF_UP rounding.
   *
   * The value is widened to `Double` explicitly: `BigDecimal.apply(Float)` is deprecated since
   * Scala 2.11 and performed this same widening, so results are unchanged.
   */
  def precReduce(d: Float): Float =
    BigDecimal(d.toDouble).setScale(2, BigDecimal.RoundingMode.HALF_UP).toFloat

  /** Returns the rounded coordinates of `point` as `List(x, y)`. */
  def precReduce(point: Point2D.Float): List[Float] =
    List(point.x, point.y).map(coordinate => precReduce(coordinate))

  /** Returns the rounded fields of `r` as `List(xTopLeft, yTopLeft, widthRight, heightDown)`. */
  def precReduce(r: RectangleOTL): List[Float] =
    List(r.xTopLeft, r.yTopLeft, r.widthRight, r.heightDown).map(value => precReduce(value))

  /**
   * Optional styling attributes of a path.
   *
   * Not referenced by anything else in this object; kept because it is a public member.
   */
  case class PathStyle(
    fill: Option[String],
    fillRule: Option[String],
    fillOpacity: Option[String],
    stroke: Option[String],
    strokeWidth: Option[String],
    strokeLineCap: Option[String],
    strokeLineJoin: Option[String],
    strokeMiterLimit: Option[String],
    strokeDashArray: Option[String],
    strokeDashOffset: Option[String],
    strokeOpacity: Option[String]
  )

  /**
   * Renders `pDS` as a JSON string.
   *
   * The result has a single `pages` field. Each page carries `pageNumber`, `pageBB` and
   * `textparagraphs`; each paragraph carries `bb` and `textlines`; each text line carries `bb`
   * and `content`. All bounding boxes are rounded with `precReduce`.
   *
   * @param pDS the simplified document to serialize
   * @return the pretty-printed JSON text
   */
  def apply(pDS: PDDocumentSimple): String = {
    val jsonContent =
      "pages" ->
        pDS.pages.map { page =>
          ("pageNumber" -> page.pNum) ~
            ("pageBB" -> precReduce(page.bb)) ~
            ("textparagraphs" -> page.paragraphs.map { paragraph =>
              ("bb" -> precReduce(paragraph.bb)) ~
                ("textlines" -> paragraph.tLines.map { textLine =>
                  ("bb" -> precReduce(textLine.bb)) ~
                    ("content" -> textLine.content)
                })
            })
        }
    pretty(render(jsonContent))
  }

}
Binary file not shown.
Binary file added writers/src/test/resources/test1-page-5-chars.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file added writers/src/test/resources/test1-page-5-lines.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added writers/src/test/resources/test1-page-5-paths.pdf
Binary file not shown.
Binary file added writers/src/test/resources/test1-page-5-paths.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file added writers/src/test/resources/test1-page-5-words.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit b1e1e43

Please sign in to comment.