Skip to content

Commit

Permalink
code completely depends on rectangle AVRO schema
Browse files Browse the repository at this point in the history
  • Loading branch information
sagnik committed Aug 26, 2016
1 parent f0dab66 commit 57f27b9
Show file tree
Hide file tree
Showing 28 changed files with 103 additions and 4 deletions.
6 changes: 3 additions & 3 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// shared settings across root & all subprojects

version in ThisBuild := {
val major = 0
val major = 1
val minor = 0
val patch = 11
val patch = 0
s"$major.$minor.$patch"
} //page size now conforms to our rectangle structure.
}

scalaVersion in ThisBuild := "2.11.8"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package edu.psu.sagnik.research.pdwriters.impl

import edu.psu.sagnik.research.pdsimplify.model.PDDocumentSimple

/**
* Created by szr163 on 8/26/16.
*/
object CreateResultsBatch {

  // Batch driver: loads a PDF, runs the simplifier over it, outlines the
  // bounding boxes of every extracted element on each page, and writes the
  // result next to the input as "<name>-marked.pdf".
  // (Marking logic originally created by schoudhury on 6/27/16.)

  import java.awt.Color
  import java.io.File

  import org.apache.pdfbox.pdmodel.PDDocument
  import java.util.logging.{ Level, Logger }

  import edu.psu.sagnik.research.data.RectangleOTL
  import edu.psu.sagnik.research.pdsimplify.impl.ProcessDocument
  import edu.psu.sagnik.research.pdwriters.writers.image.CreateMarkedPNG
  import edu.psu.sagnik.research.pdwriters.writers.pdf.CreateMarkedPDF

  import scala.util.{ Failure, Success }

  /**
   * Outlines every extracted element of one page on the given document.
   *
   * Boxes are drawn in this order: characters (blue), words (green), text
   * lines (red), paragraphs (cyan), rasters (magenta) and the segments of
   * painted graphics paths (pink).
   *
   * @param document           PDF document to draw on
   * @param SimplifiedDocument simplified model of the same document
   * @param pageNum            zero-based index used for both `SimplifiedDocument.pages`
   *                           and `document.getPage`
   * @param pdLoc              location of the PDF; not used by this method, kept so
   *                           existing callers keep compiling
   * @return the document after all six overlays have been applied
   */
  def createMarkedResult(document: PDDocument, SimplifiedDocument: PDDocumentSimple, pageNum: Int, pdLoc: String): PDDocument = {
    // Look the page up once instead of re-indexing `pages` for every element kind.
    val simplePage = SimplifiedDocument.pages(pageNum)
    val paragraphs = simplePage.paragraphs

    // Each level is derived from the previous one rather than re-walking the
    // paragraphs from the top.
    val lines = paragraphs.flatMap(_.tLines)
    val words = lines.flatMap(_.tWords)
    val chars = words.flatMap(_.chars)

    // Only paths that are actually painted contribute segments.
    val segments = simplePage.gPaths
      .filter(_.doPaint)
      .flatMap(_.subPaths)
      .flatMap(_.segments)

    val overlays: List[(List[RectangleOTL], Color)] = List(
      (chars.map(_.bb), Color.BLUE),
      (words.map(_.bb), Color.GREEN),
      (lines.map(_.bb), Color.RED),
      (paragraphs.map(_.bb), Color.CYAN),
      (simplePage.rasters.map(_.bb), Color.MAGENTA),
      (segments.map(_.bb), Color.PINK)
    )

    overlays.foldLeft(document) {
      case (doc, (boxes, color)) => getExtractionResult(doc, pageNum, boxes, color)
    }
  }

  /**
   * Outlines the given bounding boxes on one page of the document.
   *
   * @param document PDF document to draw on
   * @param pageNum  zero-based page index passed to `document.getPage`
   * @param bbs      bounding boxes to outline
   * @param c        outline color
   * @return whatever `CreateMarkedPDF.rectMarkedContent` returns for this page
   */
  def getExtractionResult(document: PDDocument, pageNum: Int, bbs: List[RectangleOTL], c: Color): PDDocument =
    CreateMarkedPDF.rectMarkedContent(document, document.getPage(pageNum), bbs, c)

  // The level is set immediately below, so a plain `val` is enough here;
  // laziness bought nothing because the logger was forced on the next line.
  val logger: Logger = Logger.getLogger("pdwriters.writer.ShowResults")
  logger.setLevel(Level.ALL)

  /**
   * Loads the PDF at `pdLoc` and hands one page plus its bounding boxes to
   * `CreateMarkedPDF` to produce a separately marked file.
   *
   * @param pdLoc     path of the PDF to load
   * @param pageNum   zero-based page index
   * @param bbs       bounding boxes to mark
   * @param c         marking color
   * @param qualifier label for the element kind; its last character is dropped
   *                  in the log message
   */
  def printExtractionResult(pdLoc: String, pageNum: Int, bbs: List[RectangleOTL], c: Color, qualifier: String): Unit = {
    // NOTE(review): the document is not closed here; presumably CreateMarkedPDF.apply
    // closes it after saving - confirm before adding a close() call.
    val document = PDDocument.load(new File(pdLoc))
    val page = document.getPage(pageNum)
    CreateMarkedPDF(pdLoc, document, pageNum, page, bbs, c, qualifier)
    // dropRight is safe for an empty qualifier, where substring(0, -1) would throw.
    logger.fine(s"created ${qualifier.dropRight(1)} marked PDF")
  }

  /** Output path for the marked copy: the input path without a trailing ".pdf", plus "-marked.pdf". */
  private def markedOutputPath(pdLoc: String): String = {
    // Only strip the last four characters when they really are a PDF extension.
    val stem = if (pdLoc.toLowerCase.endsWith(".pdf")) pdLoc.dropRight(4) else pdLoc
    s"$stem-marked.pdf"
  }

  /**
   * Entry point. Marks every page of the PDF given as the first argument
   * (or of a built-in default path when no argument is supplied) and saves
   * the result next to the input.
   *
   * @param args optional: path of the PDF to process
   */
  def main(args: Array[String]): Unit = {
    val defaultLoc = "/home/szr163/Downloads/f706gsd1.pdf"
    //"/Users/schoudhury/hassan/C10-2042.pdf"
    // Previously `args.length > 1` was required before reading args(0), so a
    // single path argument was ignored in favour of the default.
    val pdLoc = args.headOption.getOrElse(defaultLoc)
    val outLoc = markedOutputPath(pdLoc)

    val document = PDDocument.load(new File(pdLoc))
    try {
      val simplifiedDocument = ProcessDocument(document)

      val marked = (0 until simplifiedDocument.pages.size).foldLeft(document) { (doc, pageNum) =>
        createMarkedResult(doc, simplifiedDocument, pageNum = pageNum, pdLoc = pdLoc)
      }

      marked.save(outLoc)
      println(s"[marked document written to]: $outLoc")
    } finally {
      // Release the PDF even when simplification, marking or saving fails.
      document.close()
    }
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ object CreateMarkedPDF {
private def drawRect(content: PDPageContentStream, color: Color, rect: RectangleOTL, page: PDPage, fill: Boolean) {
content.addRect(
rect.xTopLeft + page.getCropBox.getLowerLeftX,
page.getCropBox.getHeight - (rect.yTopLeft + page.getCropBox.getLowerLeftY) - rect.heightDown,
page.getCropBox.getHeight - rect.yTopLeft - rect.heightDown + page.getCropBox.getLowerLeftY,
rect.widthRight,
rect.heightDown
)
Expand All @@ -47,4 +47,13 @@ object CreateMarkedPDF {
document.close()
}

/**
 * Outlines every bounding box in `bbs` on `page`, appending to the page's
 * existing content, and returns the same document.
 *
 * All rectangles are written through a single appended content stream
 * instead of opening and closing one stream per rectangle, so a page with
 * many boxes no longer gains one content stream per box.
 *
 * @param document document that owns `page`
 * @param page     page to draw on
 * @param bbs      bounding boxes to outline; nothing is appended when empty
 * @param color    outline color
 * @return the `document` that was passed in
 */
def rectMarkedContent(document: PDDocument, page: PDPage, bbs: List[RectangleOTL], color: Color): PDDocument = {
  // Skip stream creation for an empty list so the page is left untouched,
  // as it was when a stream was opened per rectangle.
  if (bbs.nonEmpty) {
    val content = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, false)
    try {
      bbs.foreach(bb => drawRect(content, color, bb, page, fill = false))
    } finally {
      // Close the stream even if drawing one of the rectangles throws.
      content.close()
    }
  }
  document
}

}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added writers/src/test/resources/test1-marked.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-chars.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-chars.png
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-lines.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-lines.png
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-paths.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-paths.png
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-rasters.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-rasters.png
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-words.pdf
Binary file not shown.
Binary file removed writers/src/test/resources/test1-page-5-words.png
Binary file not shown.

0 comments on commit 57f27b9

Please sign in to comment.