Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add char/page limit #75

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ go:
- "1.14"
- tip
go_import_path: code.sajari.com/docconv
before_install:
- sudo apt-get -y install poppler-utils
notifications:
email:
- infra@sajari.com
33 changes: 33 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package docconv

// Config holds the package-wide settings for docconv conversions.
// A value is installed with SetConfig; at present it only carries the
// length limits grouped under Limitation.
type Config struct {
// Limitation bounds how much of a document is converted.
Limitation LenthLimitation
}

// LenthLimitation groups the optional limits on how much of a document is
// converted. Each limit is only applied when its value is greater than zero.
// NOTE(review): the name looks like a misspelling of "LengthLimitation"; it is
// exported, so renaming it would break existing callers.
type LenthLimitation struct {
// XMLMaxWord is the limit applied while parsing XML; this affects
// Office 2007 (zip-based) documents.
// NOTE(review): despite the name, the check in XMLToText compares the limit
// against len(result), which appears to be the length of the accumulated
// text rather than a word count — confirm the intended unit.
XMLMaxWord int
// PdfFirstPage is the first page to convert for PDF input; ConvertPDFText
// passes it to pdftotext as -f.
PdfFirstPage int
// PdfLastPage is the last page to convert for PDF input; ConvertPDFText
// passes it to pdftotext as -l.
PdfLastPage int
}

// config is the package-level configuration consulted by the converters.
// It starts out as the zero Config and is replaced wholesale by SetConfig.
var config Config

// SetConfig replaces the package-wide docconv configuration with c.
// The whole Config value is overwritten, so any limit left unset in c goes
// back to its zero value. The update is a plain assignment to a package-level
// variable; this function performs no synchronization.
func SetConfig(c Config) {
config = c
}

// checkXMLMaxWord reports whether an XML length limit is configured, that is,
// whether config.Limitation.XMLMaxWord is strictly positive.
func checkXMLMaxWord() bool {
	limit := config.Limitation.XMLMaxWord
	return limit > 0
}

// xmlMaxWordExceed reports whether length is strictly greater than the
// configured config.Limitation.XMLMaxWord; a length equal to the limit does
// not count as exceeded. It does not check that a limit is set, so the caller
// in XMLToText guards it with checkXMLMaxWord.
func xmlMaxWordExceed(length int) bool {
	limit := config.Limitation.XMLMaxWord
	return limit < length
}
75 changes: 75 additions & 0 deletions docconv_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package docconv

import (
"os"
"strings"
"testing"
)
Expand All @@ -18,3 +19,77 @@ func TestConvertTrimsSpace(t *testing.T) {
t.Errorf("body = %v, want %v", resp.Body, want)
}
}

func TestXMLMaxWord(t *testing.T) {
t.Run("max word not set", func(t *testing.T) {
checkMaxWord := checkXMLMaxWord()
if checkMaxWord != false {
t.Fatalf("got %v, want false", checkMaxWord)
}
})
t.Run("test checkMaxWord", func(t *testing.T) {
SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 10}})
checkMaxWord := checkXMLMaxWord()
if checkMaxWord != true {
t.Fatalf("got %v, want true", checkMaxWord)
}
})
t.Run("test xmlMaxWordExceed", func(t *testing.T) {
SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 10}})
exceed := xmlMaxWordExceed(10)
if exceed != false {
t.Fatalf("got %v, want false", exceed)
}
exceed = xmlMaxWordExceed(11)
if exceed != true {
t.Fatalf("got %v, want true", exceed)
}
})
t.Run("test parse pptx with maxword", func(t *testing.T) {
SetConfig(Config{Limitation: LenthLimitation{XMLMaxWord: 2}})
f, err := os.Open("./docx_test/testdata/sample_3.docx")
if err != nil {
t.Fatalf("got error = %v, want nil", err)
}

resp, _, err := ConvertDocx(f)
if err != nil {
t.Fatalf("got error = %v, want nil", err)
}
if want := "Content from docx file"; !strings.Contains(resp, want) {
t.Errorf("expected %v to contains %v", resp, want)
}
if want := "second"; strings.Contains(resp, want) {
t.Errorf("expected %v to not contains %v", resp, want)
}
})

}

// TestPDFPageLimit converts pdf_test/testdata/pdf.pdf with PdfFirstPage set to
// 2 and PdfLastPage set to 3, then checks that the extracted text contains "2"
// and "3" but none of "1", "4" or "5".
// NOTE(review): the assertions presumably rely on each page of the fixture
// containing only its own page number — confirm against the PDF.
// NOTE(review): the configuration installed here is never reset and f is never
// closed in this function, so the page range stays in package-level state for
// any test that runs afterwards.
func TestPDFPageLimit(t *testing.T) {
// Restrict conversion to pages 2 through 3.
SetConfig(Config{Limitation: LenthLimitation{PdfFirstPage: 2, PdfLastPage: 3}})
f, err := os.Open("./pdf_test/testdata/pdf.pdf")
if err != nil {
t.Fatalf("got error = %v, want nil", err)
}

resp, _, err := ConvertPDF(f)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xiaoxin01 I think the best approach for supporting this kind configuration is to introduce new Converters. Something like:

package docconv

type PDFConverter struct {
  pageRange []int
  maxWords int
}

func NewPDFConverter(pageRange []int, maxWords int) *PDFConverter {
  return &PDFConverter{
    pageRange: pageRange,
    maxWords: maxWords,
  }
}

func (c *PDFConverter) Convert(r io.Reader) (string, map[string]string, error) {
  // ...
}

We want to avoid anymore global package state so I think a solution like this is probably going to be better. What do you think?

if err != nil {
t.Fatalf("got error = %v, want nil", err)
}
// Text from the pages inside the configured range must be present.
if want := "2"; !strings.Contains(resp, want) {
t.Errorf("expected %v to contains %v", resp, want)
}
if want := "3"; !strings.Contains(resp, want) {
t.Errorf("expected %v to contains %v", resp, want)
}
// Text from the pages outside the configured range must be absent.
if want := "1"; strings.Contains(resp, want) {
t.Errorf("expected %v to not contains %v", resp, want)
}
if want := "4"; strings.Contains(resp, want) {
t.Errorf("expected %v to not contains %v", resp, want)
}
if want := "5"; strings.Contains(resp, want) {
t.Errorf("expected %v to not contains %v", resp, want)
}
}
Binary file added pdf_test/testdata/pdf.pdf
Binary file not shown.
14 changes: 13 additions & 1 deletion pdf_text.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package docconv
import (
"fmt"
"os/exec"
"strconv"
"strings"
"time"
)
Expand Down Expand Up @@ -54,9 +55,20 @@ func ConvertPDFText(path string) (BodyResult, MetaResult, error) {
mr <- metaResult
}()

parameters := []string{
"-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix",
}
if config.Limitation.PdfFirstPage > 0 {
parameters = append(parameters, "-f", strconv.Itoa(config.Limitation.PdfFirstPage))
}
if config.Limitation.PdfLastPage > 0 {
parameters = append(parameters, "-l", strconv.Itoa(config.Limitation.PdfLastPage))
}
parameters = append(parameters, path, "-")

br := make(chan BodyResult, 1)
go func() {
body, err := exec.Command("pdftotext", "-q", "-nopgbrk", "-enc", "UTF-8", "-eol", "unix", path, "-").Output()
body, err := exec.Command("pdftotext", parameters...).Output()
if err != nil {
bodyResult.err = err
}
Expand Down
4 changes: 4 additions & 0 deletions xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ func XMLToText(r io.Reader, breaks []string, skip []string, strict bool) (string
}
}
}
// check max word limit for insufficient memory
if checkXMLMaxWord() && xmlMaxWordExceed(len(result)) {
break
}
}
return result, nil
}
Expand Down