From 93312f47fc88605b897413340f278066db678722 Mon Sep 17 00:00:00 2001 From: Jonathan Ingram Date: Tue, 31 Oct 2023 11:32:40 +1100 Subject: [PATCH] v2: initial commit (#153) - Some methods have been changed to return an error as their last argument - Log calls inside various functions have been removed - Use a v1 tag if you need the previous signature --- .github/workflows/docd.yml | 2 +- README.md | 14 +++++----- client/cmd/docconv-client/main.go | 2 +- doc.go | 18 ++++-------- docconv.go | 2 +- docd/convert.go | 4 +-- docd/main.go | 8 +++--- docx_test/docx_test.go | 2 +- go.mod | 2 +- html.go | 30 ++++++++++---------- html_appengine.go | 11 ++------ html_test/html_test.go | 2 +- iWork/TSPArchiveMessages.pb.go | 10 +++---- iWork/TSPDatabaseMessages.pb.go | 10 +++---- iWork/TSPMessages.pb.go | 9 +++--- iWork/pb-schema/TSPArchiveMessages.proto | 2 +- iWork/pb-schema/TSPDatabaseMessages.proto | 3 +- iWork/pb-schema/TSPMessages.proto | 3 +- image.go | 2 +- image_ocr.go | 2 +- image_ocr_test.go | 2 +- pages.go | 4 +-- pdf.go | 3 +- pdf_ocr.go | 34 ++++++++++------------- pdf_ocr_test.go | 7 +++-- pptx_test/pptx_test.go | 2 +- rtf_test/rtf_test.go | 2 +- 27 files changed, 88 insertions(+), 104 deletions(-) diff --git a/.github/workflows/docd.yml b/.github/workflows/docd.yml index 49ea553..ddf19ce 100644 --- a/.github/workflows/docd.yml +++ b/.github/workflows/docd.yml @@ -20,7 +20,7 @@ jobs: with: images: sajari/docd labels: | - org.opencontainers.image.description=A tool which exposes code.sajari.com/docconv as a service + org.opencontainers.image.description=A tool which exposes code.sajari.com/docconv/v2 as a service org.opencontainers.image.title=docd tags: | type=semver,pattern={{version}} diff --git a/README.md b/README.md index a758923..1aef2ff 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # docconv -[![Go reference](https://pkg.go.dev/badge/code.sajari.com/docconv.svg)](https://pkg.go.dev/code.sajari.com/docconv) +[![Go reference](https://pkg.go.dev/badge/code.sajari.com/docconv/v2.svg)](https://pkg.go.dev/code.sajari.com/docconv/v2) [![Build status](https://github.com/sajari/docconv/workflows/Go/badge.svg?branch=master)](https://github.com/sajari/docconv/actions) -[![Report card](https://goreportcard.com/badge/code.sajari.com/docconv)](https://goreportcard.com/report/code.sajari.com/docconv) -[![Sourcegraph](https://sourcegraph.com/github.com/sajari/docconv/-/badge.svg)](https://sourcegraph.com/github.com/sajari/docconv) +[![Report card](https://goreportcard.com/badge/code.sajari.com/docconv/v2)](https://goreportcard.com/report/code.sajari.com/docconv/v2) +[![Sourcegraph](https://sourcegraph.com/github.com/sajari/docconv/v2/-/badge.svg)](https://sourcegraph.com/github.com/sajari/docconv/v2) A Go wrapper library to convert PDF, DOC, DOCX, XML, HTML, RTF, ODT, Pages documents and images (see optional dependencies below) to plain text. @@ -14,7 +14,7 @@ If you haven't setup Go before, you first need to [install Go](https://golang.or To fetch and build the code: ```console -$ go install code.sajari.com/docconv/docd@latest +$ go install code.sajari.com/docconv/v2/docd@latest ``` See `go help install` for details on the installation location of the installed `docd` executable. Make sure that the full path to the executable is in your `PATH` environment variable. @@ -48,7 +48,7 @@ To add image support to the `docconv` library you first need to [install and bui Now you can add `-tags ocr` to any `go` command when building/fetching/testing `docconv` to include support for processing images: ```console -$ go get -tags ocr code.sajari.com/docconv/... +$ go get -tags ocr code.sajari.com/docconv/v2/... ``` This may complain on macOS, which you can fix by installing [tesseract](https://tesseract-ocr.github.io) via brew: @@ -119,7 +119,7 @@ package main import ( "fmt" - "code.sajari.com/docconv" + "code.sajari.com/docconv/v2" ) func main() { @@ -139,7 +139,7 @@ package main import ( "fmt" - "code.sajari.com/docconv/client" + "code.sajari.com/docconv/v2/client" ) func main() { diff --git a/client/cmd/docconv-client/main.go b/client/cmd/docconv-client/main.go index 38829d9..8572691 100644 --- a/client/cmd/docconv-client/main.go +++ b/client/cmd/docconv-client/main.go @@ -6,7 +6,7 @@ import ( "fmt" "os" - "code.sajari.com/docconv/client" + "code.sajari.com/docconv/v2/client" ) var ( diff --git a/doc.go b/doc.go index d9bce28..18f80d6 100644 --- a/doc.go +++ b/doc.go @@ -4,7 +4,6 @@ import ( "bytes" "fmt" "io" - "log" "os" "os/exec" "time" @@ -26,7 +25,7 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) { go func() { defer func() { if e := recover(); e != nil { - log.Printf("panic when reading doc format: %v", e) + // TODO: Propagate error. } }() @@ -34,7 +33,7 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) { doc, err := mscfb.New(f) if err != nil { - log.Printf("ConvertDoc: could not read doc: %v", err) + // TODO: Propagate error. mc <- meta return } @@ -42,8 +41,8 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) { props := msoleps.New() for entry, err := doc.Next(); err == nil; entry, err = doc.Next() { if msoleps.IsMSOLEPS(entry.Initial) { - if oerr := props.Reset(doc); oerr != nil { - log.Printf("ConvertDoc: could not reset props: %v", oerr) + if err := props.Reset(doc); err != nil { + // TODO: Propagate error. break } @@ -73,13 +72,10 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) { // Document body bc := make(chan string, 1) go func() { - // Save output to a file var buf bytes.Buffer outputFile, err := os.CreateTemp("/tmp", "sajari-convert-") if err != nil { - // TODO: Remove this. - log.Println("TempFile Out:", err) bc <- buf.String() return } @@ -87,14 +83,12 @@ func ConvertDoc(r io.Reader) (string, map[string]string, error) { err = exec.Command("wvText", f.Name(), outputFile.Name()).Run() if err != nil { - // TODO: Remove this. - log.Println("wvText:", err) + // TODO: Propagate error. } _, err = buf.ReadFrom(outputFile) if err != nil { - // TODO: Remove this. - log.Println("wvText:", err) + // TODO: Propagate error. } bc <- buf.String() diff --git a/docconv.go b/docconv.go index bd30dae..218e06a 100755 --- a/docconv.go +++ b/docconv.go @@ -1,4 +1,4 @@ -package docconv // import "code.sajari.com/docconv" +package docconv // import "code.sajari.com/docconv/v2" import ( "encoding/json" diff --git a/docd/convert.go b/docd/convert.go index c01a42f..25b4825 100644 --- a/docd/convert.go +++ b/docd/convert.go @@ -12,8 +12,8 @@ import ( "cloud.google.com/go/errorreporting" - "code.sajari.com/docconv" - "code.sajari.com/docconv/docd/internal" + "code.sajari.com/docconv/v2" + "code.sajari.com/docconv/v2/docd/internal" ) type convertServer struct { diff --git a/docd/main.go b/docd/main.go index 48c9328..6fce2ab 100644 --- a/docd/main.go +++ b/docd/main.go @@ -14,10 +14,10 @@ import ( "github.com/gorilla/mux" - "code.sajari.com/docconv" - "code.sajari.com/docconv/docd/internal" - "code.sajari.com/docconv/docd/internal/cloudtrace" - "code.sajari.com/docconv/docd/internal/debug" + "code.sajari.com/docconv/v2" + "code.sajari.com/docconv/v2/docd/internal" + "code.sajari.com/docconv/v2/docd/internal/cloudtrace" + "code.sajari.com/docconv/v2/docd/internal/debug" ) var ( diff --git a/docx_test/docx_test.go b/docx_test/docx_test.go index f96a4f6..c95a7a7 100644 --- a/docx_test/docx_test.go +++ b/docx_test/docx_test.go @@ -5,7 +5,7 @@ import ( "strings" "testing" - "code.sajari.com/docconv" + "code.sajari.com/docconv/v2" ) func TestConvertDocx(t *testing.T) { diff --git a/go.mod b/go.mod index ad1179e..41e7654 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module code.sajari.com/docconv +module code.sajari.com/docconv/v2 go 1.21 diff --git a/html.go b/html.go index df4f26f..8fc4652 100644 --- a/html.go +++ b/html.go @@ -1,11 +1,10 @@ -// +build !appengine +//go:build !appengine package docconv import ( "bytes" "io" - "log" "strings" "golang.org/x/net/html" @@ -25,18 +24,23 @@ func ConvertHTML(r io.Reader, readability bool) (string, map[string]string, erro cleanXML, err := Tidy(buf, false) if err != nil { - log.Println("Tidy:", err) // Tidy failed, so we now manually tokenize instead clean := cleanHTML(buf, true) cleanXML = []byte(clean) - // TODO: remove this log - log.Println("Cleaned HTML using Golang tokenizer") } if readability { - cleanXML = HTMLReadability(bytes.NewReader(cleanXML)) + var err error + cleanXML, err = HTMLReadability(bytes.NewReader(cleanXML)) + if err != nil { + return "", nil, err + } + } + text, err := HTMLToText(bytes.NewReader(cleanXML)) + if err != nil { + return "", nil, err } - return HTMLToText(bytes.NewReader(cleanXML)), meta, nil + return text, meta, nil } var acceptedHTMLTags = [...]string{ @@ -127,7 +131,7 @@ type HTMLReadabilityOptions struct { var HTMLReadabilityOptionsValues HTMLReadabilityOptions // HTMLReadability extracts the readable text in an HTML document -func HTMLReadability(r io.Reader) []byte { +func HTMLReadability(r io.Reader) ([]byte, error) { jr := justext.NewReader(r) // TODO: Improve this! @@ -141,8 +145,7 @@ func HTMLReadability(r io.Reader) []byte { paragraphSet, err := jr.ReadAll() if err != nil { - log.Println("Justext:", err) - return nil + return nil, err } useClasses := strings.SplitN(HTMLReadabilityOptionsValues.ReadabilityUseClasses, ",", 10) @@ -156,13 +159,12 @@ func HTMLReadability(r io.Reader) []byte { } } - return []byte(output) + return []byte(output), nil } // HTMLToText converts HTML to plain text. -func HTMLToText(input io.Reader) string { - text, _ := XMLToText(input, []string{"br", "p", "h1", "h2", "h3", "h4"}, []string{}, false) - return text +func HTMLToText(input io.Reader) (string, error) { + return XMLToText(input, []string{"br", "p", "h1", "h2", "h3", "h4"}, []string{}, false) } var readabilityStopList = map[string]bool{"and": true, "the": true, "a": true, "about": true, "above": true, "across": true, "after": true, "afterwards": true, "again": true, "against": true, "all": true, "almost": true, "alone": true, diff --git a/html_appengine.go b/html_appengine.go index 1f2337d..47763a1 100644 --- a/html_appengine.go +++ b/html_appengine.go @@ -1,18 +1,11 @@ //go:build appengine -// +build appengine package docconv import ( "io" - "log" ) -func HTMLReadability(r io.Reader) []byte { - b, err := io.ReadAll(r) - if err != nil { - log.Printf("HTMLReadability: %v", err) - return nil - } - return b +func HTMLReadability(r io.Reader) ([]byte, error) { + return io.ReadAll(r) } diff --git a/html_test/html_test.go b/html_test/html_test.go index f381ce7..8b9801f 100644 --- a/html_test/html_test.go +++ b/html_test/html_test.go @@ -8,7 +8,7 @@ import ( "github.com/google/go-cmp/cmp" - "code.sajari.com/docconv" + "code.sajari.com/docconv/v2" ) func TestConvertHTML_readabilityUseClasses(t *testing.T) { diff --git a/iWork/TSPArchiveMessages.pb.go b/iWork/TSPArchiveMessages.pb.go index 8150889..550d265 100644 --- a/iWork/TSPArchiveMessages.pb.go +++ b/iWork/TSPArchiveMessages.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.28.0 -// protoc v3.19.4 +// protoc-gen-go v1.27.1 +// protoc v4.24.4 // source: TSPArchiveMessages.proto package tsp @@ -1122,9 +1122,9 @@ var file_TSPArchiveMessages_proto_rawDesc = []byte{ 0x74, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x55, 0x75, 0x69, 0x64, 0x12, 0x30, 0x0a, 0x09, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x18, 0x03, 0x20, 0x02, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x54, 0x53, 0x50, 0x2e, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x49, - 0x6e, 0x66, 0x6f, 0x52, 0x09, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x42, 0x1d, - 0x5a, 0x1b, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, 0x6f, - 0x6d, 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x74, 0x73, 0x70, + 0x6e, 0x66, 0x6f, 0x52, 0x09, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x42, 0x20, + 0x5a, 0x1e, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, 0x6f, + 0x6d, 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x76, 0x32, 0x2f, 0x74, 0x73, 0x70, } var ( diff --git a/iWork/TSPDatabaseMessages.pb.go b/iWork/TSPDatabaseMessages.pb.go index 5dd9136..69ae471 100644 --- a/iWork/TSPDatabaseMessages.pb.go +++ b/iWork/TSPDatabaseMessages.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.28.0 -// protoc v3.19.4 +// protoc-gen-go v1.27.1 +// protoc v4.24.4 // source: TSPDatabaseMessages.proto package tsp @@ -307,9 +307,9 @@ var file_TSPDatabaseMessages_proto_rawDesc = []byte{ 0x6d, 0x61, 0x67, 0x65, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 0x2d, 0x0a, 0x09, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, 0x0a, 0x07, 0x75, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x00, 0x12, 0x0a, 0x0a, 0x06, 0x62, 0x69, 0x74, 0x6d, - 0x61, 0x70, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x70, 0x64, 0x66, 0x10, 0x02, 0x42, 0x1d, 0x5a, - 0x1b, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, 0x6f, 0x6d, - 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x74, 0x73, 0x70, + 0x61, 0x70, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x70, 0x64, 0x66, 0x10, 0x02, 0x42, 0x20, 0x5a, + 0x1e, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, 0x6f, 0x6d, + 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x76, 0x32, 0x2f, 0x74, 0x73, 0x70, } var ( diff --git a/iWork/TSPMessages.pb.go b/iWork/TSPMessages.pb.go index f32039a..2dea10e 100644 --- a/iWork/TSPMessages.pb.go +++ b/iWork/TSPMessages.pb.go @@ -1,7 +1,7 @@ // Code generated by protoc-gen-go. DO NOT EDIT. // versions: -// protoc-gen-go v1.28.0 -// protoc v3.19.4 +// protoc-gen-go v1.27.1 +// protoc v4.24.4 // source: TSPMessages.proto package tsp @@ -1116,8 +1116,9 @@ var file_TSPMessages_proto_rawDesc = []byte{ 0x69, 0x66, 0x69, 0x65, 0x72, 0x12, 0x28, 0x0a, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x54, 0x53, 0x50, 0x2e, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x52, 0x07, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x42, - 0x1d, 0x5a, 0x1b, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, - 0x6f, 0x6d, 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x74, 0x73, 0x70, + 0x20, 0x5a, 0x1e, 0x63, 0x6f, 0x64, 0x65, 0x2e, 0x73, 0x61, 0x6a, 0x61, 0x72, 0x69, 0x2e, 0x63, + 0x6f, 0x6d, 0x2f, 0x64, 0x6f, 0x63, 0x63, 0x6f, 0x6e, 0x76, 0x2f, 0x76, 0x32, 0x2f, 0x74, 0x73, + 0x70, } var ( diff --git a/iWork/pb-schema/TSPArchiveMessages.proto b/iWork/pb-schema/TSPArchiveMessages.proto index bcb2bd0..fe918a5 100755 --- a/iWork/pb-schema/TSPArchiveMessages.proto +++ b/iWork/pb-schema/TSPArchiveMessages.proto @@ -1,6 +1,6 @@ syntax = "proto2"; -option go_package = "code.sajari.com/docconv/tsp"; +option go_package = "code.sajari.com/docconv/v2/tsp"; package TSP; diff --git a/iWork/pb-schema/TSPDatabaseMessages.proto b/iWork/pb-schema/TSPDatabaseMessages.proto index aff0b9f..9bc3be8 100755 --- a/iWork/pb-schema/TSPDatabaseMessages.proto +++ b/iWork/pb-schema/TSPDatabaseMessages.proto @@ -1,6 +1,6 @@ syntax = "proto2"; -option go_package = "code.sajari.com/docconv/tsp"; +option go_package = "code.sajari.com/docconv/v2/tsp"; import "TSPMessages.proto"; package TSP; @@ -27,4 +27,3 @@ message DatabaseImageDataArchive { required .TSP.DatabaseDataArchive super = 1; required .TSP.DatabaseImageDataArchive.ImageType type = 2; } - diff --git a/iWork/pb-schema/TSPMessages.proto b/iWork/pb-schema/TSPMessages.proto index 4e20880..0fb881c 100755 --- a/iWork/pb-schema/TSPMessages.proto +++ b/iWork/pb-schema/TSPMessages.proto @@ -1,6 +1,6 @@ syntax = "proto2"; -option go_package = "code.sajari.com/docconv/tsp"; +option go_package = "code.sajari.com/docconv/v2/tsp"; package TSP; @@ -94,4 +94,3 @@ message ObjectContainer { optional uint32 identifier = 1; repeated .TSP.Reference objects = 2; } - diff --git a/image.go b/image.go index 0b2c6cf..e8c60b5 100644 --- a/image.go +++ b/image.go @@ -1,4 +1,4 @@ -// +build !ocr +//go:build !ocr package docconv diff --git a/image_ocr.go b/image_ocr.go index 6447309..9c9fb29 100644 --- a/image_ocr.go +++ b/image_ocr.go @@ -1,4 +1,4 @@ -// +build ocr +//go:build ocr package docconv diff --git a/image_ocr_test.go b/image_ocr_test.go index 46db151..c48c9f1 100644 --- a/image_ocr_test.go +++ b/image_ocr_test.go @@ -1,4 +1,4 @@ -// +build ocr +//go:build ocr package docconv diff --git a/pages.go b/pages.go index d5cfbd8..0085669 100644 --- a/pages.go +++ b/pages.go @@ -11,8 +11,8 @@ import ( "google.golang.org/protobuf/proto" - TSP "code.sajari.com/docconv/iWork" - "code.sajari.com/docconv/snappy" + TSP "code.sajari.com/docconv/v2/iWork" + "code.sajari.com/docconv/v2/snappy" ) // ConvertPages converts a Pages file to text. diff --git a/pdf.go b/pdf.go index e5920d2..6a6a118 100644 --- a/pdf.go +++ b/pdf.go @@ -1,4 +1,4 @@ -// +build !ocr +//go:build !ocr package docconv @@ -8,7 +8,6 @@ import ( ) func ConvertPDF(r io.Reader) (string, map[string]string, error) { - f, err := NewLocalFile(r) if err != nil { return "", nil, fmt.Errorf("error creating local file: %v", err) diff --git a/pdf_ocr.go b/pdf_ocr.go index 6fcd5ea..dc96515 100644 --- a/pdf_ocr.go +++ b/pdf_ocr.go @@ -1,12 +1,10 @@ //go:build ocr -// +build ocr package docconv import ( "fmt" "io" - "log" "os" "os/exec" "path/filepath" @@ -28,13 +26,6 @@ func compareExt(ext string, exts []string) bool { return false } -func cleanupTemp(tmpDir string) { - err := os.RemoveAll(tmpDir) - if err != nil { - log.Println(err) - } -} - func ConvertPDFImages(path string) (BodyResult, error) { bodyResult := BodyResult{} @@ -45,7 +36,9 @@ func ConvertPDFImages(path string) (BodyResult, error) { } tmpDir := fmt.Sprintf("%s/", tmp) - defer cleanupTemp(tmpDir) + defer func() { + _ = os.RemoveAll(tmpDir) // ignore error + }() _, err = exec.Command("pdfimages", "-j", path, tmpDir).Output() if err != nil { @@ -110,18 +103,17 @@ func ConvertPDFImages(path string) (BodyResult, error) { } // PdfHasImage verify if `path` (PDF) has images -func PDFHasImage(path string) bool { +func PDFHasImage(path string) (bool, error) { cmd := "pdffonts -l 5 %s | tail -n +3 | cut -d' ' -f1 | sort | uniq" out, err := exec.Command("bash", "-c", fmt.Sprintf(cmd, shellEscape(path))).CombinedOutput() if err != nil { - log.Println(err) - return false + return false, err } if string(out) == "" { - return true + return true, nil } - return false + return false, nil } func ConvertPDF(r io.Reader) (string, map[string]string, error) { @@ -142,18 +134,20 @@ func ConvertPDF(r io.Reader) (string, map[string]string, error) { return "", nil, metaResult.err } - if !PDFHasImage(f.Name()) { + hasImage, err := PDFHasImage(f.Name()) + if err != nil { + return "", nil, fmt.Errorf("could not check if PDF has image: %w", err) + } + if !hasImage { return bodyResult.body, metaResult.meta, nil } imageConvertResult, imageConvertErr := ConvertPDFImages(f.Name()) if imageConvertErr != nil { - log.Println(imageConvertErr) - return bodyResult.body, metaResult.meta, nil + return bodyResult.body, metaResult.meta, nil // ignore error, return what we have } if imageConvertResult.err != nil { - log.Println(imageConvertResult.err) - return bodyResult.body, metaResult.meta, nil + return bodyResult.body, metaResult.meta, nil // ignore error, return what we have } fullBody := strings.Join([]string{bodyResult.body, imageConvertResult.body}, " ") diff --git a/pdf_ocr_test.go b/pdf_ocr_test.go index 73a0af5..f0d3bff 100644 --- a/pdf_ocr_test.go +++ b/pdf_ocr_test.go @@ -1,5 +1,4 @@ //go:build ocr -// +build ocr package docconv @@ -12,7 +11,11 @@ func TestPDFHasImage_CannotExecuteCode(t *testing.T) { // Try to inject code by passing a bad file path. // If the code was successful it will create a file called foo in the working directory badFilePath := "$(id >> foo).pdf" - if got, want := PDFHasImage(badFilePath), false; got != want { + got, err := PDFHasImage(badFilePath) + if err != nil { + t.Fatal(err) + } + if want := false; got != want { t.Errorf("got %v, want %v", got, want) } diff --git a/pptx_test/pptx_test.go b/pptx_test/pptx_test.go index 939f244..39eea25 100644 --- a/pptx_test/pptx_test.go +++ b/pptx_test/pptx_test.go @@ -5,7 +5,7 @@ import ( "strings" "testing" - "code.sajari.com/docconv" + "code.sajari.com/docconv/v2" ) func TestConvertPptx(t *testing.T) { diff --git a/rtf_test/rtf_test.go b/rtf_test/rtf_test.go index 7985f56..af14a81 100644 --- a/rtf_test/rtf_test.go +++ b/rtf_test/rtf_test.go @@ -6,7 +6,7 @@ import ( "strings" "testing" - "code.sajari.com/docconv" + "code.sajari.com/docconv/v2" ) func TestConvertRTF(t *testing.T) {