Permalink
Browse files

Merge pull request #4 from masom/master

Updated for ruby 1.9.x
  • Loading branch information...
2 parents 5565bc9 + 2482b21 commit c8658132f00f8562017b8c7c8312f17045e94e63 @scottdavis committed Jan 4, 2012
View
@@ -1,9 +1,9 @@
require 'rake'
require 'rake/testtask'
-require 'rake/rdoctask'
+require 'rdoc/task'
# Defines a Rake test task: puts the "tesseract" and "test" directories on the
# load path and runs every file matching test/*_test.rb in verbose mode.
Rake::TestTask.new do |t|
t.libs << "tesseract"
t.libs << "test"
t.test_files = FileList['test/*_test.rb']
t.verbose = true
-end
+end
View
@@ -17,6 +17,5 @@ This is a library for using the tesseract OCR in ruby applications
Config options are also supported
- tess = Tesseract::Process.new("photo.jpg", {:lang => 'some language', :chop_enable => 0})
+ tess = Tesseract::Process.new("photo.jpg", {:lang => :fra, :tesseract_options => {:chop_enable => 0}})
tess.to_s
-
View
@@ -1,10 +1,11 @@
-require 'tesseract/dependency_checker'
-require 'tesseract/file_handler'
-require 'tesseract/process'
+path = File.join(File.dirname(__FILE__), 'tesseract')
+['dependency_checker', 'file_handler', 'process'].each do |f|
+ require File.expand_path(File.join(path, f))
+end
require 'pathname'
require 'digest/md5'
-
+require 'shellwords'
# Namespace for the gem; the classes themselves are loaded by the requires above.
module Tesseract
-
-end
+
+end
@@ -1,23 +1,23 @@
module Tesseract
- class DependencyChecker
+ class DependencyChecker
# Putting these here so it's easier to test
IMAGE_MAGICK_ERROR = "ImageMagick \"convert\" command not found! Make sure ImageMagick is installed and in the system path"
TESSERACT_ERROR = "\"tesseract\" command not found! Make sure tesseract is installed and in the system path"
OS_ERROR = "Only Unix Based enviroments are supported Mac, Linux, etc."
-
+
# Runs every dependency check in order: operating system, tesseract, ImageMagick.
# Each check raises when it fails, so reaching the end means all of them passed.
# Returns true.
def self.check!
  [:check_os!, :check_for_tesseract!, :check_for_imagemagick!].each do |probe|
    send(probe)
  end
  true
end
-
+
private
# Runs cmd in a subshell and returns whatever it printed on stdout.
# Kept as a separate method so tests can mock it easily.
def self.run_cmd(cmd)
  %x(#{cmd})
end
-
+
def self.check_os!
case ::RUBY_PLATFORM
when /darwin/
@@ -27,14 +27,14 @@ def self.check_os!
end
raise Exception, OS_ERROR
end
-
+
# Looks up ImageMagick's `convert` binary with `which`.
# Raises Exception with IMAGE_MAGICK_ERROR when `which convert` prints nothing.
def self.check_for_imagemagick!
  convert_path = run_cmd('which convert')
  raise Exception, IMAGE_MAGICK_ERROR if convert_path.empty?
end
-
+
# Looks up the `tesseract` binary with `which`.
# Raises Exception with TESSERACT_ERROR when `which tesseract` prints nothing.
def self.check_for_tesseract!
  tesseract_path = run_cmd('which tesseract')
  raise Exception, TESSERACT_ERROR if tesseract_path.empty?
end
-
+
end
-end
+end
@@ -3,18 +3,15 @@
module Tesseract
# Keeps track of the temporary files created while processing an image so
# they can all be removed in one call.
class FileHandler
  # Paths handed out by create_temp_file that have not been cleaned up yet.
  @tempfiles = []

  # Builds a path for filename inside the system temp directory and registers
  # it for later removal. The file itself is not created here.
  #
  # Returns the Pathname.
  def self.create_temp_file(filename)
    file = Pathname.new(Dir.tmpdir).join(filename)
    @tempfiles << file
    file
  end

  # Deletes every registered temp file that exists on disk, then forgets the
  # registered paths so later calls do not re-check files that are already gone.
  def self.cleanup!
    @tempfiles.each do |file|
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      File.unlink(file.to_s) if File.exist?(file.to_s)
    end
    @tempfiles.clear
  end
end
-end
+end
View
@@ -1,51 +1,123 @@
+require 'shellwords'
module Tesseract
class Process
+
attr_reader :image
- attr_accessor :lang
+
CONVERT_COMMAND = 'convert'
TESSERACT_COMMAND = 'tesseract'
-
+ # Initialize a Tesseract translation process.
+ # image_name is the file to translate (stored as a Pathname).
+ # options may contain any of the following keys:
+ # * tesseract_options Hash of options for tesseract. Defaults to {}
+ # * convert_options Hash with :input and :output Arrays of options for convert
+ # * lang Image input language (eng, fra, etc.). Defaults to :eng
+ # * convert_command Convert binary name/path
+ # * tesseract_command Tesseract binary name/path
+ # * check_deps Boolean value. If true, verifies dependencies. Defaults to false
def initialize(image_name, options = {})
- DependencyChecker.check!
+ defaults = {
+ :tesseract_options => {},
+ :convert_options => {:input => [], :output => []},
+ :lang => :eng,
+ :convert_command => CONVERT_COMMAND,
+ :tesseract_command => TESSERACT_COMMAND,
+ :check_deps => false
+ }
+ @out = nil
@image = Pathname.new(image_name)
# Digest of the image path and the current time; used to name this run's temp files.
@hash = Digest::MD5.hexdigest("#{@image}-#{Time.now}")
- @lang = options[:lang].nil? ? 'eng' : options.delete(:lang)
- @options = options
+
+ merge_options! defaults, options
+ DependencyChecker.check! if @options[:check_deps]
+ end
+
# Combines the caller's options with the defaults and stores the result in
# @options.
#
# defaults - Hash of default settings
# options  - Hash of caller overrides; it is read but never modified, so the
#            same hash can be reused for another Process
#
# Top-level keys in options replace the defaults. The nested hashes get
# special treatment:
# * :tesseract_options is merged key by key over the default hash
# * :convert_options keeps only the keys present in the defaults and appends
#   the caller's arguments after the default ones. Order and repeated values
#   are preserved, so ["-density", "200", "-resize", "200"] stays intact.
#
# Returns the merged Hash.
def merge_options!(defaults, options)
  merged = defaults.merge(options)

  if options.has_key?(:tesseract_options)
    merged[:tesseract_options] = defaults[:tesseract_options].merge(options[:tesseract_options])
  end

  if options.has_key?(:convert_options)
    given = options[:convert_options]
    merged[:convert_options] = {}
    defaults[:convert_options].each do |key, base|
      # Plain concatenation: a set union would drop repeated arguments.
      merged[:convert_options][key] = given.has_key?(key) ? base + given[key] : base
    end
  end

  @options = merged
end
+
# Sets the input language handed to tesseract through -l (e.g. :eng, :fra).
def lang=(lang)
  @options[:lang] = lang
end

# Returns the input language currently configured for this process.
def lang
  @options[:lang]
end
-
# Returns the recognised text. process! runs on the first call only; its
# result is cached in @out and reused afterwards.
def to_s
  @out = process! unless @out
  @out
end
-
+
# Processes the image into text: converts it to TIFF, runs tesseract on the
# result, and strips a leading "/" from the start of every output line.
#
# The conversion happens inside the begin/ensure block so the registered temp
# files are removed even when convert or tesseract raises. Errors propagate
# to the caller unchanged.
#
# Returns the recognised text as a String.
def process!
  begin
    text = tesseract_translation(to_tiff)
  ensure
    FileHandler.cleanup!
  end
  text.gsub(/^\//, '')
end
-
+
# Generates the shell command line that converts the source image.
#
# temp_file - destination path for the converted image
#
# The pieces are assembled in this order: convert binary, :input options,
# source image, :output options, destination. Both file paths are
# shell-escaped so paths with spaces or other special characters survive the
# shell; the user-supplied options are passed through as given.
#
# Returns the command as a single String.
def generate_convert_command(temp_file)
  convert_options = @options[:convert_options]

  parts = [@options[:convert_command]]
  parts.concat(convert_options[:input])
  parts << Shellwords.shellescape(@image.to_s)
  parts.concat(convert_options[:output])
  parts << Shellwords.shellescape(temp_file.to_s)
  parts.join(" ")
end
+
+ # Converts the source image to a tiff file in the system temp directory.
# Returns the path of the temp file registered with FileHandler.
# Raises RuntimeError only when system returns nil; a false result from
# system is not treated as an error here.
def to_tiff
temp_file = FileHandler.create_temp_file("#{@hash}.tif")
- system [CONVERT_COMMAND, image, temp_file].join(" ")
+ executed = system generate_convert_command(temp_file)
+ raise RuntimeError, "`#{@options[:convert_command]}` could not be executed." if executed.nil?
temp_file
end
-
+
# Translates a tiff file into text by running tesseract on it.
#
# image_file - path of the image to read
#
# tesseract writes its result to "<output base>.txt"; that file is read,
# deleted, and its content returned. Output is silenced with the POSIX form
# "> /dev/null 2>&1": the bash-only "&>" is parsed by plain sh/dash as
# "run in background", which lets the read happen before the file exists.
# File paths are shell-escaped.
#
# Returns the recognised text as a String.
# Raises RuntimeError when the command cannot be run or exits with a failure
# status.
def tesseract_translation(image_file)
  temp_text_file = FileHandler.create_temp_file(@hash.to_s)
  config_file = write_configs.to_s
  txt_file = "#{temp_text_file}.txt"

  command = [
    @options[:tesseract_command],
    Shellwords.shellescape(image_file.to_s),
    Shellwords.shellescape(temp_text_file.to_s),
    "-l #{@options[:lang]}",
    # write_configs returns '' when there is nothing to configure.
    config_file.empty? ? nil : Shellwords.shellescape(config_file),
    "> /dev/null 2>&1"
  ].compact.join(' ')

  executed = system(command)
  raise RuntimeError, "`#{@options[:tesseract_command]}` could not be executed." unless executed

  out = File.read(txt_file)
  File.unlink txt_file
  out
end
-
+ # Writes Tesseract configuration for the current source file.
# Returns '' when no tesseract options are set; otherwise returns the path of
# a temp file holding one "key value" line per option.
def write_configs
- return '' if @options.empty?
+ return '' if @options[:tesseract_options].empty?
path = FileHandler.create_temp_file("#{@hash}.config")
File.open(path, "w+") do |f|
- @options.each { |k,v| f << "#{k} #{v}\n" }
+ @options[:tesseract_options].each { |k,v| f << "#{k} #{v}\n" }
end
path
end
-
end
-
-end
+end
View
@@ -1,3 +1,3 @@
# Version of the gem; the gemspec reads it as Tesseract::VERSION.
module Tesseract
- VERSION = '0.1.0'
-end
+ VERSION = '0.1.1'
+end
View
@@ -9,16 +9,17 @@ Gem::Specification.new do |s|
s.version = Tesseract::VERSION
s.platform = Gem::Platform::RUBY
- s.authors = ["Scott Davis"]
+ s.authors = ["Scott Davis", "Martin Samson"]
s.description = %q{Ruby wrapper for google tesseract}
s.summary = %q{Ruby wrapper for google tesseract}
s.email = %q{jetviper21@gmail.com}
s.date = Date.today.to_s
- s.files = `git ls-files`.split("\n")
- s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
- s.require_path = 'tesseract'
+ s.files = ['lib/tesseract.rb', 'lib/tesseract/process.rb', 'lib/tesseract/file_handler.rb', 'lib/tesseract/dependency_checker.rb']
+ s.files += ['lib/tesseract/version.rb']
+ s.require_path = 'lib'
s.homepage = %q{http://github.com/scottdavis/ruby-tesseract}
s.rdoc_options = ["--charset=UTF-8"]
s.required_rubygems_version = ">= 1.3.6"
s.add_development_dependency "bundler", ">= 1.0.0"
-end
+ s.required_ruby_version = '>= 1.8.6'
+end
Oops, something went wrong.

0 comments on commit c865813

Please sign in to comment.