diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5fff1d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +pkg diff --git a/Manifest b/Manifest new file mode 100644 index 0000000..6fbc8b1 --- /dev/null +++ b/Manifest @@ -0,0 +1,6 @@ +Manifest +README.markdown +Rakefile +bin/eurl +lib/embed_html.rb +lib/embed_html/embeder.rb diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..38ccb8c --- /dev/null +++ b/README.markdown @@ -0,0 +1,17 @@ +Install +======= + + gem install embed_html + +Usage +======= + + eurl _<input-url>_ _<output-file>_ + +Example: + + eurl http://www.google.com google.html + +Output: + + \ No newline at end of file diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..dc5223d --- /dev/null +++ b/Rakefile @@ -0,0 +1,15 @@ +# Rakefile +require 'rubygems' +require 'rake' +require 'echoe' + +Echoe.new('embed_html', '0.1.0') do |p| + p.description = "Download and embed images in html using base64 data encoding" + p.summary = "Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding" + p.url = "http://github.com/siuying/embed_html" + p.author = "Francis Chong" + p.email = "francis@ignition.hk" + p.ignore_pattern = ["tmp/*", "script/*", "*.html"] + p.runtime_dependencies = ["hpricot", "typhoeus"] +end + diff --git a/bin/eurl b/bin/eurl new file mode 100644 index 0000000..fcb15aa --- /dev/null +++ b/bin/eurl @@ -0,0 +1,19 @@ +#!/usr/bin/env ruby +# Command-line entry point: processes <input-url> with EmbedHtml::Embeder and writes the returned HTML to <output-file>. +require 'logger' +require 'embed_html' + +url = ARGV[0] +file = ARGV[1] + +if url && file + log = Logger.new($stdout) + log.level = Logger::INFO + + html = EmbedHtml::Embeder.new(url, log).process + File.open(file, 'w') {|f| f.write(html)} + +else + puts "usage: eurl <input-url> <output-file>" + +end \ No newline at end of file diff --git a/lib/embed_html.rb b/lib/embed_html.rb new file mode 100644 index 0000000..dcc347b --- /dev/null +++ b/lib/embed_html.rb @@ -0,0 +1,4 @@ +path = File.dirname(__FILE__) +$:.unshift(path) unless $:.include?(path) + 
# lib/embed_html/embeder.rb
module EmbedHtml
  # Downloads an HTML page and replaces the +src+ of every <img> element with
  # a Base64 +data:+ URI built from the downloaded image, so the returned
  # markup no longer points at separate image files.
  class Embeder
    # Concurrency limit handed to Typhoeus::Hydra for the image downloads.
    MAX_CONCURRENCY = 5

    attr_accessor :url
    attr_accessor :logger

    # url    - address of the page to process (a String, or any object whose
    #          #to_s yields the address).
    # logger - receives progress and error messages; defaults to a Logger
    #          writing to standard output.
    def initialize(url, logger=Logger.new($stdout))
      @logger = logger
      @url = url
    end

    # Fetches the page at #url, queues one download per embeddable <img>,
    # runs the queue, and rewrites each image whose download succeeded.
    #
    # Images that cannot be resolved or downloaded keep their original +src+;
    # the failure is reported through #logger instead of being raised.
    #
    # Returns the resulting document serialized with +to_html+.
    def process
      @logger.info "downloading url: #{@url}"
      html = Typhoeus::Request.get(@url.to_s).body
      doc = Hpricot(html)

      hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
      doc.search("//img").each { |img| queue_image(hydra, img) }
      hydra.run
      @logger.info "done"
      doc.to_html
    end

    private

    # Resolves the image's +src+ against the page URL and queues its download
    # on +hydra+. Tags with no usable +src+, or whose +src+ is already a
    # +data:+ URI, are left untouched because there is nothing to fetch.
    def queue_image(hydra, img)
      src = img.attributes['src'].to_s.strip
      return if src.empty? || src =~ /\Adata:/i

      image_url = URI.join(@url.to_s, src)
      @logger.debug "queue download image: #{image_url}"

      request = Typhoeus::Request.new(image_url.to_s)
      request.on_complete { |response| embed_response(img, image_url, response) }
      hydra.queue request
    rescue StandardError => e
      # Report the raw src: the resolved URL does not exist yet when
      # URI.join is the call that failed.
      @logger.error "failed downloading image: #{src} (#{e.message})"
    end

    # Replaces +img+'s +src+ with a data URI built from +response+. Does
    # nothing unless the response has a 2xx status, a non-empty body and a
    # Content-Type, so error pages and empty replies are never embedded.
    def embed_response(img, image_url, response)
      status = response.code.to_i
      unless status >= 200 && status < 300
        @logger.error "failed downloading image: #{image_url} (HTTP #{status})"
        return
      end

      data = response.body
      type = response.headers_hash["Content-Type"]
      return if data.nil? || data.empty? || type.to_s.empty?

      # Base64.encode64 adds a line feed after every 60 output characters;
      # strip them so the attribute value stays on a single line.
      data_b64 = Base64.encode64(data).delete("\n")
      img.attributes['src'] = "data:#{type};base64,#{data_b64}"
    end
  end
end