Skip to content

Commit

Permalink
init commit
Browse files Browse the repository at this point in the history
  • Loading branch information
siuying committed May 20, 2010
0 parents commit 021b8b0
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
pkg
6 changes: 6 additions & 0 deletions Manifest
@@ -0,0 +1,6 @@
Manifest
README.markdown
Rakefile
bin/eurl
lib/embed_html.rb
lib/embed_html/embeder.rb
17 changes: 17 additions & 0 deletions README.markdown
@@ -0,0 +1,17 @@
Install
=======

gem install embed_html

Usage
=======

eurl _<input-url>_ _<output-file>_

Example:

eurl http://www.google.com google.html

Output: `google.html`, a copy of the page in which every image is embedded as a Base64 `data:` URI.


15 changes: 15 additions & 0 deletions Rakefile
@@ -0,0 +1,15 @@
# Rakefile
# Packaging tasks for the embed_html gem, declared through Echoe.
require 'rubygems'
require 'rake'
require 'echoe'

Echoe.new('embed_html', '0.1.0') do |gem|
  # Who maintains the gem and where it lives.
  gem.author = 'Francis Chong'
  gem.email  = 'francis@ignition.hk'
  gem.url    = 'http://github.com/siuying/embed_html'

  # Short and long descriptions handed to Echoe.
  gem.summary     = 'Download or process a HTML page, find images there, download them and embed it into the HTML using Base64 data encoding'
  gem.description = 'Download and embed images in html using base64 data encoding'

  # Patterns Echoe should leave out when building the package.
  gem.ignore_pattern = ['tmp/*', 'script/*', '*.html']

  # Gems that must be installed alongside this one.
  gem.runtime_dependencies = %w[hpricot typhoeus]
end

16 changes: 16 additions & 0 deletions bin/eurl
@@ -0,0 +1,16 @@
#!/usr/bin/env ruby
# eurl - download a web page and save a copy whose images are embedded
# inline as Base64 data URIs.
#
# Usage: eurl <URL> <OUTPUT_FILE>
require 'logger' # required explicitly instead of relying on embed_html to load it
require 'embed_html'

url, file = ARGV

# Both arguments are mandatory. Report misuse on stderr and exit non-zero so
# that calling scripts can detect the failure.
abort "usage: eurl <URL> <OUTPUT_FILE>" unless url && file

log = Logger.new($stdout)
log.level = Logger::INFO

html = EmbedHtml::Embeder.new(url, log).process
File.open(file, 'w') { |f| f.write(html) }
4 changes: 4 additions & 0 deletions lib/embed_html.rb
@@ -0,0 +1,4 @@
# Put this lib directory on the load path (once) so the gem's own files can be
# required by their library-relative names.
lib_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

require 'embed_html/embeder'
50 changes: 50 additions & 0 deletions lib/embed_html/embeder.rb
@@ -0,0 +1,50 @@
require 'logger'
require 'open-uri'
require 'hpricot'
require 'uri'
require 'base64'
require 'typhoeus'

module EmbedHtml
  # Downloads an HTML page and rewrites each <img> tag so that its +src+
  # holds the image bytes as an inline Base64 +data:+ URI.
  class Embeder
    # Maximum number of image downloads Typhoeus::Hydra runs at the same time.
    MAX_CONCURRENCY = 5

    attr_accessor :url
    attr_accessor :logger

    # url    - address of the page to fetch; also the base for resolving
    #          relative image paths.
    # logger - receives progress and error messages (defaults to a Logger
    #          writing to $stdout).
    def initialize(url, logger=Logger.new($stdout))
      @logger = logger
      @url = url
    end

    # Fetches the page, downloads every referenced image and replaces each
    # image's +src+ with a data URI. Images that cannot be resolved or
    # downloaded keep their original +src+.
    #
    # Returns the rewritten document as an HTML string.
    def process
      @logger.info "downloading url: #{@url}"
      html = Typhoeus::Request.get(@url.to_s).body
      doc = Hpricot(html)

      hydra = Typhoeus::Hydra.new(:max_concurrency => MAX_CONCURRENCY)
      doc.search("//img").each { |img| queue_image(hydra, img) }
      hydra.run
      @logger.info "done"
      doc.to_html
    end

    private

    # Queues the download of one <img> element on +hydra+. Elements without a
    # +src+, or whose +src+ is already a data URI, are skipped because there
    # is nothing to download for them.
    def queue_image(hydra, img)
      src = img.attributes['src'].to_s.strip
      return if src.empty? || src =~ /\Adata:/i

      image_url = URI.join(@url.to_s, src)
      @logger.debug "queue download image: #{image_url}"

      request = Typhoeus::Request.new(image_url.to_s)
      request.on_complete do |response|
        embedded = data_uri(response)
        if embedded
          img.attributes['src'] = embedded
        else
          @logger.warn "failed downloading image: #{image_url} (HTTP #{response.code})"
        end
      end
      hydra.queue request
    rescue StandardError => e
      # Log the raw src: image_url is still nil when URI.join is what failed.
      @logger.error "failed downloading image: #{src} (#{e.message})"
    end

    # Builds a data URI from a completed response. Returns nil when the
    # request did not succeed (non-2xx status) or when the body or the
    # Content-Type header is missing, so error pages are never embedded.
    def data_uri(response)
      return nil unless response.code.to_i.between?(200, 299)

      data = response.body
      type = response.headers_hash["Content-Type"].to_s
      return nil if data.nil? || data.empty? || type.empty?

      # encode64 inserts a line feed every 60 characters; strip them so the
      # attribute value is a single uninterrupted Base64 string.
      payload = Base64.encode64(data).delete("\n")
      "data:#{type};base64,#{payload}"
    end
  end
end

0 comments on commit 021b8b0

Please sign in to comment.