Skip to content

Commit

Permalink
Better encoding and wrapping
Browse files (browse the repository at this point in the history)
  • Loading branch information
singpolyma committed Jul 28, 2010
1 parent 2f2e544 commit 249a986
Showing 1 changed file with 16 additions and 6 deletions.
22 changes: 16 additions & 6 deletions html2markdown.rb
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
# encoding: utf-8
require 'nokogiri'
require 'uri'

Expand All @@ -7,6 +8,7 @@ def initialize(str, baseurl=nil)
@links = []
@baseuri = (baseurl ? URI::parse(baseurl) : nil)
@section_level = 0
@encoding = str.encoding
@markdown = output_for(Nokogiri::HTML(str, baseurl).root).gsub(/\n\n+/, "\n\n")
end

Expand Down Expand Up @@ -48,6 +50,7 @@ def add_link(link)
end

def wrap(str)
return str if str =~ /\n/
out = ''
line = []
str.split(/[ \t]+/).each {|word|
Expand All @@ -57,7 +60,7 @@ def wrap(str)
line = []
end
}
out << line.join(' ')
out << line.join(' ') + (str[-1..-1] =~ /[ \t\n]/ ? str[-1..-1] : '')
end

def output_for(node)
Expand All @@ -67,7 +70,7 @@ def output_for(node)
when 'br'
" \n"
when 'p', 'div'
"\n\n#{output_for_children(node)}\n\n"
"\n\n#{wrap(output_for_children(node))}\n\n"
when 'section', 'article'
@section_level += 1
o = "\n\n----\n\n#{output_for_children(node)}\n\n"
Expand All @@ -77,7 +80,7 @@ def output_for(node)
"\n\n" + ('#'*($1.to_i+@section_level) + ' ' + output_for_children(node)) + "\n\n"
when 'blockquote'
@section_level += 1
o = ("\n\n> #{output_for_children(node).gsub(/\n/, "\n> ")}\n\n").gsub(/> \n(> \n)+/, "> \n")
o = ("\n\n> #{wrap(output_for_children(node)).gsub(/\n/, "\n> ")}\n\n").gsub(/> \n(> \n)+/, "> \n")
@section_level -= 1
o
when 'ul'
Expand All @@ -93,7 +96,7 @@ def output_for(node)
"#{i}. #{output_for_children(el).gsub(/^(\t)|( )/, "\t\t").gsub(/^>/, "\t>")}\n"
}.join + "\n\n"
when 'pre', 'code'
block = "\t" + output_for_children(node).gsub(/\n/, "\n\t")
block = "\t" + wrap(output_for_children(node)).gsub(/\n/, "\n\t")
if block.count("\n") < 1
"`#{output_for_children(node)}`"
else
Expand Down Expand Up @@ -127,9 +130,16 @@ def output_for(node)
when 'th', 'td'
"|#{'=' if node.name == 'th'}#{output_for_children(node)}|"
when 'text'
wrap(node.content)
# Sometimes Nokogiri lies. Force the encoding back to what we know it is
if (c = node.content.force_encoding(@encoding)) =~ /\S/
c.gsub!(/\n\n+/, '<$PreserveDouble$>')
c.gsub!(/\s+/, ' ')
c.gsub(/<\$PreserveDouble\$>/, "\n\n")
else
c
end
else
output_for_children(node)
wrap(output_for_children(node))
end
end

Expand Down

0 comments on commit 249a986

Please sign in to comment.