Permalink
Browse files

refactor: use doc as parameter, not html string

  • Loading branch information...
1 parent 1fa55f3 commit 8b251c8bdc3992e54b449510e1de1d3d63e1f34b @siuying committed Jan 5, 2010
Showing with 46 additions and 42 deletions.
  1. +8 −6 README.rdoc
  2. +1 −3 lib/table_parser/parser.rb
  3. +2 −2 lib/table_parser/table.rb
  4. +35 −31 test/test_table_parser.rb
View
@@ -18,21 +18,23 @@ Parsing table could be difficult when its structure contains colspan or rowspan.
Use TableParser::Table to create a parsed HTML table.
For example, the following code:
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
- <tr><td>3</td></tr></table></body></html>",
- "/html/body/table"
+ <tr><td>3</td></tr></table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table"
Results in the following parsed table:
Table<TableColumn<name=A, children=[1],[1]>,TableColumn<name=B, children=[2],[3]>>
Note that the first column contains a duplicated item, because the first data row contains a cell with a "rowspan" attribute. If this is not desired, use the following syntax to skip duplication:
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
- <tr><td>3</td></tr></table></body></html>",
- "/html/body/table", {:dup_rows => false})
+ <tr><td>3</td></tr></table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false})
Which results in the following parsed table:
@@ -5,9 +5,7 @@
module TableParser
class Parser
# extract_table("http://www.bs4.jp/table/index.html", "/html/body/table/tr/td/table")
- def self.extract_table(input, xpath)
- doc = Nokogiri::HTML(input)
-
+ def self.extract_table(doc, xpath)
rows = []
table = doc.xpath(xpath)
rows = table.xpath("./tr").collect do |row|
@@ -1,8 +1,8 @@
module TableParser
class Table
attr_reader :nodes, :columns
- def initialize(input, xpath_to_table="//table[0]", duplicate_colspan=true)
- table = Parser.extract_table(input, xpath_to_table)
+ def initialize(doc, xpath_to_table="//table[0]", duplicate_colspan=true)
+ table = Parser.extract_table(doc, xpath_to_table)
@columns = Parser.extract_column_headers(table)
@nodes = Parser.extract_nodes(table, @columns, duplicate_colspan)
end
View
@@ -3,10 +3,11 @@
class TestTableParser < Test::Unit::TestCase
def test_parse_rowspan
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
- <tr><td>3</td></tr></table></body></html>",
- "/html/body/table"
+ <tr><td>3</td></tr></table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table"
puts table
assert_equal(2, table.columns.size, 'header_count should = 2 ')
@@ -15,23 +16,24 @@ def test_parse_rowspan
end
def test_parse_rowspan_disable_dup
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td>B</td></tr>\
+ html = "<html><body><table><tr><td>A</td><td>B</td></tr>\
<tr><td rowspan=\"2\">1</td><td>2</td></tr> \
- <tr><td>3</td></tr></table></body></html>",
- "/html/body/table", false
-
- puts table
+ <tr><td>3</td></tr></table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table", false
+
assert_equal(2, table.columns.size, 'header_count should = 2 ')
assert_equal(1, table[0].size)
assert_equal(2, table[1].size)
end
def test_parse_colspan
- table = TableParser::Table.new "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
+ html = "<html><body><table><tr><td>A</td><td colspan=\"2\">B</td></tr>\
<tr><td rowspan=\"2\">A1</td><td>B1</td><td>C1</td></tr> \
<tr><td>B2</td><td>C2</td></tr>\
- <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>",
- "/html/body/table"
+ <tr><td>A3</td><td>B3</td><td>C3</td></tr><tr><td>A4</td><td>B4</td><td>C4</td></tr></table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table"
assert_equal(3, table.columns.size, 'header_count should = 3 ')
assert_equal(4, table[0].size)
@@ -41,12 +43,13 @@ def test_parse_colspan
end
def test_parse_complex
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
- <tr><td>B2</td><td>B4</td></tr>\
- <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
- </table></body></html>",
- "/html/body/table"
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>B4</td></tr>\
+ <tr><td>B2</td><td>B4</td></tr>\
+ <tr><td>C2</td><td>C3</td><td>B4</td></tr>\
+ </table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table"
assert_equal 4, table.columns.size
assert_equal 3, table[0].size
@@ -55,18 +58,19 @@ def test_parse_complex
end
def test_parse_complex2
- table = TableParser::Table.new "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
- <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
- <tr><td>B2</td><td>B4</td></tr>\
- <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
- <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
- <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
- <tr><td>F2</td><td>F4</td></tr>\
- <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
- <tr><td>H2</td><td>H4</td></tr>\
- <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
- </table></body></html>",
- "/html/body/table"
+ html = "<html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>\
+ <tr><td rowspan=\"3\">A1</td><td>A2</td><td rowspan=\"2\">A3</td><td>A4</td></tr>\
+ <tr><td>B2</td><td>B4</td></tr>\
+ <tr><td>C2</td><td rowspan=\"2\">C3</td><td>C4</td></tr>\
+ <tr><td rowspan=\"3\">D1</td><td>D2</td><td>D4</td></tr>\
+ <tr><td>E2</td><td rowspan=\"2\">E3</td><td>E4</td></tr>\
+ <tr><td>F2</td><td>F4</td></tr>\
+ <tr><td rowspan=\"3\">G1</td><td>G2</td><td rowspan=\"2\">G3</td><td>G4</td></tr>\
+ <tr><td>H2</td><td>H4</td></tr>\
+ <tr><td>I2</td><td>I3</td><td>I4</td></tr>\
+ </table></body></html>"
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table"
assert_equal 4, table.columns.size
assert_equal 9, table[0].size
@@ -76,8 +80,8 @@ def test_parse_complex2
end
def test_parse_web
- table = TableParser::Table.new open("test.html").read,
- "/html/body/table"
+ doc = Nokogiri::HTML(open("test.html").read)
+ table = TableParser::Table.new doc, "/html/body/table"
assert_equal 11, table.columns.size
assert_equal 9, table[0].size

0 comments on commit 8b251c8

Please sign in to comment.