Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Add <th> parsing support

  • Loading branch information...
commit b3e98272fbe42635e43f52d567c032bc52bf1b38 1 parent 84e6e03
@siuying authored
Showing with 86 additions and 10 deletions.
  1. +1 −1  lib/table_parser/parser.rb
  2. +85 −9 test/test_table_parser.rb
View
2  lib/table_parser/parser.rb
@@ -9,7 +9,7 @@ def self.extract_table(doc, xpath)
rows = []
table = doc.xpath(xpath)
rows = table.xpath("./tr").collect do |row|
- row.xpath("./td").collect do |col|
+ row.xpath("./td|./th").collect do |col|
col
end
end
View
94 test/test_table_parser.rb
@@ -5,7 +5,16 @@
class TestTableParser < Test::Unit::TestCase
def test_parse_rowspan
- html = open("rowspan.html").read
+ html = <<EOF
+ <html><body><table>
+
+ <tr><td>A</td><td>B</td></tr>
+ <tr><td rowspan="2">1</td><td>2</td></tr>
+ <tr><td rowspan="2">3</td></tr>
+ <tr><td>4</td></tr>
+
+ </table></body></html>
+EOF
doc = Nokogiri::HTML(html)
table = TableParser::Table.new doc, "/html/body/table", {:dup_rows => false, :dup_cols => false}
assert_equal(2, table.columns.size, 'header_count should = 2 ')
@@ -42,7 +51,21 @@ def test_parse_rowspan_disable_dup
end
def test_parse_colspan
- html = open("colspan.html").read
+ html = <<EOF
+ <html><body><table>
+ <tr><td>A</td><td colspan="2">B</td></tr>
+ <tr><td rowspan="2">A1</td><td>B1</td><td>C1</td></tr>
+ <tr><td>B2</td><td>C2</td></tr>
+ <tr><td>A3</td><td>B3</td><td>C3</td></tr>
+ <tr><td>A4</td><td colspan="2" rowspan="2">B4</td></tr>
+ <tr><td>A5</td></tr>
+ <tr><td rowspan="2">A1</td><td>B1</td><td>C1</td></tr>
+ <tr><td>B2</td><td>C2</td></tr>
+ <tr><td>A3</td><td>B3</td><td>C3</td></tr>
+ <tr><td>A4</td><td colspan="2" rowspan="2">B4</td></tr>
+ <tr><td>A5</td></tr>
+ </table></body></html>
+EOF
doc = Nokogiri::HTML(html)
table = TableParser::Table.new doc, "/html/body/table"
assert_equal(3, table.columns.size, 'header_count should = 3 ')
@@ -81,7 +104,19 @@ def test_parse_complex
end
def test_parse_complex2
- html = open("complex2.html").read
+ html = <<EOF
+ <html><body><table><tr><td>Header1</td><td>Header2</td><td>Header3</td><td>Header4</td></tr>
+ <tr><td rowspan="3">A1</td><td>A2</td><td rowspan="2">A3</td><td>A4</td></tr>
+ <tr><td>B2</td><td>B4</td></tr>
+ <tr><td>C2</td><td rowspan="2">C3</td><td>C4</td></tr>
+ <tr><td rowspan="3">D1</td><td>D2</td><td>D4</td></tr>
+ <tr><td>E2</td><td rowspan="2">E3</td><td>E4</td></tr>
+ <tr><td>F2</td><td>F4</td></tr>
+ <tr><td rowspan="3">G1</td><td>G2</td><td rowspan="2">G3</td><td>G4</td></tr>
+ <tr><td>H2</td><td>H4</td></tr>
+ <tr><td>I2</td><td>I3</td><td>I4</td></tr>
+ </table></body></html>
+EOF
doc = Nokogiri::HTML(html)
table = TableParser::Table.new doc, "/html/body/table"
@@ -114,7 +149,15 @@ def test_parse_noheader
end
def test_parse_complex_colrowspan
- html = open("table_rowcol.html").read
+ html = <<EOF
+ <html><body><table><tr><td>A</td><td>B</td><td>C</td><td>D</td><td>E</td></tr>
+ <tr><td rowspan="5">1</td><td>2</td><td>3</td><td>4</td><td>5</td></tr>
+ <tr><td rowspan="2" colspan="2">2b</td><td>4b</td><td>5b</td></tr>
+ <tr><td rowspan="2">4c</td><td>5c</td></tr>
+ <tr><td rowspan="2">2d</td><td>3d</td><td>5d</td></tr>
+ <tr><td>3e</td><td>4e</td><td>5e</td></tr>
+ </table></body></html>
+EOF
doc = Nokogiri::HTML(html)
table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
@@ -145,11 +188,44 @@ def test_parse_complex_colrowspan
assert_equal(5, table[4].size)
end
- def test_web
- html = open("test4.html").read
- doc = Nokogiri::HTML::Document.parse(html, nil, "Shift_JIS")
- table = TableParser::Table.new doc, "/html/body/div/div[3]/div/div[2]/table", {:header => false, :dup_rows => false}
- puts table.columns[0].size
+ def test_parse_th
+ html = <<EOF
+ <html><body><table><tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th></tr>
+ <tr><td rowspan="5">1</td><td>2</td><td>3</td><td>4</td><td>5</td></tr>
+ <tr><td rowspan="2" colspan="2">2b</td><td>4b</td><td>5b</td></tr>
+ <tr><td rowspan="2">4c</td><td>5c</td></tr>
+ <tr><td rowspan="2">2d</td><td>3d</td><td>5d</td></tr>
+ <tr><td>3e</td><td>4e</td><td>5e</td></tr>
+ </table></body></html>
+EOF
+
+ doc = Nokogiri::HTML(html)
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => false}
+ puts table
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
+ assert_equal(1, table[0].size)
+ assert_equal(3, table[1].size)
+ assert_equal(3, table[2].size)
+ assert_equal(4, table[3].size)
+ assert_equal(5, table[4].size)
+
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => false, :dup_rows => true}
+ puts table
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
+ assert_equal(5, table[0].size)
+ assert_equal(5, table[1].size)
+ assert_equal(3, table[2].size)
+ assert_equal(5, table[3].size)
+ assert_equal(5, table[4].size)
+
+ table = TableParser::Table.new doc, "/html/body/table", {:dup_cols => true, :dup_rows => true}
+ puts table
+ assert_equal(5, table.columns.size, 'header_count should = 5 ')
+ assert_equal(5, table[0].size)
+ assert_equal(5, table[1].size)
+ assert_equal(5, table[2].size)
+ assert_equal(5, table[3].size)
+ assert_equal(5, table[4].size)
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.