-
Notifications
You must be signed in to change notification settings - Fork 1
/
parser.rb
82 lines (74 loc) · 2.24 KB
/
parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
require 'rubygems'
require 'nokogiri'
require 'open-uri'
module TableParser
  # Stateless helpers that turn a parsed HTML table into raw rows, column
  # headers, and a grid of nodes in which rowspan/colspan cells are expanded.
  class Parser
    # Collects the <td> cells of every <tr> found directly under the node(s)
    # selected by +xpath+.
    #
    # @param doc [#xpath] an already-parsed document; presumably a Nokogiri
    #   document, since this file requires nokogiri (confirm against callers).
    #   Note that this is NOT a URL: the method only calls +doc.xpath+.
    # @param xpath [String] expression selecting the table element(s),
    #   e.g. "/html/body/table/tr/td/table"
    # @return [Array<Array>] one Array of cell elements per table row
    def self.extract_table(doc, xpath)
      doc.xpath(xpath).xpath("./tr").map { |row| row.xpath("./td").to_a }
    end

    # Builds one TableColumn per logical column, using the first row to learn
    # how many columns there are (a cell with colspan N yields N columns).
    #
    # When +has_header+ is truthy the first row is treated as the header row:
    # each column wraps its header cell and the row is REMOVED from +rows+
    # (the caller's array is mutated). Otherwise every column wraps +nil+ and
    # +rows+ is left untouched.
    #
    # @param rows [Array<Array>] rows of cells; each cell is read with
    #   +cell["colspan"]+
    # @param dup_rows [Object] unused here; kept for signature compatibility
    # @param dup_cols [Object] unused here; kept for signature compatibility
    # @param has_header [Boolean] whether the first row holds column titles
    # @return [Array<TableColumn>] empty when +rows+ has no first row
    def self.extract_column_headers(rows, dup_rows, dup_cols, has_header)
      headers = []
      first_row = rows.first
      # An empty table has no columns; previously this raised NoMethodError.
      return headers if first_row.nil?

      first_row.each do |cell|
        wrapped = has_header ? cell : nil
        colspan_of(cell).times { headers << TableColumn.new(wrapped) }
      end
      rows.delete_at(0) if has_header
      headers
    end

    # Wraps every cell in a TableNode, expands spanning cells so that each row
    # lines up with +headers+, and appends every non-empty node to the
    # +children+ of the header at its column index.
    #
    # A cell with colspan > 1 gets a filler placed immediately to its right
    # carrying the remaining span; a cell with rowspan > 1 gets a filler
    # inserted at the same column index of the following row. Fillers are
    # copies of the cell (TableNode) when the matching dup_* flag is truthy,
    # and EmptyTableNode placeholders otherwise.
    #
    # @param rows [Array<Array>] data rows (header row already removed)
    # @param headers [Array<TableColumn>] columns; cells beyond the last
    #   header are left unprocessed
    # @param dup_rows [Boolean] duplicate a cell into the rows it spans
    # @param dup_cols [Boolean] duplicate a cell into the columns it spans
    # @return [Array<Array>] grid of TableNode / EmptyTableNode objects
    def self.extract_nodes(rows, headers, dup_rows, dup_cols)
      data = rows.map { |row| row.map { |cell| TableNode.new(cell) } }

      data.each_with_index do |row, row_index|
        # The row may grow during this walk; fillers inserted to the right are
        # visited on later iterations, which is how spans wider than 2 unfold.
        row.each_index do |col_index|
          header = headers[col_index]
          next unless header

          node = row[col_index]
          header.children << node unless node.instance_of?(EmptyTableNode)

          if node.colspan > 1
            filler = if dup_cols
                       TableNode.new(node.element, node.rowspan, node.colspan - 1)
                     else
                       EmptyTableNode.new(node.rowspan, node.colspan - 1)
                     end
            # Insert AFTER the current cell. Inserting at col_index would push
            # the spanning cell to the right, where it would be processed
            # again and keep spawning fillers until the headers ran out.
            row.insert(col_index + 1, filler)
          end

          below = data[row_index + 1]
          next unless node.rowspan > 1 && below

          carried = if dup_rows
                      TableNode.new(node.element, node.rowspan - 1)
                    else
                      EmptyTableNode.new(node.rowspan - 1)
                    end
          below.insert(col_index, carried)
        end
      end
      data
    end

    # Number of columns a cell occupies, never less than 1. A missing colspan
    # attribute reads as nil, and nil.to_i is 0, hence the clamp.
    def self.colspan_of(cell)
      [cell["colspan"].to_i, 1].max
    rescue StandardError
      # Best effort, as before: a cell that cannot report a colspan counts as
      # a single column.
      1
    end
    private_class_method :colspan_of
  end
end