/
selection.pxi
173 lines (130 loc) · 5.64 KB
/
selection.pxi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
cdef class CSSSelector:
    """A CSS selector parsed with MyCSS and matched with a Modest finder.

    The selector string is parsed once in ``__init__``; ``find`` can then be
    called with different scope nodes. All native objects created here are
    released in ``__dealloc__``.
    """
    cdef char *c_selector
    cdef mycss_entry_t *css_entry
    cdef modest_finder_t *finder
    cdef mycss_selectors_list_t *selectors_list

    def __init__(self, str selector):
        """Parse ``selector`` and create the finder used by ``find``.

        Raises:
            RuntimeError: if the MyCSS parser objects cannot be initialised.
            ValueError: if ``selector`` is not a valid CSS selector.
        """
        selector_pybyte = selector.encode('UTF-8')
        # NOTE: ``c_selector`` borrows the buffer of the local bytes object
        # above, so it is only safe to dereference while ``__init__`` is
        # running. Do not read it from other methods.
        self.c_selector = selector_pybyte

        # In order to propagate errors these methods should return no value
        self._create_css_parser()
        self._prepare_selector(self.css_entry, self.c_selector, len(self.c_selector))
        self.finder = modest_finder_create_simple()

    cdef myhtml_collection_t* find(self, myhtml_tree_node_t* scope):
        """Find all possible matches.

        Returns the collection filled in by the finder, or NULL if the finder
        did not produce one. The caller destroys the returned collection.
        """
        cdef myhtml_collection_t *collection

        collection = NULL
        modest_finder_by_selectors_list(self.finder, scope, self.selectors_list, &collection)
        return collection

    cdef _create_css_parser(self):
        """Create and initialise the MyCSS object and the entry stored in ``self.css_entry``.

        Raises:
            RuntimeError: if either initialisation step reports a non-zero status.
        """
        cdef mystatus_t status
        cdef mycss_t *mycss = mycss_create()

        status = mycss_init(mycss)
        if status != 0:
            # No entry references ``mycss`` yet, so ``__dealloc__`` cannot
            # reach it; release it here to avoid a leak.
            mycss_destroy(mycss, 1)
            raise RuntimeError("Can't init MyCSS object.")

        self.css_entry = mycss_entry_create()
        status = mycss_entry_init(mycss, self.css_entry)
        if status != 0:
            # ``self.css_entry`` is already assigned, so cleanup is left to
            # ``__dealloc__``.
            raise RuntimeError("Can't init MyCSS Entry object.")

    cdef _prepare_selector(self, mycss_entry_t *css_entry,
                           const char *selector, size_t selector_size):
        """Parse ``selector`` (UTF-8, ``selector_size`` bytes) into ``self.selectors_list``.

        Raises:
            ValueError: if parsing returns NULL or the parsed list is flagged as bad.
        """
        cdef mystatus_t out_status

        self.selectors_list = mycss_selectors_parse(mycss_entry_selectors(css_entry),
                                                    myencoding_t.MyENCODING_UTF_8,
                                                    selector, selector_size,
                                                    &out_status)

        # ``flags`` is a bit field: test the BAD bit with ``&``. A logical
        # ``and`` would reject every list that has any flag set.
        if (self.selectors_list == NULL) or (self.selectors_list.flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD):
            raise ValueError("Bad CSS Selectors: %s" % self.c_selector.decode('utf-8'))

    def __dealloc__(self):
        """Release the selectors list, the finder, the entry and its MyCSS object.

        ``__init__`` can raise part-way through, so every pointer is checked
        before use; pointers that were never assigned are still NULL.
        """
        cdef mycss_t *mycss

        if self.css_entry != NULL and self.selectors_list != NULL:
            mycss_selectors_list_destroy(mycss_entry_selectors(self.css_entry), self.selectors_list, 1)

        if self.finder != NULL:
            modest_finder_destroy(self.finder, 1)

        if self.css_entry != NULL:
            # Grab the owning MyCSS object before the entry is freed.
            mycss = self.css_entry.mycss
            mycss_entry_destroy(self.css_entry, 1)
            mycss_destroy(mycss, 1)
cdef class Selector:
    """An advanced CSS selector that supports additional operations.

    Think of it as a toolkit that mimics some of the features of XPath.

    Please note, this is an experimental feature that can change in the future.
    """
    cdef Node node
    cdef list nodes

    def __init__(self, Node node, query):
        """custom init, because __cinit__ doesn't accept C types

        ``node`` is the search scope. When ``query`` is truthy the initial
        matches are the nodes it selects under ``node``; otherwise the only
        match is ``node`` itself.
        """
        self.node = node
        self.nodes = find_nodes(node.parser, node.node, query) if query else [node, ]

    cpdef css(self, str query):
        """Evaluate CSS selector against current scope.

        Runs ``query`` under every current match, replaces the matches with
        the combined results and returns ``self`` so calls can be chained.
        """
        cdef Node current_node
        nodes = list()
        for node in self.nodes:
            current_node = node
            nodes.extend(find_nodes(self.node.parser, current_node.node, query))
        self.nodes = nodes
        return self

    @property
    def matches(self):
        """Returns all possible matches"""
        return self.nodes

    @property
    def any_matches(self):
        """Returns True if there are any matches"""
        return bool(self.nodes)

    def text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Filter all current matches given text.

        Keeps only the nodes whose text contains ``text``. ``deep``,
        ``separator`` and ``strip`` are passed through to ``node.text``.
        Returns ``self`` so calls can be chained.
        """
        nodes = []
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                nodes.append(node)
        self.nodes = nodes
        return self

    def any_text_contains(self, str text, bool deep=True, str separator='', bool strip=False):
        """Returns True if any node in the current search scope contains specified text"""
        for node in self.nodes:
            node_text = node.text(deep=deep, separator=separator, strip=strip)
            if node_text and text in node_text:
                return True
        return False

    def attribute_longer_than(self, str attribute, int length, str start = None):
        """Returns True if any match has ``attribute`` longer than ``length``.

        When ``start`` is given and occurs in the attribute value, only the
        part after its first occurrence is measured. Nodes without the
        attribute (or with no value for it) are skipped.

        Similar to `string-length` in XPath.
        """
        for node in self.nodes:
            attr = node.attributes.get(attribute)
            if attr is None:
                # Missing attribute: nothing to measure, and len(None) would raise.
                continue
            if start and start in attr:
                attr = attr[attr.find(start) + len(start):]
            if len(attr) > length:
                return True
        return False
cdef find_nodes(HTMLParser parser, myhtml_tree_node_t *node, str query):
    """Return a list of ``Node`` objects for everything ``query`` matches under ``node``.

    The list is empty when the selector yields no collection. Each wrapper is
    initialised with ``parser``.
    """
    cdef CSSSelector compiled = CSSSelector(query)
    cdef myhtml_collection_t *matched

    found = []
    matched = compiled.find(node)

    if matched != NULL:
        for idx in range(matched.length):
            wrapped = Node()
            wrapped._init(matched.list[idx], parser)
            found.append(wrapped)
        # The wrappers now hold the raw node pointers; the collection itself
        # is no longer needed.
        myhtml_collection_destroy(matched)

    return found
cdef bool find_matches(HTMLParser parser, myhtml_tree_node_t *node, tuple selectors):
    """Return True if any query in ``selectors`` matches at least one node under ``node``.

    Queries are tried in order and the search stops at the first one that
    produces a non-empty collection. ``parser`` is not used here.
    """
    cdef myhtml_collection_t *collection
    cdef CSSSelector selector
    cdef int collection_size

    for query in selectors:
        selector = CSSSelector(query)
        collection = selector.find(node)
        if collection == NULL:
            continue

        # Read the length BEFORE destroying the collection; touching
        # ``collection`` afterwards would be a use-after-free.
        collection_size = collection.length
        myhtml_collection_destroy(collection)

        if collection_size > 0:
            return True
    return False