This repository has been archived by the owner on May 22, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 44
/
uast_ids_to_bag.py
110 lines (90 loc) · 3.61 KB
/
uast_ids_to_bag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from collections import defaultdict, deque
import bblfsh
from sourced.ml.algorithms import TokenParser, NoopTokenParser
from sourced.ml.algorithms.uast_to_bag import Uast2BagBase
from sourced.ml.utils import bblfsh_roles
def uast2sequence(root):
    """
    Flattens a UAST into a list of nodes in post-order.

    The tree is walked iteratively with an explicit stack. The children of a node are visited
    in the order given by its ``children`` attribute, and a node is emitted only after all of
    its children have been emitted, so the root is always the last element.

    :param root: The UAST root node. Every node must expose an iterable ``children`` attribute.
    :return: The list of all the nodes reachable from `root`, in post-order.
    """
    ordered = []
    # Each frame pairs a node with the queue of its children which are still to be visited.
    frames = [(root, deque(root.children))]
    while frames:
        current, pending = frames[-1]
        if pending:
            # Descend into the next unvisited child first.
            successor = pending.popleft()
            frames.append((successor, deque(successor.children)))
            continue
        # All the children are done: the node itself can be emitted.
        frames.pop()
        ordered.append(current)
    return ordered
class FakeVocabulary:
    """
    Identity vocabulary: indexing with any item returns that very item.

    It stands in for a real token -> index mapping when no mapping is wanted.
    """

    # FIXME(zurk): change to simple function. Vadim Markovtsev comments:
    # > would rather made this a simple function and change roles2index
    # type from [] to callable. Saves time to understand.

    def __getitem__(self, item):
        """
        :param item: Any object.
        :return: `item` itself, unchanged.
        """
        return item

    def __contains__(self, item):
        """
        Reports every item as present, consistently with :meth:`__getitem__` never failing.

        Without this method the ``in`` operator falls back to probing ``self[0]``, ``self[1]``,
        ... until IndexError is raised. :meth:`__getitem__` never raises, so that probe would
        never finish for an item which is not a non-negative integer.

        :param item: Any object.
        :return: Always True.
        """
        return True
class UastTokens2Bag(Uast2BagBase):
    """
    Converts a UAST to a weighted bag of tokens via xpath.

    The nodes which contribute tokens are selected with the :attr:`XPATH` query, so concrete
    subclasses are expected to override that attribute.
    """

    XPATH = None  # Should be overridden in child class

    def __init__(self, token2index=None, token_parser=None):
        """
        :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify token parser if you want to use a custom one.
                             :class:`NoopTokenParser` is used if it is not specified.
        """
        self._token2index = FakeVocabulary() if token2index is None else token2index
        self._token_parser = NoopTokenParser() if token_parser is None else token_parser

    @property
    def token_parser(self):
        """
        :return: The token parser which is applied to every node token.
        """
        return self._token_parser

    @property
    def token2index(self):
        """
        :return: The mapping from tokens to bag keys.
        """
        return self._token2index

    def __call__(self, uast):
        """
        Converts a UAST to a weighted bag-of-words. The weights are words frequencies.
        The tokens are preprocessed by _token_parser.

        :param uast: The UAST root node.
        :return: defaultdict(int) which maps each bag key to the number of its occurrences. \
                 Tokens which are missing in token2index are skipped.
        :raises NotImplementedError: if XPATH was not overridden by the subclass.
        """
        if self.XPATH is None:
            # Fail with an explicit message instead of handing None to bblfsh.filter.
            raise NotImplementedError(
                "%s.XPATH must be overridden in the child class" % type(self).__name__)
        bag = defaultdict(int)
        for node in bblfsh.filter(uast, self.XPATH):
            for sub in self._token_parser.process_token(node.token):
                # Only the vocabulary lookup is guarded: a KeyError there means "unknown token".
                try:
                    key = self._token2index[sub]
                except KeyError:
                    continue
                bag[key] += 1
        return bag
class UastIds2Bag(UastTokens2Bag):
    """
    Builds a bag-of-identifiers from a UAST.
    """

    XPATH = "//*[@roleIdentifier]"

    def __init__(self, token2index=None, token_parser=None):
        """
        :param token2index: The mapping from tokens to bag keys. No mapping is performed if it \
                            is None.
        :param token_parser: A custom token parser. A fresh :class:`TokenParser` is created if \
                             it is None.
        """
        if token_parser is None:
            token_parser = TokenParser()
        super().__init__(token2index, token_parser)

    def __call__(self, uast):
        """
        HOTFIX for https://github.com/bblfsh/client-python/issues/92

        Counts the identifiers of a UAST. Each node which carries the IDENTIFIER role has its
        token split by _token_parser, and every resulting part increments the corresponding bag
        entry. __call__ is overridden so that the tree is walked with `uast2sequence` and
        `bblfsh.filter` is never invoked.

        :param uast: The UAST root node.
        :return: defaultdict(int) which maps each bag key to its frequency. Parts which are \
                 missing in token2index are skipped.
        """
        counts = defaultdict(int)
        for node in uast2sequence(uast):
            # Guard clause: anything which is not an identifier is ignored.
            if bblfsh_roles.IDENTIFIER not in node.roles:
                continue
            for part in self._token_parser.process_token(node.token):
                try:
                    counts[self._token2index[part]] += 1
                except KeyError:
                    # The part is not in the vocabulary.
                    continue
        return counts