This repository has been archived by the owner on Oct 3, 2022. It is now read-only.
/
features.py
323 lines (226 loc) · 8.72 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
"""This file contains the definition of the different features.
There are also support_features, which are basically just used to memoize
intermediate work common to multiple features. They are not persisted, and are
calculated lazily."""
import ast as pyast
from collections import Counter
from glob import glob
import os
import logging
import tokenize
from StringIO import StringIO
import utils
# Registry of public features, keyed by feature name, for use outside
# the module. Serialization assumes that features will not be removed
# once added; adding one doesn't require a migration — on the next load
# its value will default to None.
all_features = {}
_support_features = {}


def feature(func):
    """Decorator: record `func` in the public feature registry."""
    all_features[func.__name__] = func
    return func


def support_feature(func):
    """Decorator: record `func` in the private support-feature registry."""
    _support_features[func.__name__] = func
    return func
#features take a Repo and return their value.
#they assume cwd is the repo's directory
#they can use other feature's (possibly memoized) values with repo._calc()
@support_feature
def all_file_sizes(repo):
    """A map of {filepath: filesize} for all files in the repo.

    Walks the current directory (assumed to be the repo root), pruning
    the .git database and skipping symlinks.
    """
    sizes = {}
    for dirpath, dirnames, filenames in os.walk('.'):
        if '.git' in dirnames:
            dirnames.remove('.git')  # don't enter git db
        for fname in filenames:
            path = os.path.join(dirpath, fname)
            if not os.path.islink(path):
                sizes[path] = os.path.getsize(path)
    return sizes
@support_feature
def src_file_sizes(repo):
    """A map of {filepath: filesize} for .py files only."""
    # .items() instead of the Python-2-only .iteritems(): identical result,
    # and it also runs on Python 3.
    return {f: size for (f, size) in repo._calc('all_file_sizes').items()
            if f.endswith('.py')}
@feature
def num_all_files(repo):
    """Count of every file in the repo, source or not."""
    sizes_by_path = repo._calc('all_file_sizes')
    return len(sizes_by_path)
@feature
def size_all_files(repo):
    """Total filesize of all files in the repo, in bytes."""
    # sum the dict values directly; .itervalues() is Python-2-only and the
    # pass-through generator added nothing
    return sum(repo._calc('all_file_sizes').values())
@feature
def size_src_files(repo):
    """Total filesize of .py files, in bytes."""
    # only the sizes are needed, so iterate .values(); the original used the
    # Python-2-only .iteritems() and never read the filename half of the pair
    return sum(repo._calc('src_file_sizes').values())
@feature
def num_src_files(repo):
    """Count of .py files in the repo."""
    src_sizes = repo._calc('src_file_sizes')
    return len(src_sizes)
@feature
def ratio_src_files(repo):
    """.py files / all files, as a percentage."""
    num_src = repo._calc('num_src_files')
    num_all = repo._calc('num_all_files')
    # the +1 keeps the denominator nonzero for empty repos
    return 100.0 * num_src / (num_all + 1)
@feature
def ratio_vol_src_files(repo):
    """size of .py files / size of all files, as a percentage."""
    src_bytes = repo._calc('size_src_files')
    all_bytes = repo._calc('size_all_files')
    # the +1 keeps the denominator nonzero for empty repos
    return 100.0 * src_bytes / (all_bytes + 1)
@feature
def readme_size(repo):
    """Size of the readme, in bytes.

    0 can mean either nonexistent or 0-length."""
    candidates = glob('*README*')
    if candidates:
        # take the longest filename; hack to emulate GitHub's preference for
        # eg README.md over README
        return utils.filesize_or_zero(max(candidates, key=len))
    return 0
@feature
def setuppy_size(repo):
    """Size of setup.py in bytes (0 if absent or empty)."""
    setup_path = 'setup.py'
    return utils.filesize_or_zero(setup_path)
@feature
def license_size(repo):
    """Size in bytes of the largest LICENSE/COPYING file (0 if none)."""
    candidates = glob('LICENSE*') + glob('COPYING*')
    if candidates:
        return max(utils.filesize_or_zero(fn) for fn in candidates)
    return 0
@feature
def travis_cfg_size(repo):
    """Size of the Travis CI config in bytes (0 if absent or empty)."""
    cfg_path = '.travis.yml'
    return utils.filesize_or_zero(cfg_path)
@feature
def contributing_size(repo):
    """Size in bytes of the largest CONTRIBUTING* file (0 if none)."""
    candidates = glob('CONTRIBUTING*')
    if candidates:
        return max(utils.filesize_or_zero(fn) for fn in candidates)
    return 0
# Code features
@support_feature
def source_contents(repo):
    """A dict {filename: contents (bytes)} for all source files.

    Files that cannot be opened are logged and skipped (best effort).
    """
    contents = {}
    # iterate keys only: the size half of the pair was never used, and
    # .iteritems() was Python-2-only anyway
    for py_file in repo._calc('src_file_sizes'):
        try:
            with open(py_file, 'rb') as f:
                contents[py_file] = f.read()
        except IOError:
            logging.exception("could not open %s/%s", repo._calc('name'), py_file)
    return contents
# too costly to calculate for large samples
#class Pep8Compliance(Feature):
# """pep8 errors and warnings / ast nodes."""
#
# @classmethod
# def _calculate(cls, user_repo, features):
# errors = utils.get_pep8_errors('.')
#
# return 100.0 * errors / NumAstNodes._get_val(user_repo, features)
@support_feature
def asts(repo):
    """A dict {filename: ast} for all .py files that parse successfully."""
    # local renamed from `asts` so it no longer shadows this function's name
    trees = {}
    for src_fn, src in repo._calc('source_contents').items():
        try:
            tree = pyast.parse(src)
        except Exception:
            # not a bare except: don't swallow KeyboardInterrupt/SystemExit.
            # if their code does not compile, ignore it
            # TODO should probably be more strict against this,
            # could really throw off num_ast-relative features
            # maybe don't consider repos with non-compiling code?
            logging.exception("file %s/%s does not compile",
                              repo.name, src_fn)
        else:
            # otherwise, include it
            trees[src_fn] = tree
    return trees
@support_feature
def ast_node_counts(repo):
    """A Counter over ast node class names for all source."""
    counter = Counter()
    # .values() instead of the Python-2-only .itervalues()
    for tree in repo._calc('asts').values():
        counter.update(node.__class__.__name__
                       for node in pyast.walk(tree))
    return counter
@feature
def num_ast_nodes(repo):
    """Total number of ast nodes; a measure of code volume."""
    counts = repo._calc('ast_node_counts')
    # +1 because other features divide by this; never return 0
    return sum(counts.values()) + 1
#These features refer to usage of certain language features.
#They are all measured as percentages of ast nodes.
def make_ast_usage_feature(node_name, feature_name):
    """Build and register a feature measuring how often the ast node type
    `node_name` appears, as a percentage of all ast nodes."""
    def usage(repo, node_name=node_name):
        count = repo._calc('ast_node_counts').get(node_name, 0)
        return 100.0 * count / repo._calc('num_ast_nodes')
    # registered under the human-readable feature name, not 'usage'
    usage.__name__ = feature_name
    return feature(usage)
# Register one usage feature per interesting language construct, e.g.
# 'with_stmt_usage' measures `With` nodes as a percentage of all ast nodes.
# The factory (rather than a plain loop body) binds node_name per feature.
for node_name, feature_name in [('With', 'with_stmt_usage'),
                                ('comprehension', 'compr_usage'),
                                ('Lambda', 'lambda_usage'),
                                ('Global', 'global_usage'),
                                ('GeneratorExp', 'gen_exp_usage'),
                                ('Print', 'print_usage'),
                                ]:
    make_ast_usage_feature(node_name, feature_name)
@feature
def comment_ratio(repo):
    """Number of comments / code volume, as a percentage."""
    num = 0
    # .items() instead of the Python-2-only .iteritems()
    for src_fn, src in repo._calc('source_contents').items():
        strbuf = StringIO(src)
        try:
            toks = tokenize.generate_tokens(strbuf.readline)
            # the generator is consumed here, so tokenize errors surface
            # inside this try; count without building a throwaway list
            num += sum(1 for t in toks if t[0] == tokenize.COMMENT)
        except Exception:
            # not a bare except: don't swallow KeyboardInterrupt/SystemExit.
            # similar to the does-not-compile case in asts(): skip the file
            logging.exception("file %s/%s does not tokenize",
                              repo.name, src_fn)
    # consider storing fractions and converting out later
    return 100.0 * num / repo._calc('num_ast_nodes')
@feature
def docstring_ratio(repo):
    """Percent of function/class/module definitions with docstrings."""
    def_nodes = 1  # avoid division by zero (slightly skews small repos)
    doc_def_nodes = 0
    # .values() instead of the Python-2-only .itervalues()
    for root in repo._calc('asts').values():
        for node in pyast.walk(root):
            if isinstance(node, (pyast.FunctionDef, pyast.ClassDef,
                                 pyast.Module)):
                def_nodes += 1
                docstring = pyast.get_docstring(node)
                if docstring:
                    doc_def_nodes += 1
    return 100.0 * doc_def_nodes / def_nodes
@feature
def docstring_avg_len(repo):
    """Docstring length per function/class/module definition (scaled by 100)."""
    def_nodes = 1  # avoid division by zero (slightly skews small repos)
    docstring_len = 0
    # .values() instead of the Python-2-only .itervalues()
    for root in repo._calc('asts').values():
        for node in pyast.walk(root):
            if isinstance(node, (pyast.FunctionDef, pyast.ClassDef,
                                 pyast.Module)):
                def_nodes += 1
                docstring = pyast.get_docstring(node)
                if docstring:
                    docstring_len += len(docstring)
    return 100.0 * docstring_len / def_nodes
@feature
def imported_modules(repo):
    """Return a set (as sorted tuple) of toplevel module names this repo
    could import.

    Sorted so the feature value is deterministic across runs (the
    original tuple(set) order varied with hash seeding)."""
    imports = set()
    # .values() instead of the Python-2-only .itervalues()
    for root in repo._calc('asts').values():
        for node in pyast.walk(root):
            if isinstance(node, pyast.Import):
                for alias in node.names:
                    imports.add(alias.name.split('.')[0])
            elif isinstance(node, pyast.ImportFrom):
                # can't get relative imports without running them,
                # but they're just intra-package anyway
                if node.level == 0 and node.module:
                    imports.add(node.module.split('.')[0])
    return tuple(sorted(imports))
@feature
def imported_stdlib_modules(repo):
    """Like imported_modules, but only keeps stdlib modules."""
    # hoist the loop invariant: the original re-ran
    # utils.stdlib_module_names() once per imported module
    stdlib_names = utils.stdlib_module_names()
    return tuple(mod for mod in repo._calc('imported_modules')
                 if mod in stdlib_names)