/
flay.rb
executable file
·593 lines (465 loc) · 13.7 KB
/
flay.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
#!/usr/bin/env ruby -w
require 'optparse'
require 'rubygems'
require 'sexp_processor'
require 'ruby_parser'
require 'timeout'
class File
RUBY19 = "<3".respond_to? :encoding unless defined? RUBY19 # :nodoc:
class << self
alias :binread :read unless RUBY19
end
end
class Flay
VERSION = "2.2.0" # :nodoc:
##
# Returns the default options.
def self.default_options
{
:diff => false,
:mass => 16,
:summary => false,
:verbose => false,
:number => true,
:timeout => 10,
:liberal => false,
:fuzzy => false,
}
end
##
# Process options in +args+, defaulting to +ARGV+.
def self.parse_options args = ARGV
options = self.default_options
OptionParser.new do |opts|
opts.banner = 'flay [options] files_or_dirs'
opts.version = Flay::VERSION
opts.separator ""
opts.separator "Specific options:"
opts.separator ""
opts.on('-h', '--help', 'Display this help.') do
puts opts
exit
end
opts.on('-f', '--fuzzy [DIFF]', Integer,
"Detect fuzzy (copy & paste) duplication (default 1).") do |n|
options[:fuzzy] = n || 1
end
opts.on('-l', '--liberal', "Use a more liberal detection method.") do
options[:liberal] = true
end
opts.on('-m', '--mass MASS', Integer,
"Sets mass threshold (default = #{options[:mass]})") do |m|
options[:mass] = m.to_i
end
opts.on('-#', "Don't number output (helps with diffs)") do |m|
options[:number] = false
end
opts.on('-v', '--verbose', "Verbose. Show progress processing files.") do
options[:verbose] = true
end
opts.on('-d', '--diff', "Diff Mode. Display N-Way diff for ruby.") do
options[:diff] = true
end
opts.on('-s', '--summary', "Summarize. Show flay score per file only.") do
options[:summary] = true
end
opts.on('-t', '--timeout TIME', Integer,
"Set the timeout. (default = #{options[:timeout]})") do |t|
options[:timeout] = t.to_i
end
extensions = ['rb'] + Flay.load_plugins
opts.separator ""
opts.separator "Known extensions: #{extensions.join(', ')}"
extensions.each do |meth|
msg = "options_#{meth}"
send msg, opts, options if self.respond_to?(msg)
end
begin
opts.parse! args
rescue => e
abort "#{e}\n\n#{opts}"
end
end
options
end
##
# Expands +*dirs+ to all files within that match ruby and rake extensions.
# --
# REFACTOR: from flog
def self.expand_dirs_to_files *dirs
extensions = ['rb'] + Flay.load_plugins
dirs.flatten.map { |p|
if File.directory? p then
Dir[File.join(p, '**', "*.{#{extensions.join(',')}}")]
else
p
end
}.flatten
end
##
# Loads all flay plugins. Files must be named "flog_*.rb".
def self.load_plugins
unless defined? @@plugins then
@@plugins = []
plugins = Gem.find_files("flay_*.rb").reject { |p| p =~ /flay_task/ }
plugins.each do |plugin|
plugin_name = File.basename(plugin, '.rb').sub(/^flay_/, '')
next if @@plugins.include? plugin_name
begin
load plugin
@@plugins << plugin_name
rescue LoadError => e
warn "error loading #{plugin.inspect}: #{e.message}. skipping..."
end
end
end
@@plugins
rescue
# ignore
end
# :stopdoc:
attr_accessor :mass_threshold, :total, :identical, :masses
attr_reader :hashes, :option
# :startdoc:
##
# Create a new instance of Flay with +option+s.
def initialize option = nil
@option = option || Flay.default_options
@hashes = Hash.new { |h,k| h[k] = [] }
self.identical = {}
self.masses = {}
self.total = 0
self.mass_threshold = @option[:mass]
require 'ruby2ruby' if @option[:diff]
end
##
# Process any number of files.
def process(*files) # TODO: rename from process - should act as SexpProcessor
files.each do |file|
warn "Processing #{file}" if option[:verbose]
ext = File.extname(file).sub(/^\./, '')
ext = "rb" if ext.nil? || ext.empty?
msg = "process_#{ext}"
unless respond_to? msg then
warn " Unknown file type: #{ext}, defaulting to ruby"
msg = "process_rb"
end
begin
sexp = begin
send msg, file
rescue => e
warn " #{e.message.strip}"
warn " skipping #{file}"
nil
end
next unless sexp
process_sexp sexp
rescue SyntaxError => e
warn " skipping #{file}: #{e.message}"
end
end
end
##
# Prune, find identical nodes, and update masses.
def analyze
self.prune
self.hashes.each do |hash,nodes|
identical[hash] = nodes[1..-1].all? { |n| n == nodes.first }
end
update_masses
end
##
# Reset total and recalculate the masses for all nodes in +hashes+.
def update_masses
self.total = 0
masses.clear
self.hashes.each do |hash, nodes|
masses[hash] = nodes.first.mass * nodes.size
masses[hash] *= (nodes.size) if identical[hash]
self.total += masses[hash]
end
end
##
# Parse a ruby +file+ and return the sexp.
#
# --
# TODO: change the system and rename this to parse_rb.
def process_rb file
begin
RubyParser.new.process(File.binread(file), file, option[:timeout])
rescue Timeout::Error
warn "TIMEOUT parsing #{file}. Skipping."
end
end
##
# Process a sexp +pt+.
def process_sexp pt
pt.deep_each do |node|
next unless node.any? { |sub| Sexp === sub }
next if node.mass < self.mass_threshold
self.hashes[node.structural_hash] << node
process_fuzzy node, option[:fuzzy] if option[:fuzzy]
end
end
# :stopdoc:
MAX_NODE_SIZE = 10 # prevents exponential blowout
MAX_AVG_MASS = 12 # prevents exponential blowout
# :startdoc:
##
# Process "fuzzy" matches for +node+. A fuzzy match is a subset of
# +node+ up to +difference+ elements less than the original.
def process_fuzzy node, difference
return unless node.has_code?
avg_mass = node.mass / node.size
return if node.size > MAX_NODE_SIZE or avg_mass > MAX_AVG_MASS
tmpl, code = node.split_code
tmpl.modified = true
(code.size - 1).downto(code.size - difference) do |n|
code.combination(n).each do |subcode|
new_node = tmpl + subcode
next unless new_node.any? { |sub| Sexp === sub }
next if new_node.mass < self.mass_threshold
# they're already structurally similar, don't bother adding another
next if self.hashes[new_node.structural_hash].any? { |sub|
sub.file == new_node.file and sub.line == new_node.line
}
self.hashes[new_node.structural_hash] << new_node
end
end
end
##
# Prunes nodes that aren't relevant to analysis or are already
# covered by another node.
def prune
# prune trees that aren't duped at all, or are too small
self.hashes.delete_if { |_,nodes| nodes.size == 1 }
self.hashes.delete_if { |_,nodes| nodes.all?(&:modified?) }
return prune_liberally if option[:liberal]
prune_conservatively
end
##
# Conservative prune. Remove any bucket that is known to contain a
# subnode element of a node in another bucket.
def prune_conservatively
hashes_to_prune = {}
# extract all subtree hashes from all nodes
self.hashes.values.each do |nodes|
nodes.first.all_structural_subhashes.each do |h|
hashes_to_prune[h] = true
end
end
# nuke subtrees so we show the biggest matching tree possible
self.hashes.delete_if { |h,_| hashes_to_prune[h] }
end
##
# Liberal prune. Remove any _element_ from a bucket that is known to
# be a subnode of another node. Removed by identity.
def prune_liberally
update_masses
hashes_to_prune = Hash.new { |h,k| h[k] = [] }
# record each subtree by subhash, but skip if subtree mass > parent mass
self.hashes.values.each do |nodes|
nodes.each do |node|
tophash = node.structural_hash
topscore = self.masses[tophash]
node.deep_each do |subnode|
subhash = subnode.structural_hash
subscore = self.masses[subhash]
next if subscore and subscore > topscore
hashes_to_prune[subhash] << subnode
end
end
end
# nuke only individual items by object identity
self.hashes.each do |h,v|
v.delete_eql hashes_to_prune[h]
end
# nuke buckets we happened to fully empty
self.hashes.delete_if { |k,v| v.size <= 1 }
end
##
# Output an n-way diff from +data+. This is only used if --diff is
# given.
def n_way_diff *data
comments = []
codes = []
split_and_group(data).each do |subdata|
n = subdata.find_index { |s| s !~ /^#/ }
comment, code = subdata[0..n-1], subdata[n..-1]
comments << comment
codes << code
end
comments = collapse_and_label pad_with_empty_strings comments
codes = collapse_and_label pad_with_empty_strings codes
(comments + codes).flatten.join("\n")
end
def split_and_group ary # :nodoc:
ary.each_with_index.map { |s, i|
c = (?A.ord + i).chr
s.scan(/^.*/).map { |s2|
s2.group = c
s2
}
}
end
def pad_with_empty_strings ary # :nodoc:
max = ary.map { |s| s.size }.max
ary.map { |a| a + ([""] * (max - a.size)) }
end
def collapse_and_label ary # :nodoc:
ary[0].zip(*ary[1..-1]).map { |lines|
if lines.uniq.size == 1 then
" #{lines.first}"
else
lines.reject { |l| l.empty? }.map { |l| "#{l.group}: #{l}" }
end
}
end
##
# Calculate summary scores on a per-file basis. For --summary.
def summary
score = Hash.new 0
masses.each do |hash, mass|
sexps = hashes[hash]
mass_per_file = mass.to_f / sexps.size
sexps.each do |sexp|
score[sexp.file] += mass_per_file
end
end
score
end
##
# Output the report. Duh.
def report prune = nil
analyze
puts "Total score (lower is better) = #{self.total}"
if option[:summary] then
puts
self.summary.sort_by { |_,v| -v }.each do |file, score|
puts "%8.2f: %s" % [score, file]
end
return
end
count = 0
sorted = masses.sort_by { |h,m|
[-m,
hashes[h].first.file,
hashes[h].first.line,
hashes[h].first.first.to_s]
}
sorted.each do |hash, mass|
nodes = hashes[hash]
next unless nodes.first.first == prune if prune
puts
same = identical[hash]
node = nodes.first
n = nodes.size
match, bonus = if same then
["IDENTICAL", "*#{n}"]
else
["Similar", ""]
end
if option[:number] then
count += 1
puts "%d) %s code found in %p (mass%s = %d)" %
[count, match, node.first, bonus, mass]
else
puts "%s code found in %p (mass%s = %d)" %
[match, node.first, bonus, mass]
end
nodes.sort_by { |x| [x.file, x.line] }.each_with_index do |x, i|
if option[:diff] then
c = (?A.ord + i).chr
extra = " (FUZZY)" if x.modified?
puts " #{c}: #{x.file}:#{x.line}#{extra}"
else
extra = " (FUZZY)" if x.modified?
puts " #{x.file}:#{x.line}#{extra}"
end
end
if option[:diff] then
puts
r2r = Ruby2Ruby.new
puts n_way_diff(*nodes.map { |s| r2r.process(s.deep_clone) })
end
end
end
end
class String
attr_accessor :group # :nodoc:
end
class Sexp
##
# Whether or not this sexp is a mutated/modified sexp.
attr_accessor :modified
alias :modified? :modified # Is this sexp modified?
##
# Calculate the structural hash for this sexp. Cached, so don't
# modify the sexp afterwards and expect it to be correct.
def structural_hash
@structural_hash ||= self.structure.hash
end
##
# Returns a list of structural hashes for all nodes (and sub-nodes)
# of this sexp.
def all_structural_subhashes
hashes = []
self.deep_each do |node|
hashes << node.structural_hash
end
hashes
end
def initialize_copy o # :nodoc:
s = super
s.file = o.file
s.line = o.line
s.modified = o.modified
s
end
def [] a # :nodoc:
s = super
if Sexp === s then
s.file = self.file
s.line = self.line
s.modified = self.modified
end
s
end
def + o # :nodoc:
self.dup.concat o
end
##
# Useful general array method that splits the array from 0..+n+ and
# the rest. Returns both sections.
def split_at n
return self[0..n], self[n+1..-1]
end
##
# Return the index of the last non-code element, or nil if this sexp
# is not a code-bearing node.
def code_index
{
:block => 0, # s(:block, *code)
:class => 2, # s(:class, name, super, *code)
:module => 1, # s(:module, name, *code)
:defn => 2, # s(:defn, name, args, *code)
:defs => 3, # s(:defs, recv, name, args, *code)
:iter => 2, # s(:iter, recv, args, *code)
}[self.sexp_type]
end
alias has_code? code_index # Does this sexp have a +*code+ section?
##
# Split the sexp into front-matter and code-matter, returning both.
# See #code_index.
def split_code
index = self.code_index
self.split_at index if index
end
end
class Array # :nodoc:
##
# Delete anything in +self+ if they are identical to anything in +other+.
def delete_eql other
self.delete_if { |o1| other.any? { |o2| o1.equal? o2 } }
end
end