Skip to content

Commit

Permalink
licensecheck: add support for wildcards in license text
Browse files Browse the repository at this point in the history
In license text the sequence ___ now denotes a wildcard
that can be filled in by up to 70 words.
That's long enough for a handful of copyright lines
in a row (see testdata/MIT.t2) but short enough that
it can't skip over an entire license. (Our shortest two
recognized licenses are Apache-2.0-User at 80 and
BSD-0-Clause at 99.)

This commit updates the licenses to use ___ where appropriate.
This fixes google#6.

This commit also removes removeCopyrightLines (d686698),
which turns out to have been removing quite a few lines in
actual license text (see diffs in testdata/CC*) and did not
handle other cases now handled by the wildcards
(for example, testdata/MIT.t2). This fixes google#12.

Fixes google#6.
Fixes google#12.
  • Loading branch information
rsc committed Feb 12, 2020
1 parent 9efdb3f commit 463d502
Show file tree
Hide file tree
Showing 32 changed files with 180 additions and 202 deletions.
51 changes: 23 additions & 28 deletions data.gen.go

Large diffs are not rendered by default.

100 changes: 78 additions & 22 deletions license.go
Expand Up @@ -465,7 +465,7 @@ func (l *license) submatches(text []int32, opts Options) (s []submatch) {
// For each word of the input, look to see if a sequence starting there
// matches a sequence in the license.
var p phrase
for k := 0; k+len(p) <= len(text); k++ { // k also updated in loop.
for k := 0; k+len(p) <= len(text); { // k updated in loop.
copy(p[:], text[k:])
// Find longest match starting with that word.
startIndexes := l.startIndexes[p]
Expand All @@ -485,34 +485,90 @@ func (l *license) submatches(text []int32, opts Options) (s []submatch) {
matchIndex = index
}
}
// If we have a long match, remember it and advance the location in

if matchLength < opts.MinLength {
k++
continue
}

// We have a long match. Remember it and advance the location in
// the text. Note that we do not do anything to advance the license
// text, which means that certain reorderings will match, perhaps
// erroneously. This has not appeared in practice, while handling
// things this way means the algorithm can identify multiple
// appearances of a license within a single file.
if matchLength > opts.MinLength {
end := k + matchLength
// Does this fit onto the previous match, or is it close
// enough to consider? The slop allows text like
// Copyright (c) 2009 Snarfboodle Inc. All rights reserved.
// to match
// Copyright (c) <YEAR> <COMPANY>. All rights reserved.
// and be considered a single span.
if len(s) > 0 && s[len(s)-1].end+opts.Slop >= k && matchIndex >= s[len(s)-1].licenseEnd {
s[len(s)-1].end = end
s[len(s)-1].matched += matchLength
s[len(s)-1].licenseEnd = matchIndex + matchLength
} else {
s = append(s, submatch{
start: k,
end: end,
matched: matchLength,
licenseEnd: matchIndex + matchLength,
})
start := k
end := start + matchLength
k = end // The last word is not part of the match, but might be part of the next.

// The blank (wildcard) ___ maps to word ID -1.
// If we see a blank, we allow it to be filled in by up to 70 words.
// This allows recognizing quite a few specialized copyright lines
// (see for example testdata/MIT.t2) while not being large enough
// to jump over an entire other license (our shortest is Apache-2.0-User
// at 80 words).
const blank = -1
const blankMax = 70

// Does this fit onto the previous match, or is it close
// enough to consider? The slop allows text like
// Copyright (c) 2009 Snarfboodle Inc. All rights reserved.
// to match
// Copyright (c) <YEAR> <COMPANY>. All rights reserved.
// and be considered a single span.
if len(s) > 0 {
prev := &s[len(s)-1]
textGap := opts.Slop
if prev.licenseEnd < len(l.doc.words) && l.doc.words[prev.licenseEnd] == blank {
textGap = blankMax
}
if prev.end+textGap >= start && matchIndex >= prev.licenseEnd {
if textGap == blankMax {
prev.matched++ // matched the blank
}
prev.end = end
prev.matched += matchLength
prev.licenseEnd = matchIndex + matchLength
continue
}
k = end - 1 // The last word is not part of the match, but might be part of the next.
}

// Does this match immediately follow an early blank in the license text?
// If so, see if we can extend it backward.
// The most common case needing this is licenses that start with "Copyright ___".
// The text before the blank is too short to be its own match but it can be
// part of this one.
if matchIndex >= 2 && l.doc.words[matchIndex-1] == blank && l.doc.words[matchIndex-2] != blank {
i := start - blankMax
if i < 0 {
i = 0
}
if len(s) > 0 && i < s[len(s)-1].end {
i = s[len(s)-1].end
}
for ; i < start; i++ {
if text[i] == l.doc.words[matchIndex-2] {
// Found a match across the gap.
start = i
matchIndex -= 2
matchLength += 2
// Extend backward if possible.
for start > 0 && matchIndex > 0 && text[start-1] == l.doc.words[matchIndex-1] {
start--
matchIndex--
matchLength++
}
break
}
}
}

s = append(s, submatch{
start: start,
end: end,
matched: matchLength,
licenseEnd: matchIndex + matchLength,
})
}
return s
}
3 changes: 1 addition & 2 deletions licenses/AGPL-3.0
Expand Up @@ -629,8 +629,7 @@ to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
___

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
Expand Down
2 changes: 1 addition & 1 deletion licenses/Apache-2.0
Expand Up @@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright ___

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

Expand Down
2 changes: 1 addition & 1 deletion licenses/Apache-2.0-User
@@ -1,4 +1,4 @@
Copyright [yyyy] [name of copyright owner]
Copyright ___

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

Expand Down
2 changes: 1 addition & 1 deletion licenses/BSD-0-Clause
@@ -1,4 +1,4 @@
Copyright <YEAR> <OWNER>
Copyright ___

Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted.

Expand Down
2 changes: 1 addition & 1 deletion licenses/BSD-2-Clause
@@ -1,4 +1,4 @@
Copyright <YEAR> <COPYRIGHT HOLDER>
Copyright ___

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Expand Down
2 changes: 1 addition & 1 deletion licenses/BSD-2-Clause-FreeBSD
@@ -1,4 +1,4 @@
Copyright <yesar> <copyright holders>. All rights reserved.
Copyright ___

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Expand Down
2 changes: 1 addition & 1 deletion licenses/BSD-3-Clause
@@ -1,4 +1,4 @@
Copyright <YEAR> <COPYRIGHT HOLDER>
Copyright ___

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Expand Down
4 changes: 2 additions & 2 deletions licenses/BSD-4-Clause
@@ -1,12 +1,12 @@
Copyright <YEAR> <COPYRIGHT HOLDER>. All rights reserved.
Copyright ___

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. All advertising materials mentioning features or use of this software must display the following acknowledgement:
3. All advertising materials mentioning features or use of this software must display the following acknowledgement:
This product includes software developed by the organization.

4. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
Expand Down
5 changes: 3 additions & 2 deletions licenses/ECL-2.0
Expand Up @@ -84,11 +84,12 @@ that a file or class name and description of purpose be included on
the same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner] Licensed under the
Copyright ___
Licensed under the
Educational Community License, Version 2.0 (the "License"); you may
not use this file except in compliance with the License. You may
obtain a copy of the License at

http://www.osedu.org/licenses/ECL-2.0

Unless required by applicable law or agreed to in writing,
Expand Down
6 changes: 2 additions & 4 deletions licenses/GPL2
Expand Up @@ -290,8 +290,7 @@ to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
___

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -312,8 +311,7 @@ Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:

Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
___ comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.

Expand Down
5 changes: 2 additions & 3 deletions licenses/GPL3
Expand Up @@ -189,8 +189,7 @@ If you develop a new program, and you want it to be of the greatest possible use

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the “copyright” line and a pointer to where the full notice is found.

<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
___

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -208,7 +207,7 @@ Also add information on how to contact you by electronic and paper mail.

If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode:

<program> Copyright (C) <year> <name of author>
___
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
Expand Down
2 changes: 1 addition & 1 deletion licenses/ISC
@@ -1,4 +1,4 @@
Copyright <YEAR> <OWNER>
Copyright ___

Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.

Expand Down
2 changes: 1 addition & 1 deletion licenses/JSON
@@ -1,4 +1,4 @@
Copyright <YEAR> <ORG>
Copyright ___

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

Expand Down
5 changes: 2 additions & 3 deletions licenses/LGPL-2.0
Expand Up @@ -133,7 +133,7 @@ such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.

1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
Expand Down Expand Up @@ -449,8 +449,7 @@ safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.

<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
___

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
Expand Down
3 changes: 1 addition & 2 deletions licenses/LGPL-2.1
Expand Up @@ -470,8 +470,7 @@ safest to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least the
"copyright" line and a pointer to where the full notice is found.

<one line to give the library's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
___

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
Expand Down
2 changes: 1 addition & 1 deletion licenses/MIT
@@ -1,4 +1,4 @@
Copyright <YEAR> <COPYRIGHT HOLDER>
Copyright ___

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

Expand Down
2 changes: 1 addition & 1 deletion licenses/Zlib
@@ -1,4 +1,4 @@
Copyright (c) <year> <copyright holders>
Copyright ___

This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.

Expand Down
55 changes: 7 additions & 48 deletions normalize.go
Expand Up @@ -5,7 +5,6 @@
package licensecheck

import (
"bytes"
"strings"
"unicode"
"unicode/utf8"
Expand All @@ -21,7 +20,6 @@ func (c *Checker) normalize(data []byte) *document {
var r rune
var wid int
pos := 0
data = removeCopyrightLines(data)
str := toLower(data)
next := func() {
r, wid = utf8.DecodeRuneInString(str[pos:])
Expand All @@ -32,6 +30,13 @@ func (c *Checker) normalize(data []byte) *document {
// Each iteration adds a word.
for pos < len(str) {
start := pos
const blank = "___" // fill in the blank wildcard
if strings.HasPrefix(str[pos:], blank) {
words = append(words, -1)
indexes = append(indexes, int32(start))
pos += len(blank)
continue
}
next()
// Skip spaces, punctuation, etc. and keep only word characters.
if !isWordChar(r) {
Expand Down Expand Up @@ -67,52 +72,6 @@ func (c *Checker) normalize(data []byte) *document {
}
}

// copyrightText marks a line that begins with the word Copyright: a newline
// immediately followed by "Copyright ". The very first line of a text has no
// preceding newline, so it is matched against copyrightText[1:] instead.
var copyrightText = []byte("\nCopyright ")

// removeCopyrightLines returns its argument text with (nearly) all lines beginning
// with the word Copyright blanked out: every byte of such a line is replaced by a
// space and the trailing newline is kept, so byte offsets into the text are
// unchanged. Leading spaces are significant: the line must start with a 'C'; an
// occurrence of "Copyright " in the middle of a line is left alone. This cleanup
// eliminates a common difference between standard license text and the form of
// the license seen in practice.
//
// Two kinds of line are deliberately left untouched:
//   - lines containing " means copyright ", which is how the Creative Commons
//     licenses define the term Copyright;
//   - a final Copyright line that is not terminated by a newline.
//
// If a copyright line is blanked, the return value is a fresh copy to avoid
// overwriting the caller's data; otherwise the argument itself is returned.
func removeCopyrightLines(text []byte) []byte {
	firstLine := copyrightText[1:] // Marker without its leading newline.
	ccDefinition := []byte(" means copyright ")
	copied := false
	// i is the offset at which the next search begins: 0 at first, and after
	// that the index of the newline ending the most recently examined
	// Copyright line, so that consecutive Copyright lines are all found.
	for i := 0; ; {
		var start int
		if i == 0 && bytes.HasPrefix(text, firstLine) {
			// The text itself opens with a Copyright line. Only a prefix
			// match counts here; searching the whole text for the
			// newline-less marker would also hit "Copyright " mid-line.
			start = 0
		} else {
			at := bytes.Index(text[i:], copyrightText)
			if at < 0 {
				break
			}
			start = i + at + 1 // Skip the newline that begins the marker.
		}
		lineLen := bytes.IndexByte(text[start:], '\n')
		if lineLen < 0 {
			break
		}
		newline := start + lineLen // Leave trailing newline, making it line of blanks.
		i = newline
		// Special case for the Creative Commons licenses, which define copyright.
		// TODO: Better ideas?
		if bytes.Contains(text[start:newline], ccDefinition) {
			continue
		}
		if !copied {
			text = append([]byte(nil), text...)
			copied = true
		}
		// White out the text.
		for j := start; j < newline; j++ {
			text[j] = ' '
		}
	}
	return text
}

// toLower returns a lowercased version of the input, guaranteeing
// that the size remains the same so byte offsets between the slice and
// the string created from it, which will be used to locate words, will
Expand Down

0 comments on commit 463d502

Please sign in to comment.