/
investigate.go
180 lines (163 loc) · 5.22 KB
/
investigate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
package ld
import (
"errors"
"fmt"
"io/ioutil"
"os"
paths "path"
"regexp"
"strings"
)
// Package-level state shared by the license investigation helpers in this file.
var (
// ErrNoLicenseFound is returned by InvestigateFilesLicenses when no license
// file candidates were read and the README fallback yields no licenses either.
ErrNoLicenseFound = errors.New("no license file was found")
// globalLicenseDatabase is the database queried by InvestigateLicenseText and
// InvestigateReadmeText; init() sets its Debug flag and calls Load on it.
globalLicenseDatabase = &LicenseDatabase{}
// Base names of guessable license files.
// They are written in lower case because file names are lowercased before matching.
fileNames = []string{
"copying",
"copyleft",
"copyright",
"license",
"unlicense",
"licence",
}
// License file extensions. Combined with the fileNames slice
// to create a set of files we can reasonably assume contain
// licensing information.
// The empty string allows names without any extension.
fileExtensions = []string{
"",
".md",
".rst",
".html",
".txt",
}
// filePreprocessors maps a file extension to the function applied to the text
// of a candidate file with that extension. Extensions without an entry are
// used exactly as read.
filePreprocessors = map[string]func(string) string{
".md": PreprocessMarkdown,
".rst": PreprocessRestructuredText,
".html": PreprocessHTML,
}
// licenseFileRe matches a whole (lowercased) file name consisting of one of
// fileNames followed by one of fileExtensions. The dots of the extensions are
// escaped so that they match literally.
licenseFileRe = regexp.MustCompile(
fmt.Sprintf("^(%s)(%s)$",
strings.Join(fileNames, "|"),
strings.Replace(strings.Join(fileExtensions, "|"), ".", "\\.", -1)))
// readmeFileRe matches a whole (lowercased) file name consisting of "readme"
// followed by one of fileExtensions, with the dots escaped the same way.
readmeFileRe = regexp.MustCompile(fmt.Sprintf("^readme(%s)$",
strings.Replace(strings.Join(fileExtensions, "|"), ".", "\\.", -1)))
)
// InvestigateProjectLicenses lists the entries directly inside the directory `path`,
// keeps the ones which are not directories and passes their names to
// InvestigateFilesLicenses together with a reader which loads a file from that directory.
// Subdirectories are not descended into.
//
// The returned map and error are those of InvestigateFilesLicenses: license names
// mapped to the confidence, from 0 to 1, 1 means 100% confident. If the directory
// cannot be listed, the listing error is returned unchanged with a nil map.
func InvestigateProjectLicenses(path string) (map[string]float32, error) {
	entries, err := ioutil.ReadDir(path)
	if err != nil {
		return nil, err
	}

	names := make([]string, 0, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		names = append(names, entry.Name())
	}

	// The reader resolves every name relative to the investigated directory.
	readFromProject := func(name string) (string, error) {
		content, readErr := ioutil.ReadFile(paths.Join(path, name))
		return string(content), readErr
	}
	return InvestigateFilesLicenses(names, readFromProject)
}
// InvestigateFilesLicenses picks the license file candidates out of `fileNames`, reads
// them with `reader` and detects the licenses. Each match has the confidence assigned,
// from 0 to 1, 1 means 100% confident.
//
// If no license file candidate could be extracted, the README files are analysed
// instead. ErrNoLicenseFound is returned with a nil map when there are no README
// candidates either, or when the README analysis reports no licenses.
func InvestigateFilesLicenses(
	fileNames []string, reader func(string) (string, error)) (map[string]float32, error) {
	if licenseTexts := ExtractLicenseFiles(fileNames, reader); len(licenseTexts) > 0 {
		return InvestigateLicenseTexts(licenseTexts), nil
	}

	// Plan B: take the README, find the section about the license and apply NER
	readmeTexts := ExtractReadmeFiles(fileNames, reader)
	if len(readmeTexts) == 0 {
		return nil, ErrNoLicenseFound
	}
	if found := InvestigateReadmeTexts(readmeTexts); len(found) > 0 {
		return found, nil
	}
	return nil, ErrNoLicenseFound
}
// ExtractLicenseFiles returns the list of possible license texts.
// The lowercased file names are matched against licenseFileRe, so the match is
// case-insensitive. Reader is used to read file contents; it receives the file
// name exactly as given in `files`.
//
// If filePreprocessors has an entry for the file's extension, the text is passed
// through it first. The extension is looked up on the lowercased name, so that
// e.g. "LICENSE.MD" is preprocessed the same way as "LICENSE.md".
//
// Files which the reader fails to read are skipped. The result is never nil.
func ExtractLicenseFiles(files []string, reader func(string) (string, error)) []string {
	candidates := []string{}
	for _, file := range files {
		lowered := strings.ToLower(file)
		if !licenseFileRe.MatchString(lowered) {
			continue
		}
		text, err := reader(file)
		if err != nil {
			// Best effort: an unreadable candidate is ignored, the others still count.
			continue
		}
		// The keys of filePreprocessors are lower case, hence the lookup must use
		// the lowercased name - the regexp above has accepted any letter case.
		if preprocessor, exists := filePreprocessors[paths.Ext(lowered)]; exists {
			text = preprocessor(text)
		}
		candidates = append(candidates, text)
	}
	return candidates
}
// InvestigateLicenseTexts runs InvestigateLicenseText over every candidate license text
// and merges the results: for each license name the highest confidence seen among all
// the texts is kept. Each match has the confidence assigned, from 0 to 1, 1 means 100%
// confident.
//
// A license is only recorded when its confidence is greater than zero. The returned map
// is never nil.
func InvestigateLicenseTexts(texts []string) map[string]float32 {
	best := make(map[string]float32)
	for i := range texts {
		for license, confidence := range InvestigateLicenseText(texts[i]) {
			// A missing key reads as zero, which doubles as the initial maximum.
			if confidence > best[license] {
				best[license] = confidence
			}
		}
	}
	return best
}
// InvestigateLicenseText queries the package-level license database with the given
// license text and returns what QueryLicenseText reports: the most probable reference
// licenses matched. Each match has the confidence assigned, from 0 to 1, 1 means 100%
// confident.
func InvestigateLicenseText(text string) map[string]float32 {
	matches := globalLicenseDatabase.QueryLicenseText(text)
	return matches
}
// ExtractReadmeFiles searches for README files and returns their texts.
// The lowercased file names are matched against readmeFileRe, so the match is
// case-insensitive. Reader is used to read file contents; it receives the file
// name exactly as given in `files`.
//
// If filePreprocessors has an entry for the file's extension, the text is passed
// through it first. The extension is looked up on the lowercased name, so that
// e.g. "README.MD" is preprocessed the same way as "README.md".
//
// Files which the reader fails to read are skipped. The result is never nil.
func ExtractReadmeFiles(files []string, reader func(string) (string, error)) []string {
	candidates := []string{}
	for _, file := range files {
		lowered := strings.ToLower(file)
		if !readmeFileRe.MatchString(lowered) {
			continue
		}
		text, err := reader(file)
		if err != nil {
			// Best effort: an unreadable README is ignored, the others still count.
			continue
		}
		// The keys of filePreprocessors are lower case, hence the lookup must use
		// the lowercased name - the regexp above has accepted any letter case.
		if preprocessor, exists := filePreprocessors[paths.Ext(lowered)]; exists {
			text = preprocessor(text)
		}
		candidates = append(candidates, text)
	}
	return candidates
}
// InvestigateReadmeTexts runs InvestigateReadmeText over every README text and merges
// the results: for each reported license name the highest value seen among all the
// texts is kept.
//
// A name is only recorded when its value is greater than zero. The returned map is
// never nil.
func InvestigateReadmeTexts(texts []string) map[string]float32 {
	strongest := make(map[string]float32, 0)
	for _, readme := range texts {
		found := InvestigateReadmeText(readme)
		for license, score := range found {
			// A missing key reads as zero, which doubles as the initial maximum.
			if score > strongest[license] {
				strongest[license] = score
			}
		}
	}
	return strongest
}
// InvestigateReadmeText queries the package-level license database with the given
// README text and returns what QueryReadmeText reports: the probable license names
// found with Named Entity Recognition from NLP.
func InvestigateReadmeText(text string) map[string]float32 {
	names := globalLicenseDatabase.QueryReadmeText(text)
	return names
}
// init prepares the package-level license database: the Debug flag is switched on
// when the LICENSE_DEBUG environment variable is set to a non-empty value, and then
// the database is loaded.
func init() {
	if debug := os.Getenv("LICENSE_DEBUG"); len(debug) > 0 {
		globalLicenseDatabase.Debug = true
	}
	globalLicenseDatabase.Load()
}