Skip to content
Permalink
Browse files

Merge pull request #56 from adracus/feature.file-matches

Add file matches to detection result
  • Loading branch information...
vmarkovtsev committed Aug 19, 2019
2 parents b17c037 + 79f5d7c commit 0ba3cd18b17de5abe09119d8f0a68eb9d42f0291
2 go.sum
@@ -51,8 +51,6 @@ github.com/shogo82148/go-shuffle v0.0.0-20170808115208-59829097ff3b h1:VI1u+o2KZ
github.com/shogo82148/go-shuffle v0.0.0-20170808115208-59829097ff3b/go.mod h1:2htx6lmL0NGLHlO8ZCf+lQBGBHIbEujyywxJArf+2Yc=
github.com/shurcooL/sanitized_anchor_name v0.0.0-20170918181015-86672fcb3f95 h1:/vdW8Cb7EXrkqWGufVMES1OH2sU9gKVb2n9/1y5NMBY=
github.com/shurcooL/sanitized_anchor_name v0.0.0-20170918181015-86672fcb3f95/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/spf13/pflag v1.0.0 h1:oaPbdDe/x0UncahuwiPxW1GYJyilRAdsPnq3e1yaPcI=
github.com/spf13/pflag v1.0.0/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/src-d/gcfg v1.3.0 h1:2BEDr8r0I0b8h/fOqwtxCEiq2HJu8n2JGZJQFGXWLjg=
@@ -67,7 +67,7 @@ func process(arg string) ([]Match, error) {

var matches []Match
for k, v := range ls {
matches = append(matches, Match{k, v})
matches = append(matches, Match{k, v.Confidence})
}
sort.Slice(matches, func(i, j int) bool { return matches[i].Confidence > matches[j].Confidence })
return matches, nil
@@ -0,0 +1,8 @@
package api

// Match is a detection result of a license with a confidence (0.0 - 1.0)
// and a mapping of files to confidence.
type Match struct {
// Files maps a file name to the confidence that file produced for the license.
Files map[string]float32
// Confidence is the confidence of the match as a whole, from 0.0 to 1.0.
Confidence float32
}
@@ -6,8 +6,10 @@ import (
"sync"
"testing"

"github.com/stretchr/testify/assert"
"gopkg.in/src-d/go-license-detector.v2/licensedb/api"
"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"

"github.com/stretchr/testify/assert"
)

func TestDataset(t *testing.T) {
@@ -16,7 +18,7 @@ func TestDataset(t *testing.T) {
defer rootFiler.Close()
projects, err := rootFiler.ReadDir("")
assert.Nil(t, err)
licenses := map[string]map[string]float32{}
licenses := map[string]map[string]api.Match{}
mutex := sync.Mutex{}
wg := sync.WaitGroup{}
wg.Add(len(projects))

Some generated files are not rendered by default. Learn more.

@@ -8,6 +8,7 @@ import (
"strings"
"sync"

"gopkg.in/src-d/go-license-detector.v2/licensedb/api"
"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/processors"
)
@@ -64,11 +65,30 @@ var (
"^(%s)$", strings.Join(licenseFileNames, "|")))
)

// investigateCandidates scores every candidate text with f and groups the
// results by license name.
//
// candidates maps a file name to the text of that file. f is called once per
// text and returns a mapping of license name to confidence for it.
//
// The returned map is keyed by license name. For each license, Files records
// the confidence obtained from every file for which f reported that license,
// and Confidence is the largest of those values (it starts from zero, so it
// never ends up below zero).
func investigateCandidates(candidates map[string][]byte, f func(text []byte) map[string]float32) map[string]api.Match {
	matches := make(map[string]api.Match)
	for file, text := range candidates {
		for name, sim := range f(text) {
			match := matches[name]
			if match.Files == nil {
				// First sighting of this license: the zero-value Match
				// carries a nil map, which must be allocated before writing.
				match.Files = make(map[string]float32)
			}
			match.Files[file] = sim
			if sim > match.Confidence {
				match.Confidence = sim
			}
			// api.Match is stored by value, so the updated copy has to be
			// written back for the Confidence change to persist.
			matches[name] = match
		}
	}
	return matches
}

// ExtractLicenseFiles returns the list of possible license texts.
// The file names are matched against the template.
// fs is used to read file contents.
func ExtractLicenseFiles(files []string, fs filer.Filer) [][]byte {
candidates := [][]byte{}
func ExtractLicenseFiles(files []string, fs filer.Filer) map[string][]byte {
candidates := make(map[string][]byte)
for _, file := range files {
if licenseFileRe.MatchString(strings.ToLower(paths.Base(file))) {
text, err := fs.ReadFile(file)
@@ -84,7 +104,7 @@ func ExtractLicenseFiles(files []string, fs filer.Filer) [][]byte {
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
candidates[file] = text
}
}
}
@@ -93,18 +113,9 @@ func ExtractLicenseFiles(files []string, fs filer.Filer) [][]byte {

// InvestigateLicenseTexts takes the list of candidate license texts and returns the most probable
// reference licenses matched. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
func InvestigateLicenseTexts(texts [][]byte) map[string]float32 {
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateLicenseText(text)
for name, sim := range candidates {
maxSim := maxLicenses[name]
if sim > maxSim {
maxLicenses[name] = sim
}
}
}
return maxLicenses
// Furthermore, each match contains a mapping of filename to the confidence that file produced.
func InvestigateLicenseTexts(candidates map[string][]byte) map[string]api.Match {
	// Every candidate text is scored on its own by InvestigateLicenseText;
	// the shared helper then merges the per-file scores by license name.
	perText := InvestigateLicenseText
	return investigateCandidates(candidates, perText)
}

// InvestigateLicenseText takes the license text and returns the most probable reference licenses matched.
@@ -115,16 +126,16 @@ func InvestigateLicenseText(text []byte) map[string]float32 {

// ExtractReadmeFiles searches for README files.
// fs is used to read file contents.
func ExtractReadmeFiles(files []string, fs filer.Filer) [][]byte {
candidates := [][]byte{}
func ExtractReadmeFiles(files []string, fs filer.Filer) map[string][]byte {
candidates := make(map[string][]byte)
for _, file := range files {
if readmeFileRe.MatchString(strings.ToLower(file)) {
text, err := fs.ReadFile(file)
if err == nil {
if preprocessor, exists := filePreprocessors[paths.Ext(file)]; exists {
text = preprocessor(text)
}
candidates = append(candidates, text)
candidates[file] = text
}
}
}
@@ -133,18 +144,10 @@ func ExtractReadmeFiles(files []string, fs filer.Filer) [][]byte {

// InvestigateReadmeTexts scans README files for licensing information and outputs the
// probable names using NER.
func InvestigateReadmeTexts(texts [][]byte, fs filer.Filer) map[string]float32 {
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateReadmeText(text, fs)
for name, sim := range candidates {
maxSim := maxLicenses[name]
if sim > maxSim {
maxLicenses[name] = sim
}
}
}
return maxLicenses
func InvestigateReadmeTexts(candidates map[string][]byte, fs filer.Filer) map[string]api.Match {
	// InvestigateReadmeText needs fs in addition to the text, so bind fs in a
	// closure to obtain the single-argument callback investigateCandidates takes.
	return investigateCandidates(candidates, func(text []byte) map[string]float32 {
		return InvestigateReadmeText(text, fs)
	})
}

// InvestigateReadmeText scans the README file for licensing information and outputs probable
@@ -4,6 +4,7 @@ import (
"errors"
paths "path"

"gopkg.in/src-d/go-license-detector.v2/licensedb/api"
"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal"
)
@@ -15,7 +16,7 @@ var (

// Detect returns the most probable reference licenses matched for the given
// file tree. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
func Detect(fs filer.Filer) (map[string]float32, error) {
func Detect(fs filer.Filer) (map[string]api.Match, error) {
files, err := fs.ReadDir("")
if err != nil {
return nil, err

0 comments on commit 0ba3cd1

Please sign in to comment.
You can’t perform that action at this time.