-
Notifications
You must be signed in to change notification settings - Fork 8
/
filter.go
369 lines (326 loc) · 11.1 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
package filefilter
import (
"encoding/json"
"fmt"
"hash/fnv"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"sync"
"github.com/puzpuzpuz/xsync"
"github.com/rs/zerolog"
ignore "github.com/sabhiram/go-gitignore"
"gopkg.in/yaml.v3"
"github.com/snyk/snyk-ls/internal/progress"
"github.com/snyk/snyk-ls/internal/util"
)
// defaultParallelism is the preferred number of concurrent ignore-check workers.
const defaultParallelism = 4

// parallelism is the number of concurrent goroutines that can be used to check if a file is ignored.
// It is set to the defaultParallelism, unless there are fewer CPU cores.
// For safety, it is set to be at least 1.
var parallelism = util.Max(1, util.Min(defaultParallelism, runtime.NumCPU()))

// semaphore is used to limit the number of concurrent CPU-heavy ignore checks.
// It is global because there can be several file filters running concurrently on the same machine.
var semaphore = make(chan struct{}, parallelism)
// FindNonIgnoredFiles is a convenience wrapper that builds a FileFilter for
// rootFolder and streams its non-ignored files.
// NOTE(review): progressTracker is currently unused — FindNonIgnoredFiles on
// the filter creates its own tracker internally. Kept for signature
// compatibility with existing callers; consider wiring it through or removing.
func FindNonIgnoredFiles(rootFolder string, logger *zerolog.Logger, progressTracker *progress.Tracker) <-chan string {
	return NewFileFilter(rootFolder, logger).FindNonIgnoredFiles()
}
// FileFilter walks a repository and filters out files matched by the ignore
// files (.gitignore, .dcignore, .snyk) found in its folder tree.
type FileFilter struct {
	// The path to the root of the repository
	repoRoot string
	// ignoreFiles are the ignore-file names looked for in every folder.
	ignoreFiles []string
	// globsPerFolder maps a folder path to the ignore globs in effect there;
	// child folders inherit their parent's globs (see collectGlobs).
	globsPerFolder map[string][]string
	logger zerolog.Logger
	// cache holds per-folder filtering results keyed by folder path,
	// invalidated via a content hash (see cachedResults.Hash).
	cache *xsync.MapOf[string, cachedResults]
}
// cachedResults stores the outcome of filtering a single folder so the work
// can be skipped when the folder's content hash is unchanged.
type cachedResults struct {
	// Hash of the folder's globs, files, and child folders at filter time.
	Hash uint64
	// FilteredFiles are the folder's direct files that were NOT ignored.
	FilteredFiles []string
	// FilteredChildFolders are the immediate child folders that were NOT ignored.
	FilteredChildFolders []string
}
// folderContent is the JSON-serialized snapshot of a folder used as hash input.
type folderContent struct {
	Files   []string
	Globs   []string
	Folders []string
}

// hashFolder computes a stable FNV-1a hash over a folder's ignore globs,
// direct files, and child folders; it is used as a cache key to detect
// folder changes.
//
// Files and folders are sorted on copies so the hash is independent of
// directory-listing order WITHOUT mutating the caller's slices (the original
// implementation sorted the input slices in place, reordering the caller's
// data as a side effect). Globs are hashed in their given order because
// ignore-rule precedence is order-sensitive.
func hashFolder(globs, files, folders []string) (uint64, error) {
	// Sort copies, not the caller's slices.
	sortedFiles := append([]string(nil), files...)
	sort.Strings(sortedFiles)
	sortedFolders := append([]string(nil), folders...)
	sort.Strings(sortedFolders)

	data := folderContent{
		Files:   sortedFiles,
		Globs:   globs,
		Folders: sortedFolders,
	}
	dataBytes, err := json.Marshal(data)
	if err != nil {
		return 0, err
	}

	h := fnv.New64a()
	if _, err = h.Write(dataBytes); err != nil {
		return 0, err
	}
	return h.Sum64(), nil
}
// NewFileFilter creates a FileFilter rooted at rootFolder, recognizing
// .gitignore, .dcignore, and .snyk ignore files, with a scoped logger and an
// empty results cache.
func NewFileFilter(rootFolder string, logger *zerolog.Logger) *FileFilter {
	scopedLogger := logger.With().
		Str("component", "FileFilter").
		Str("repoRoot", rootFolder).
		Logger()

	return &FileFilter{
		repoRoot:       rootFolder,
		ignoreFiles:    []string{".gitignore", ".dcignore", ".snyk"},
		globsPerFolder: map[string][]string{},
		logger:         scopedLogger,
		cache:          xsync.NewMapOf[cachedResults](),
	}
}
// FindNonIgnoredFiles returns a channel of non-ignored files in the repository.
// The channel is closed when all files have been processed.
func (f *FileFilter) FindNonIgnoredFiles() <-chan string {
	tracker := progress.NewTracker(false)
	tracker.BeginWithMessage(
		"Snyk Code: Collecting files in \""+f.repoRoot+"\"",
		"Evaluating ignores and counting files...",
	)

	out := make(chan string)
	go func() {
		// Close the channel and finish progress reporting even on error.
		defer close(out)
		defer tracker.EndWithMessage("Collected files")

		if err := f.processFolders(f.repoRoot, tracker, out); err != nil {
			f.logger.Err(err).Msg("Error during filepath.WalkDir")
		}
	}()
	return out
}
// processFolders walks through the folder structure recursively and filters
// files and folders based on the ignore files, sending non-ignored files to
// results. It returns cached results when the folder's content hash is
// unchanged.
//
// Fix: the original kept the hash-calculation error in `err` after logging it;
// when hashing failed and there were no child folders, that stale, already
// handled error was returned from the final `return err` even though
// processing succeeded. The hash error is now tracked separately and only
// disables caching.
func (f *FileFilter) processFolders(folderPath string, progressTracker *progress.Tracker, results chan<- string) error {
	progressTracker.ReportWithMessage(10, fmt.Sprintf("Collecting files in %s", folderPath))
	c, err := f.collectFolderFiles(folderPath)
	if err != nil {
		return err
	}
	files := c.Files
	globs := c.Globs
	childFolders := c.Folders

	// Attempt to retrieve cached results. A hash failure is logged and only
	// prevents caching; it must not fail the walk.
	hash, hashErr := hashFolder(globs, files, childFolders)
	if hashErr != nil {
		f.logger.Err(hashErr).Msg("Error during hash calculation")
	} else if cacheEntry, found := f.cache.Load(folderPath); found && hash == cacheEntry.Hash {
		// Cache hit - returning cached results.
		for _, file := range cacheEntry.FilteredFiles {
			results <- file
		}
		for _, childFolder := range cacheEntry.FilteredChildFolders {
			if err := f.processFolders(childFolder, progressTracker, results); err != nil {
				return err
			}
		}
		return nil
	}

	// If results were not cached, filter files and folders, and store the results in the cache.
	progressTracker.ReportWithMessage(20, fmt.Sprintf("Filtering files in %s", folderPath))
	filteredFiles, filteredChildFolders := f.filterFilesInFolder(globs, files, childFolders, results)
	for _, child := range filteredChildFolders {
		// Only process child folders that are not ignored.
		if err := f.processFolders(child, progressTracker, results); err != nil {
			return err
		}
	}

	if hashErr == nil { // Only cache results when hash calculation was successful
		f.cache.Store(folderPath, cachedResults{
			Hash:                 hash,
			FilteredFiles:        filteredFiles,
			FilteredChildFolders: filteredChildFolders,
		})
	}
	return nil
}
// filterFilesInFolder applies the compiled ignore globs to a folder's direct
// files and child folders. Non-ignored files are both sent to results and
// collected into filteredFiles (for caching); non-ignored child folders are
// only collected. File checks run concurrently, throttled by the global
// semaphore because MatchesPath is CPU-heavy.
//
// Fix: removed the dead `ignoreParser = nil` assignment (a write to a local
// just before return has no effect — the original comment itself questioned
// it) and moved semaphore acquisition before the deferred release for the
// conventional acquire-then-defer-release ordering (same semantics).
func (f *FileFilter) filterFilesInFolder(globs []string,
	files []string,
	childFolders []string,
	results chan<- string,
) (filteredFiles []string, filteredChildFolders []string) {
	ignoreParser := ignore.CompileIgnoreLines(globs...) // This is memory heavy

	var wg sync.WaitGroup
	var resultsLock sync.Mutex
	for _, file := range files {
		wg.Add(1)
		go func(file string) {
			defer wg.Done()
			semaphore <- struct{}{}          // Acquire semaphore
			defer func() { <-semaphore }()   // Release semaphore
			if !ignoreParser.MatchesPath(file) {
				resultsLock.Lock()
				filteredFiles = append(filteredFiles, file)
				resultsLock.Unlock()
				results <- file
			}
		}(file)
	}

	// Child folders are checked serially on the calling goroutine, still
	// throttled by the shared semaphore.
	for _, childFolder := range childFolders {
		semaphore <- struct{}{} // Acquire semaphore
		if !ignoreParser.MatchesPath(childFolder) {
			filteredChildFolders = append(filteredChildFolders, childFolder)
		}
		<-semaphore // Release semaphore
	}
	wg.Wait()
	return filteredFiles, filteredChildFolders
}
// collectFolderFiles collects the top-level files and child folders of a folder,
// along with the ignore rules (globs) that apply to it.
func (f *FileFilter) collectFolderFiles(folderPath string) (folderContent, error) {
	var content folderContent

	// WalkDir visits folderPath itself first, then its direct entries. Child
	// folders are recorded and skipped (SkipDir) rather than descended into,
	// because they might be ignored entirely.
	walkErr := filepath.WalkDir(folderPath, func(path string, entry os.DirEntry, err error) error {
		switch {
		case err != nil:
			return err
		case !entry.IsDir():
			// A top-level file: record its path.
			content.Files = append(content.Files, path)
			return nil
		case path == folderPath:
			// The root folder itself: collect its globs and remember them
			// so child folders can inherit them later.
			content.Globs = f.collectGlobs(path)
			f.globsPerFolder[path] = content.Globs
			return nil
		default:
			// A child folder: record it and skip its subtree for now.
			content.Folders = append(content.Folders, path)
			return filepath.SkipDir
		}
	})
	return content, walkErr
}
// collectGlobs returns the ignore globs in effect for path: built-in defaults
// at the repository root, the parent folder's globs otherwise, plus rules
// parsed from any ignore files found directly in path.
//
// Fix: when an ignore file could not be read, the original logged the error
// but still handed the nil content to the parsers; it now skips the file.
// Also added the missing space in the read-error log message.
func (f *FileFilter) collectGlobs(path string) []string {
	var globs []string
	folderPath := path
	if path == f.repoRoot {
		folderPath = f.repoRoot
		defaultGlobs := []string{"**/.git/**", "**/.svn/**", "**/.hg/**", "**/.bzr/**", "**/.DS_Store/**"}
		globs = append(globs, defaultGlobs...)
	} else {
		// Non-root folders inherit their parent's globs.
		globs = append(globs, f.globsPerFolder[filepath.Dir(path)]...)
	}

	for _, ignoreFile := range f.ignoreFiles {
		ignoreFilePath := filepath.Join(path, ignoreFile)
		fileInfo, err := os.Stat(ignoreFilePath)
		fileFound := err == nil && !fileInfo.IsDir()
		if !fileFound {
			continue
		}
		content, err := os.ReadFile(ignoreFilePath)
		if err != nil {
			f.logger.Err(err).Msg("Can't parse ignore file " + ignoreFilePath)
			continue // Unreadable file: nothing to parse.
		}
		if filepath.Base(ignoreFilePath) == ".snyk" { // .snyk files are yaml files and should be parsed differently
			parsedRules, err := parseDotSnykFile(content, folderPath)
			globs = append(globs, parsedRules...)
			if err != nil {
				f.logger.Err(err).Msg("Can't parse .snyk file")
			}
		} else { // .gitignore, .dcignore, etc. are just a list of ignore rules
			parsedRules := parseIgnoreFile(content, folderPath)
			globs = append(globs, parsedRules...)
		}
	}
	return globs
}
// parseDotSnykFile extracts ignore globs from a .snyk YAML file, expanding
// every rule under exclude.code and exclude.global relative to baseDir.
func parseDotSnykFile(content []byte, baseDir string) ([]string, error) {
	type DotSnykRules struct {
		Exclude struct {
			Code   []string `yaml:"code"`
			Global []string `yaml:"global"`
		} `yaml:"exclude"`
	}

	var rules DotSnykRules
	if err := yaml.Unmarshal(content, &rules); err != nil {
		return nil, err
	}

	// Both sections are expanded identically, code rules first.
	var globs []string
	for _, section := range [][]string{rules.Exclude.Code, rules.Exclude.Global} {
		for _, rule := range section {
			globs = append(globs, parseIgnoreRuleToGlobs(rule, baseDir)...)
		}
	}
	return globs, nil
}
// parseIgnoreFile converts the lines of a gitignore-style file into globs
// rooted at baseDir, skipping comment lines and blank lines.
func parseIgnoreFile(content []byte, baseDir string) (ignores []string) {
	ignores = []string{}
	for _, line := range strings.Split(string(content), "\n") {
		// Comments start with '#'; blank lines carry no rule.
		if strings.TrimSpace(line) == "" || strings.HasPrefix(line, "#") {
			continue
		}
		ignores = append(ignores, parseIgnoreRuleToGlobs(line, baseDir)...)
	}
	return ignores
}
// parseIgnoreRuleToGlobs translates one gitignore-style rule into one or two
// glob patterns anchored at baseDir.
func parseIgnoreRuleToGlobs(rule string, baseDir string) (globs []string) {
	// Shamelessly stolen from code-client: https://github.com/snyk/code-client/blob/7a9e5cdbed4e8a6a0f2597fcd64b67800279e585/src/files.ts#L67
	// Mappings from .gitignore format to glob format:
	// `/foo/` => `/foo/**` (meaning: Ignore root (not sub) foo dir and its paths underneath.)
	// `/foo` => `/foo/**`, `/foo` (meaning: Ignore root (not sub) file and dir and its paths underneath.)
	// `foo/` => `**/foo/**` (meaning: Ignore (root/sub) foo dirs and their paths underneath.)
	// `foo` => `**/foo/**`, `foo` (meaning: Ignore (root/sub) foo files and dirs and their paths underneath.)
	const (
		negation = "!"
		slash    = "/"
		all      = "**"
	)
	baseDir = filepath.ToSlash(baseDir)

	prefix := ""
	if strings.HasPrefix(rule, negation) {
		prefix = negation
		rule = strings.TrimPrefix(rule, negation)
	}

	anchored := strings.HasPrefix(rule, slash) || strings.HasPrefix(rule, all)
	dirOnly := strings.HasSuffix(rule, slash)
	trailingGlobstar := strings.HasSuffix(rule, all)

	// addGlob joins the parts, normalizes separators, and applies the
	// negation prefix.
	addGlob := func(parts ...string) {
		globs = append(globs, filepath.ToSlash(prefix+filepath.Join(parts...)))
	}

	if anchored {
		// case `/foo/`, `/foo` => `{baseDir}/foo/**`
		// case `**/foo/`, `**/foo` => `{baseDir}/**/foo/**`
		if !trailingGlobstar {
			addGlob(baseDir, rule, all)
		}
		// case `/foo` => `{baseDir}/foo`
		// case `**/foo` => `{baseDir}/**/foo`
		// case `/foo/**` => `{baseDir}/foo/**`
		// case `**/foo/**` => `{baseDir}/**/foo/**`
		if !dirOnly {
			addGlob(baseDir, rule)
		}
	} else {
		// case `foo/`, `foo` => `{baseDir}/**/foo/**`
		if !trailingGlobstar {
			addGlob(baseDir, all, rule, all)
		}
		// case `foo` => `{baseDir}/**/foo`
		// case `foo/**` => `{baseDir}/**/foo/**`
		if !dirOnly {
			addGlob(baseDir, all, rule)
		}
	}
	return globs
}