-
Notifications
You must be signed in to change notification settings - Fork 24
/
indexer.go
445 lines (391 loc) · 13.4 KB
/
indexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
package indexer
import (
"context"
"encoding/json"
"fmt"
"io/fs"
"io/ioutil"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"github.com/hashicorp/go-multierror"
"github.com/pkg/errors"
"github.com/sourcegraph/doctree/doctree/apischema"
"github.com/sourcegraph/doctree/doctree/git"
"github.com/sourcegraph/doctree/doctree/schema"
)
// Language describes an indexer for a specific language.
//
// Implementations are stored in the Registered map by Register, and the package-level IndexDir
// runs the ones whose extensions appear in the directory being indexed.
type Language interface {
// Name of the language this indexer works for. Its ID field is the key used in the Registered
// map and in the result map returned by the package-level IndexDir.
Name() schema.Language
// Extensions returns a list of file extensions commonly associated with the language.
// The package-level IndexDir matches these against extensions with the leading dot removed
// ("txt", not ".txt").
Extensions() []string
// IndexDir indexes a directory of code likely to contain sources in this language recursively.
IndexDir(ctx context.Context, dir string) (*schema.Index, error)
}
// Registered indexers by language ID ("go", "objc", "cpp", etc.)
//
// Register writes to this map and IndexDir reads it; neither takes a lock in this file.
// NOTE(review): presumably all registration happens before any indexing starts — confirm.
var Registered = map[string]Language{}
// Register adds a doctree language indexer to the Registered map, keyed by the ID of the
// language reported by its Name method. An indexer already stored under that ID is replaced.
func Register(indexer Language) {
	id := indexer.Name().ID
	Registered[id] = indexer
}
// IndexDir indexes the specified directory recursively. It looks at the file extension of every
// entry in the directory tree, and then asks the registered indexers whose extensions were seen
// to index the directory.
//
// Every matching indexer runs exactly once, in its own goroutine, even when several of its
// extensions are present (previously it ran once per matching extension, repeating the work and
// racing to store the same result).
//
// Returns the successful indexes keyed by language ID, and any errors. Both can be non-nil when
// some indexers succeed while others fail. A failure to walk dir or to resolve its absolute path
// returns a nil map.
func IndexDir(ctx context.Context, dir string) (map[string]*schema.Index, error) {
	// Identify all file extensions in the directory recursively.
	extensions := map[string]struct{}{}
	if err := fs.WalkDir(os.DirFS(dir), ".", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err // error walking dir
		}
		ext := filepath.Ext(path)
		if ext != "" && ext != "." {
			ext = ext[1:] // ".txt" -> "txt"
			extensions[ext] = struct{}{}
		}
		return nil
	}); err != nil {
		return nil, errors.Wrap(err, "WalkDir")
	}

	// Select the indexers to run. Keying by language ID guarantees that an indexer handling
	// several of the extensions found (e.g. both "c" and "h") is only selected once.
	selected := map[string]Language{}
	for _, language := range Registered {
		for _, ext := range language.Extensions() {
			if _, found := extensions[ext]; found {
				selected[language.Name().ID] = language
				break
			}
		}
	}

	absDir, err := filepath.Abs(dir)
	if err != nil {
		return nil, errors.Wrap(err, "Abs")
	}

	// Run indexers for each language.
	var (
		wg      sync.WaitGroup
		mu      sync.Mutex // guards errs and results
		errs    error
		results = map[string]*schema.Index{}
	)
	// TODO: configurable parallelism?
	for _, indexer := range selected {
		indexer := indexer // per-iteration copy for the goroutine (needed before Go 1.22)
		wg.Add(1)
		go func() {
			defer wg.Done()

			start := time.Now()
			index, err := indexer.IndexDir(ctx, dir)
			if index != nil {
				// Git metadata is best-effort: lookup errors are deliberately ignored and
				// leave the fields at whatever the helpers returned.
				// NOTE(review): presumably because dir need not be a Git checkout — confirm.
				index.GitRepository, _ = git.URIForFile(dir)
				index.GitCommitID, _ = git.RevParse(dir, false, "HEAD")
				index.GitRefName, _ = git.RevParse(dir, true, "HEAD")
				index.DurationSeconds = time.Since(start).Seconds()
				index.CreatedAt = time.Now().Format(time.RFC3339)
				index.Directory = absDir
			}

			mu.Lock()
			defer mu.Unlock()
			if err != nil {
				errs = multierror.Append(errs, errors.Wrap(err, indexer.Name().ID+": IndexDir"))
			} else {
				results[indexer.Name().ID] = index
			}
		}()
	}
	wg.Wait()

	return results, errs
}
// WriteIndexes writes indexes to the index data directory:
//
//	index/<project_name>/<language_id>
//
// Any existing directory for the project is removed first, so the result contains only the
// languages present in indexes. Each index is JSON-encoded into a file named after its map key.
//
// Each file is closed as soon as it has been written, and a failure to close is reported,
// because a close error can mean the encoded data never reached the disk.
//
// It returns the first error encountered; files written before the failure are left in place.
func WriteIndexes(projectName string, indexDataDir string, indexes map[string]*schema.Index) error {
	// TODO: binary format?
	// TODO: compression

	// Ensure paths are absolute first. Index ID is absolute path of indexed directory effectively.
	var err error
	indexDataDir, err = filepath.Abs(indexDataDir)
	if err != nil {
		return errors.Wrap(err, "Abs")
	}
	outDir := filepath.Join(indexDataDir, encodeProjectName(projectName))

	// Delete any old index data in this dir (e.g. if we had python+go before, but now only go, we
	// need to delete python index.)
	if err := os.RemoveAll(outDir); err != nil {
		return errors.Wrap(err, "RemoveAll")
	}
	if err := os.MkdirAll(outDir, os.ModePerm); err != nil {
		return errors.Wrap(err, "MkdirAll")
	}

	for lang, index := range indexes {
		f, err := os.Create(filepath.Join(outDir, lang))
		if err != nil {
			return errors.Wrap(err, "Create")
		}
		// Close explicitly rather than with defer: a defer inside the loop would keep every
		// file open until the function returns and would discard the close error.
		encodeErr := json.NewEncoder(f).Encode(index)
		closeErr := f.Close()
		if encodeErr != nil {
			return errors.Wrap(encodeErr, "Encode")
		}
		if closeErr != nil {
			return errors.Wrap(closeErr, "Close")
		}
	}
	return nil
}
// List returns the decoded project name of every subdirectory found directly inside the index
// data directory. Non-directory entries are skipped.
//
// A missing index data directory is not an error: it yields an empty, non-nil slice.
func List(indexDataDir string) ([]string, error) {
	entries, err := ioutil.ReadDir(indexDataDir)
	switch {
	case os.IsNotExist(err):
		return []string{}, nil
	case err != nil:
		return nil, errors.Wrap(err, "ReadDir")
	}

	projects := []string{}
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		projects = append(projects, decodeProjectName(entry.Name()))
	}
	return projects, nil
}
var (
// Cache of indexes read from disk, because reading+decoding can be slow (~500ms for e.g.
// the golang/go index)
//
// GetIndex keys entries by the index file's path and reuses an entry only while the file's
// modification time still matches the recorded modTime. jsonDecodeCacheMu guards every access
// to jsonDecodeCache. Nothing in this file removes entries from the cache.
jsonDecodeCacheMu sync.RWMutex
jsonDecodeCache = map[string]struct {
modTime time.Time
value schema.Index
}{}
)
// GetIndex gets all the language indexes for the specified project.
//
// Every regular file in the project's index directory is treated as a language index named after
// the file, except "search-index.sinter" and "version". Decoded indexes are cached in memory and
// reused while the file's modification time is unchanged.
//
// When autoCloneMissing is true, if the project does not exist the server will attempt to
// `git clone <projectName>` and index it. Beware, this may not be safe to enable if you have Git
// configured to access private repositories and the server is public!
//
// It returns an error if the encoded project name could escape the index directory, if the
// project directory cannot be read (including when it does not exist and was not cloned), or if
// any index file cannot be opened, inspected or decoded.
func GetIndex(ctx context.Context, dataDir, indexDataDir, projectName string, autoCloneMissing bool) (apischema.ProjectIndexes, error) {
	indexName := encodeProjectName(projectName)
	if strings.Contains(indexName, "/") || strings.Contains(indexName, "..") {
		return nil, errors.New("potentially malicious index name (this is likely a bug)")
	}

	indexes := apischema.ProjectIndexes{}
	dir, err := ioutil.ReadDir(filepath.Join(indexDataDir, indexName))
	if os.IsNotExist(err) {
		if autoCloneMissing {
			repositoryURL := "https://" + projectName
			log.Println("cloning", repositoryURL)
			if err := cloneAndIndex(ctx, repositoryURL, dataDir); err != nil {
				log.Println("failed to clone", repositoryURL, "error:", err)
				return nil, errors.Wrap(err, "cloneAndIndex")
			}
			// Retry once with auto-cloning disabled so a project that still has no index
			// cannot cause unbounded recursion.
			return GetIndex(ctx, dataDir, indexDataDir, projectName, false)
		}
	}
	if err != nil {
		return nil, errors.Wrap(err, "ReadDir")
	}

	for _, info := range dir {
		if info.IsDir() || info.Name() == "search-index.sinter" || info.Name() == "version" {
			continue
		}
		lang := info.Name()
		decoded, err := loadIndexFile(filepath.Join(indexDataDir, indexName, lang))
		if err != nil {
			return nil, err
		}
		indexes[lang] = decoded
	}
	return indexes, nil
}

// loadIndexFile returns the index stored at indexFile, serving it from jsonDecodeCache when the
// file's modification time matches the cached entry and decoding it (and refreshing the cache)
// otherwise. The file is closed before returning, so callers looping over many index files hold
// at most one open at a time.
func loadIndexFile(indexFile string) (schema.Index, error) {
	var none schema.Index

	f, err := os.Open(indexFile)
	if err != nil {
		return none, errors.Wrap(err, "Open")
	}
	defer f.Close() // read-only handle: a close error cannot lose data

	stat, err := f.Stat()
	if err != nil {
		return none, errors.Wrap(err, "Stat")
	}
	modTime := stat.ModTime()

	jsonDecodeCacheMu.RLock()
	cached, ok := jsonDecodeCache[indexFile]
	jsonDecodeCacheMu.RUnlock()
	// Equal compares the instants themselves; == would also compare location and monotonic data.
	if ok && cached.modTime.Equal(modTime) {
		return cached.value, nil
	}

	var decoded schema.Index
	if err := json.NewDecoder(f).Decode(&decoded); err != nil {
		return none, errors.Wrap(err, "Decode")
	}

	jsonDecodeCacheMu.Lock()
	jsonDecodeCache[indexFile] = struct {
		modTime time.Time
		value   schema.Index
	}{
		modTime: modTime,
		value:   decoded,
	}
	jsonDecodeCacheMu.Unlock()

	return decoded, nil
}
// CloneAndIndexIfOutdated shallow-clones repositoryURL into a temporary directory and, when the
// HEAD commit of that clone differs from indexedCommit, runs the indexers on it and stores the
// results for projectName under dataDir. When the commits match, nothing is indexed.
//
// The temporary clone is removed before returning. Errors come from creating the temporary
// directory, from `git clone` (with its combined output attached), from resolving HEAD, or from
// RunIndexers.
func CloneAndIndexIfOutdated(ctx context.Context, projectName, repositoryURL, dataDir, indexedCommit string) error {
	// Clone the repository into a temp dir.
	tmpDir, err := os.MkdirTemp(os.TempDir(), "doctree-clone")
	if err != nil {
		return errors.Wrap(err, "TempDir")
	}
	defer os.RemoveAll(tmpDir)

	clone := exec.CommandContext(ctx, "git", "clone", "--depth=1", repositoryURL, "repo/")
	clone.Dir = tmpDir
	if output, err := clone.CombinedOutput(); err != nil {
		return fmt.Errorf("git %s: %v\n%s", strings.Join(clone.Args, " "), err, output)
	}

	checkout := filepath.Join(tmpDir, "repo")
	head, err := git.RevParse(checkout, false, "HEAD")
	if err != nil {
		return errors.Wrap(err, "RevParse")
	}
	if head == indexedCommit {
		// The stored index already reflects the latest commit.
		return nil
	}

	// Index the repository.
	if err := RunIndexers(ctx, checkout, dataDir, projectName); err != nil {
		return errors.Wrap(err, "RunIndexers")
	}
	return nil
}
// cloneAndIndex shallow-clones repositoryURL into a temporary directory, runs the indexers on
// the clone and stores the results under dataDir. The project name is the repository URL with
// its "https://" prefix removed. The temporary clone is removed before returning.
func cloneAndIndex(ctx context.Context, repositoryURL, dataDir string) error {
	// Clone the repository into a temp dir.
	tmpDir, err := os.MkdirTemp(os.TempDir(), "doctree-clone")
	if err != nil {
		return errors.Wrap(err, "TempDir")
	}
	defer os.RemoveAll(tmpDir)

	clone := exec.CommandContext(ctx, "git", "clone", "--depth=1", repositoryURL, "repo/")
	clone.Dir = tmpDir
	if output, err := clone.CombinedOutput(); err != nil {
		return fmt.Errorf("git %s: %v\n%s", strings.Join(clone.Args, " "), err, output)
	}

	// Index the repository.
	checkout := filepath.Join(tmpDir, "repo")
	projectName := strings.TrimPrefix(repositoryURL, "https://")
	if err := RunIndexers(ctx, checkout, dataDir, projectName); err != nil {
		return errors.Wrap(err, "RunIndexers")
	}
	return nil
}
// encodeProjectName turns a project name into a single path element by replacing every "/"
// with "---".
func encodeProjectName(name string) string {
	const (
		separator = "/"
		escaped   = "---"
	)
	return strings.Replace(name, separator, escaped, -1)
}
// decodeProjectName reverses encodeProjectName by replacing every "---" with "/".
// Occurrences are matched left to right without overlap, so a name that contained "---" before
// encoding does not survive the round trip.
func decodeProjectName(name string) string {
	const (
		escaped   = "---"
		separator = "/"
	)
	return strings.Replace(name, escaped, separator, -1)
}
// AutoIndexedProject describes a project to be auto-indexed. It serializes to JSON with a
// single "name" field.
// NOTE(review): not referenced elsewhere in this file; presumably used by the auto-indexing
// code in another file — confirm.
type AutoIndexedProject struct {
// Name of the project to be auto-indexed
Name string `json:"name"`
}
// RunIndexers runs all the registered language indexers along with the search indexer and
// stores the results for projectName under dataDir ("<dataDir>/index/<encoded project name>").
//
// Steps, in order: ensure the data directory exists, index dir, write the language indexes that
// were produced, build the search index, and finally write the project's version file. If the
// search index or the version file cannot be written, the project directory is removed.
//
// If an error is returned, it may be the case that some indexers succeeded while others failed.
// Errors from the individual steps are accumulated into a single multierror.
func RunIndexers(ctx context.Context, dir, dataDir, projectName string) error {
	var err error

	// Ensure the doctree data dir exists, and that it has a version file.
	if err := ensureDataDir(dataDir); err != nil {
		return errors.Wrap(err, "ensureDataDir")
	}

	// IndexDir may partially complete, with some indexers succeeding while others fail. In this
	// case indexes and indexErr are both != nil.
	indexes, indexErr := IndexDir(ctx, dir)
	for _, index := range indexes {
		fmt.Printf("%v: indexed %v files (%v bytes) in %v\n", index.Language.ID, index.NumFiles, index.NumBytes, time.Duration(index.DurationSeconds*float64(time.Second)).Round(time.Millisecond))
	}
	if indexErr != nil {
		err = multierror.Append(err, errors.Wrap(indexErr, "IndexDir"))
	}

	// Write indexes that we did produce.
	indexDataDir := filepath.Join(dataDir, "index")
	writeErr := WriteIndexes(projectName, indexDataDir, indexes)
	if writeErr != nil {
		err = multierror.Append(err, errors.Wrap(writeErr, "WriteIndexes"))
	}

	// Index for search the indexes that we did produce.
	projectDir := filepath.Join(indexDataDir, encodeProjectName(projectName))
	searchErr := IndexForSearch(projectName, indexDataDir, indexes)
	if searchErr != nil {
		if rmErr := os.RemoveAll(projectDir); rmErr != nil {
			err = multierror.Append(err, errors.Wrap(rmErr, "RemoveAll"))
		}
		err = multierror.Append(err, errors.Wrap(searchErr, "IndexForSearch"))
		// The project directory was just discarded, so there is nothing left to stamp with a
		// version; writing the file now could only fail or resurrect a half-built directory.
		return err
	}

	// Write a version number file.
	versionErr := os.WriteFile(filepath.Join(projectDir, "version"), []byte(projectDirVersion), 0o666)
	if versionErr != nil {
		if rmErr := os.RemoveAll(projectDir); rmErr != nil {
			err = multierror.Append(err, errors.Wrap(rmErr, "RemoveAll"))
		}
		// Wrap versionErr itself: wrapping searchErr (nil on this path) would drop the failure.
		err = multierror.Append(err, errors.Wrap(versionErr, "WriteFile (version)"))
	}

	return err
}
// The version stored in e.g. ~/.doctree/index/<project>/version - indicating the version of the
// project directory. If we need to change search indexing, add support for more languages, etc.
// this file is how we'd determine which directories need to be re-indexed / removed.
//
// RunIndexers writes this value after indexing a project, and RunMigrations compares the stored
// value against it to find projects that need to be re-indexed or removed.
//
// An incrementing integer. No relation to other version numbers.
const projectDirVersion = "2"
// The version stored in e.g. ~/.doctree/version - indicating the version of the overall data
// directory. If we need to change the directory structure in some way, change the autoindex file
// format, etc. this is what we'd use to determine when to do that.
//
// ensureDataDir writes this value when the data directory has no version file yet; nothing in
// this file reads it back.
//
// An incrementing integer. No relation to other version numbers.
const dataDirVersion = "1"
// ensureDataDir makes sure dataDir exists and contains a "version" file. When the version file
// is missing, the directory is created if needed and the file is written with dataDirVersion.
// An existing version file is left untouched; its contents are not inspected.
func ensureDataDir(dataDir string) error {
	versionPath := filepath.Join(dataDir, "version")

	_, statErr := os.Stat(versionPath)
	if !os.IsNotExist(statErr) {
		// Either the version file already exists (statErr is nil) or Stat failed for some
		// other reason, which is reported to the caller unchanged.
		return statErr
	}

	// Create the directory if needed.
	if mkdirErr := os.MkdirAll(dataDir, os.ModePerm); mkdirErr != nil {
		return errors.Wrap(mkdirErr, "MkdirAll")
	}

	// Write the version info.
	return os.WriteFile(versionPath, []byte(dataDirVersion), 0o666)
}
// RunMigrations handles upgrades to new doctree versions, if necessary.
//
// For every project in indexDataDir whose version file is missing or differs from
// projectDirVersion: in cloud mode the project is re-cloned from "https://<project name>" and
// re-indexed (a failure is logged and the remaining projects are still processed); otherwise the
// project's index directory is removed so that it can be indexed again later.
//
// It returns an error if the projects cannot be listed, a version file cannot be read for a
// reason other than not existing, or an outdated project directory cannot be removed.
func RunMigrations(ctx context.Context, cloudMode bool, dataDir, indexDataDir string) error {
	names, err := List(indexDataDir)
	if err != nil {
		return errors.Wrap(err, "List")
	}

	for _, projectName := range names {
		projectDir := filepath.Join(indexDataDir, encodeProjectName(projectName))

		version, readErr := os.ReadFile(filepath.Join(projectDir, "version"))
		missing := os.IsNotExist(readErr)
		if readErr != nil && !missing {
			return errors.Wrap(readErr, "Read project version")
		}
		if !missing && string(version) == projectDirVersion {
			continue // already at the current version
		}

		// Project dir version has changed. Need to reindex.
		if !cloudMode {
			// Auto indexer should index the project again, or if not in auto index list then
			// user needs to rerun index command manually.
			log.Println("migration: doctree schema has changed, removing:", projectName)
			if err := os.RemoveAll(projectDir); err != nil {
				return errors.Wrap(err, "RemoveAll")
			}
			continue
		}

		log.Println("migration: doctree schema has changed, reindexing:", projectName)
		repositoryURL := "https://" + projectName
		log.Println("cloning", repositoryURL)
		if err := cloneAndIndex(ctx, repositoryURL, dataDir); err != nil {
			log.Println("migration: failed to reindex", repositoryURL, err)
		}
	}
	return nil
}