Merge pull request #101 from mcarmonaa/feature/add-stars-to-pga
Add stars field to pga index
jfontan committed Feb 13, 2019
2 parents 17f732e + 41fe311 commit eb71a82
Showing 8 changed files with 247 additions and 16 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1 +1,5 @@
.idea

**/.ci
**/build

19 changes: 13 additions & 6 deletions PublicGitArchive/pga-create/README.md
@@ -58,14 +58,21 @@ To process the downloaded repositories you will need the `pga-create index` command
The same environment variables as in borges can be used to configure the database access.
```
pga-create index -debug -logfile=borges-indexer.log
pga-create index --debug --logfile=pga-create-index.log
```
The arguments accepted by borges indexer are the following:
* `-debug`: print more verbose logs that can be used for debugging purposes
* `-logfile=<LOGFILE PATH>`: path to the file where logs will be written
* `-limit=N`: max number of repositories to process (useful for batch processing)
* `-offset=N`: skip the first N repositories (useful for batch processing)
The options accepted by `pga-create index` are the following:
```
-o, --output= csv file path with the results (default: data/index.csv)
--debug show debug logs
--logfile= write logs to file
--limit= max number of repositories to process
--offset= skip initial n repositories
--workers= number of workers to use (defaults to number of CPUs)
--repos-file= path to a file with a repository per line, only those will be processed
-s, --stars= input path for the file with the numbers of stars per repository (default: data/stars.gz)
-r, --repositories= input path for the gzipped file with the repository names and identifiers (default: data/repositories.gz)
```
**NOTE:** by default this spawns as many workers as there are CPUs available on the machine. Take into account that some repositories can be considerably large, so this process may consume a large amount of memory.
18 changes: 11 additions & 7 deletions PublicGitArchive/pga-create/cmd/pga-create/indexer.go
@@ -13,13 +13,15 @@ import (
)

type indexCommand struct {
Output string `short:"o" long:"output" default:"data/index.csv" description:"csv file path with the results"`
Debug bool `long:"debug" description:"show debug logs"`
LogFile string `long:"logfile" description:"write logs to file"`
Limit uint64 `long:"limit" description:"max number of repositories to process"`
Offset uint64 `long:"offset" description:"skip initial n repositories"`
Workers int `long:"workers" description:"number of workers to use (defaults to number of CPUs)"`
ReposFile string `long:"repos-file" description:"path to a file with a repository per line, only those will be processed"`
Output string `short:"o" long:"output" default:"data/index.csv" description:"csv file path with the results"`
Debug bool `long:"debug" description:"show debug logs"`
LogFile string `long:"logfile" description:"write logs to file"`
Limit uint64 `long:"limit" description:"max number of repositories to process"`
Offset uint64 `long:"offset" description:"skip initial n repositories"`
Workers int `long:"workers" description:"number of workers to use (defaults to number of CPUs)"`
ReposFile string `long:"repos-file" description:"path to a file with a repository per line, only those will be processed"`
Stars string `short:"s" long:"stars" default:"data/stars.gz" description:"input path for the file with the numbers of stars per repository"`
Repositories string `short:"r" long:"repositories" default:"data/repositories.gz" description:"input path for the gzipped file with the repository names and identifiers"`
}

func (c *indexCommand) Execute(args []string) error {
@@ -71,6 +73,8 @@ func (c *indexCommand) Execute(args []string) error {
c.Limit,
c.Offset,
repos,
c.Repositories,
c.Stars,
)

return nil
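The new `Stars` and `Repositories` fields use the same `short`/`long`/`default`/`description` struct tags as the existing options, which match the convention of the jessevdk/go-flags library. Assuming that is the flag parser in use, the sketch below shows how such a tagged command struct gets wired to the CLI; the parser name, descriptions, and trimmed-down struct are illustrative, not taken from the repository:

```go
package main

import (
	"fmt"
	"os"

	flags "github.com/jessevdk/go-flags"
)

// indexCommand mirrors two fields of the option struct shown in the diff above.
type indexCommand struct {
	Output string `short:"o" long:"output" default:"data/index.csv" description:"csv file path with the results"`
	Stars  string `short:"s" long:"stars" default:"data/stars.gz" description:"input path for the file with the numbers of stars per repository"`
}

// Execute satisfies the go-flags Commander interface; the real command calls Index from here.
func (c *indexCommand) Execute(args []string) error {
	fmt.Printf("writing index to %s using stars from %s\n", c.Output, c.Stars)
	return nil
}

func main() {
	parser := flags.NewNamedParser("pga-create", flags.Default)
	// Registering the tagged struct is what makes "pga-create index -s data/stars.gz" fill Stars.
	if _, err := parser.AddCommand("index", "build the PGA index", "", &indexCommand{}); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if _, err := parser.Parse(); err != nil {
		os.Exit(1)
	}
}
```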
23 changes: 22 additions & 1 deletion PublicGitArchive/pga-create/cmd/pga-create/select.go
@@ -88,6 +88,7 @@ func selectRepos(parameters selectionParameters) {
}
defer gzf.Close()
scanner := bufio.NewScanner(gzf)
var count int
for scanner.Scan() {
var repoID int
var repoName string
@@ -100,9 +101,17 @@
fail("parsing repositories file "+parameters.ReposFile, err)
}
if selectedRepos[repoID] {
count++
bar.Increment()
fmt.Fprintf(os.Stdout, parameters.URLTemplate+"\n", repoName)
}
// Stop scanning once every selected repository has been printed.
if count >= len(selectedRepos) {
break
}
}

if sErr := scanner.Err(); sErr != nil {
fail("scanning repositories file "+parameters.ReposFile, sErr)
}
}

@@ -112,7 +121,14 @@ func filterStars(path string, minStars int, topN int, selectedRepos map[int]bool
fail("opening stars file "+path, err)
}
defer f.Close()
scanner := bufio.NewScanner(f)

// The stars file is gzip-compressed, so wrap it in a gzip reader before scanning.
gzf, err := gzip.NewReader(f)
if err != nil {
fail("decompressing stars file "+path, err)
}
defer gzf.Close()

scanner := bufio.NewScanner(gzf)
repos := map[int]bool{}
var stars int
for scanner.Scan() {
@@ -139,6 +155,11 @@
break
}
}

if sErr := scanner.Err(); sErr != nil {
fail("scanning stars file "+path, sErr)
}

return repos
}

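Both `selectRepos` and the updated `filterStars` now follow the same pattern: open the file, wrap it in a gzip reader, scan it line by line, parse the whitespace-separated integers with `fmt.Sscan`, and check `scanner.Err()` at the end. A self-contained sketch of that pattern, with an illustrative file path and the same "id stars" layout the stars file is assumed to have:

```go
package main

import (
	"bufio"
	"compress/gzip"
	"fmt"
	"log"
	"os"
)

func main() {
	// Open a gzipped file of "id stars" lines; the path is illustrative.
	f, err := os.Open("data/stars.gz")
	if err != nil {
		log.Fatalf("opening stars file: %v", err)
	}
	defer f.Close()

	// Decompress on the fly rather than scanning the raw gzip bytes.
	gzf, err := gzip.NewReader(f)
	if err != nil {
		log.Fatalf("decompressing stars file: %v", err)
	}
	defer gzf.Close()

	scanner := bufio.NewScanner(gzf)
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}
		var id, stars int
		// Each line carries a repository ID and its star count separated by whitespace.
		if _, err := fmt.Sscan(line, &id, &stars); err != nil {
			log.Fatalf("parsing line %q: %v", line, err)
		}
		fmt.Printf("repository %d has %d stars\n", id, stars)
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanning stars file: %v", err)
	}
}
```

The early `break` once `count` reaches `len(selectedRepos)` follows the same idea as the count-based exits added in `index.go`: once every selected repository has been seen, there is no need to scan the rest of a potentially very large file.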
177 changes: 176 additions & 1 deletion PublicGitArchive/pga-create/index.go
@@ -1,9 +1,14 @@
package indexer

import (
"bufio"
"compress/gzip"
"encoding/csv"
"fmt"
"io"
"os"
"os/signal"
"strings"

"github.com/sirupsen/logrus"
"gopkg.in/src-d/core-retrieval.v0/model"
@@ -21,6 +26,8 @@ func Index(
limit uint64,
offset uint64,
list []string,
reposIDPath string,
starsPath string,
) {
f, err := createOutputFile(outputFile)
if err != nil {
@@ -40,9 +47,14 @@
logrus.WithField("err", err).Fatal("unable to get result set")
}

// Map every repository name to its number of stars so the index rows can carry the new field.
stars, err := getRepoToStars(reposIDPath, starsPath, list)
if err != nil {
logrus.WithField("err", err).Fatal("unable to get repositories' stars")
}

signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt)
repos := processRepos(workers, txer, rs)
repos := processRepos(workers, txer, rs, stars)
var processed int
for {
select {
@@ -126,3 +138,166 @@ func getResultSet(

return rs, total, nil
}

// getRepoToStars reads the gzipped repositories and stars files and returns a map from
// repository name to star count, restricted to the repositories in list when list is non-empty.
func getRepoToStars(reposIDPath, starsPath string, list []string) (map[string]int, error) {
r, err := os.Open(reposIDPath)
if err != nil {
return nil, err
}
defer r.Close()

s, err := os.Open(starsPath)
if err != nil {
return nil, err
}
defer s.Close()

rgz, err := gzip.NewReader(r)
if err != nil {
return nil, err
}
defer rgz.Close()

sgz, err := gzip.NewReader(s)
if err != nil {
return nil, err
}
defer sgz.Close()

var repoSet map[string]struct{}
if len(list) != 0 {
repoSet = reposListToSet(list)
}

repos, err := buildIDToRepo(rgz, repoSet)
if err != nil {
return nil, err
}

var idSet map[int]struct{}
if len(list) != 0 {
idSet = make(map[int]struct{}, len(repos))
for id := range repos {
idSet[id] = struct{}{}
}
}

stars, err := buildIDToStars(sgz, idSet)
if err != nil {
return nil, err
}

repoStars := make(map[string]int)
for id, repo := range repos {
// If id is not present in the stars map, that repo has no stars.
n, ok := stars[id]
if ok {
repoStars[repo] = n
}
}

return repoStars, nil
}

// reposListToSet converts a list of repository URLs into a set of repository names.
func reposListToSet(list []string) map[string]struct{} {
if len(list) == 0 {
return nil
}

repos := make(map[string]struct{}, len(list))
for _, url := range list {
name := trimRepoURL(url)
repos[name] = struct{}{}
}

return repos
}

// trimRepoURL extracts the repository name from a GitHub URL by dropping the https:// or
// git:// prefix (and the .git suffix for git:// URLs).
func trimRepoURL(url string) string {
const (
HTTPprefix = "https://github.com/"
SSHprefix = "git://github.com/"
suffix = ".git"
)

var repo string
if strings.HasPrefix(url, HTTPprefix) {
repo = strings.TrimPrefix(url, HTTPprefix)
} else if strings.HasPrefix(url, SSHprefix) {
repo = strings.TrimPrefix(url, SSHprefix)
repo = strings.TrimSuffix(repo, suffix)
}

return repo
}

// buildIDToRepo parses "id name" lines from r into a map from repository ID to name,
// keeping only names in repoSet when repoSet is non-empty.
func buildIDToRepo(r io.Reader, repoSet map[string]struct{}) (map[int]string, error) {
repos := make(map[int]string)
scanner := bufio.NewScanner(r)
var count int
for scanner.Scan() {
var (
id int
name string
)

line := scanner.Text()
if line == "" {
continue
}

if _, err := fmt.Sscan(line, &id, &name); err != nil {
return nil, err
}

_, ok := repoSet[name]
if len(repoSet) == 0 || ok {
repos[id] = name
count++
}

if len(repoSet) != 0 && count >= len(repoSet) {
break
}
}

if err := scanner.Err(); err != nil {
return nil, err
}

return repos, nil
}

// buildIDToStars parses "id stars" lines from r into a map from repository ID to star count,
// keeping only IDs in idSet when idSet is non-empty.
func buildIDToStars(r io.Reader, idSet map[int]struct{}) (map[int]int, error) {
stars := make(map[int]int)
scanner := bufio.NewScanner(r)
var count int
for scanner.Scan() {
var id, nstar int

line := scanner.Text()
if line == "" {
continue
}

if _, err := fmt.Sscan(line, &id, &nstar); err != nil {
return nil, err
}

_, ok := idSet[id]
if len(idSet) == 0 || ok {
stars[id] = nstar
count++
}

if len(idSet) != 0 && count >= len(idSet) {
break
}
}

if err := scanner.Err(); err != nil {
return nil, err
}

return stars, nil
}
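Taken together, `buildIDToRepo` and `buildIDToStars` give `getRepoToStars` two maps keyed by repository ID, which it joins into a single repository-name-to-stars map, dropping repositories that have no stars entry. A minimal in-memory illustration of that join; the repository names and star counts are invented for the example:

```go
package main

import "fmt"

func main() {
	// ID-to-name map, as buildIDToRepo would produce it.
	idToRepo := map[int]string{
		1: "src-d/go-git",
		2: "src-d/datasets",
		3: "example/no-stars",
	}
	// ID-to-stars map, as buildIDToStars would produce it; ID 3 is deliberately absent.
	idToStars := map[int]int{
		1: 1500,
		2: 90,
	}

	// Join on the repository ID, keeping only repositories that have a stars entry.
	repoStars := make(map[string]int, len(idToRepo))
	for id, name := range idToRepo {
		if stars, ok := idToStars[id]; ok {
			repoStars[name] = stars
		}
	}

	fmt.Println(repoStars) // map[src-d/datasets:90 src-d/go-git:1500]
}
```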
