Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] fast GitHub match with GetCommit API #63

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/match-identities/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func main() {
if err != nil {
logrus.Fatalf("unable to load the blacklist: %v", err)
}
people, nameFreqs, err := idmatch.FindPeople(ctx, connStr, args.Cache, blacklist)
people, nameFreqs, err := idmatch.FindPeople(ctx, connStr, args.Cache, blacklist, extmatcher)
if err != nil {
logrus.Fatalf("unable to find people: %v", err)
}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: '3.2'
services:
gitbase:
image: srcd/gitbase:v0.23.1
image: srcd/gitbase:v0.24.0-beta3
restart: unless-stopped
ports:
- 3306:3306
Expand All @@ -20,4 +20,4 @@ services:
POSTGRES_PASSWORD: superset
POSTGRES_USER: superset
ports:
- 5432:5432
- 5432:5432
6 changes: 6 additions & 0 deletions external/cached_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ func (m CachedMatcher) DumpCache() error {
return m.cache.DumpCache()
}

//FIXME
func (m CachedMatcher) ScanCommit(ctx context.Context, repo, email, commit string) error {
cs := m.matcher.(CommitScanner)
return cs.ScanCommit(ctx, repo, email, commit)
}

// MatchByEmail returns the latest GitHub user with the given email.
// If email was fetched already it uses cached value.
// MatchByEmail runs `matcher.MatchByEmail` if not.
Expand Down
184 changes: 133 additions & 51 deletions external/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,21 @@ package external
import (
"context"
"net/http"
"regexp"
"strconv"
"strings"
"time"

"github.com/sirupsen/logrus"
"golang.org/x/oauth2"
"gopkg.in/google/go-github.v15/github"
"github.com/google/go-github/v28/github"
)

// GitHubMatcher matches emails and GitHub users.
type GitHubMatcher struct {
client *github.Client
emailToUserCache map[string]string
userToNameCache map[string]string
}

// NewGitHubMatcher creates a new matcher given a GitHub token.
Expand All @@ -35,21 +38,36 @@ func NewGitHubMatcher(apiURL, token string) (Matcher, error) {
if err != nil {
return GitHubMatcher{}, err
}
return GitHubMatcher{client}, nil
return GitHubMatcher{
client: client,
emailToUserCache: make(map[string]string),
userToNameCache: make(map[string]string),
}, nil
}

var searchOpts = &github.SearchOptions{
Sort: "joined",
ListOptions: github.ListOptions{PerPage: 1},
}

// MatchByEmail returns the latest GitHub user with the given email.
func (m GitHubMatcher) MatchByEmail(ctx context.Context, email string) (user, name string, err error) {
finished := make(chan struct{})
go func() {
defer func() { finished <- struct{}{} }()
// CommitScan
func (m GitHubMatcher) ScanCommit(ctx context.Context, repo, email, commit string) error {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of doing this, FindPeople could just save the repo and commit for later use in ReducePeople.

I think the CommitterScan interface and ScanCommit function wouldn't be part of the final solution.

logrus.Infof("scanning commit %s %s", repo, commit)
p := regexp.MustCompile(`.*github.com/([^/]+)/([^/]+?)(?:\.git)?$`) //TODO: initialize statically
match := p.FindStringSubmatch(repo)
if len(match) == 0 {
logrus.Infof("no github", repo, commit)
return nil
}

// do not check more than 1 commit per author/committer email
if _, ok := m.emailToUserCache[email]; ok {
logrus.Infof("email %s already checked", email)
return nil
}

var numFailures uint64
//TODO: refactor out, repeated in MatchByEmail
var numFailures uint64
const maxNumFailures = 8
const (
success = 0
Expand All @@ -72,7 +90,7 @@ func (m GitHubMatcher) MatchByEmail(ctx context.Context, email string) (user, na
response.Response.Header["X-Ratelimit-Reset"][0], 10, 64)
if err != nil {
logrus.Errorf("Bad X-Ratelimit-Reset header: %v. %s",
err, response.String())
err)
return fail
}
resetTime := time.Unix(t, 0).Add(time.Second)
Expand All @@ -83,7 +101,7 @@ func (m GitHubMatcher) MatchByEmail(ctx context.Context, email string) (user, na

if err != nil || code >= 500 && code < 600 || code == 408 || code == 429 {
sleepTime := time.Duration((1 << numFailures) * int64(time.Second))
logrus.Warnf("HTTP %d: %s. %s. Sleeping until %s", code, err, response.String(),
logrus.Warnf("HTTP %d: %s. Sleeping until %s", code, err,
time.Now().UTC().Add(sleepTime))
time.Sleep(sleepTime)
numFailures++
Expand All @@ -92,59 +110,123 @@ func (m GitHubMatcher) MatchByEmail(ctx context.Context, email string) (user, na
}
return retry
}
logrus.Warnf("HTTP %d: %s. %s", code, err, response.String())
logrus.Warnf("HTTP %d: %s", code, err)
return fail
}

query := email + " in:email"
for { // api rate limit retry loop
if isNoReplyEmail(email) {
user = userFromEmail(email)
for {
c, resp, err := m.client.Repositories.GetCommit(ctx, match[1], match[2], commit)
status := check(resp, err)
if status == retry {
continue
} else if status == fail {
return err
}

//TODO: c.Author and c.Committer are the same type, refactor when
// committer support is added.
if c.Author != nil {
if c.Author.Login != nil {
logrus.Infof("found login: %s -> %s", email, *c.Author.Login)
m.emailToUserCache[email] = *c.Author.Login
} else {
var result *github.UsersSearchResult
var response *github.Response
result, response, err = m.client.Search.Users(ctx, query, searchOpts)
status := check(response, err)
if status == retry {
continue
} else if status == fail {
return
}
if len(result.Users) == 0 {
if strings.Contains(query, "@") {
// Hacking time! user+domain may work instead of user@domain
query = strings.Replace(query, "@", " ", 1)
continue
}
logrus.Warnf("unable to find users for email: %s", email)
err = ErrNoMatches
return
}
user = result.Users[0].GetLogin()
break
logrus.Infof("login not found: %s", email)
m.emailToUserCache[email] = ""
}
}

for { // api rate limit retry loop
var u *github.User
var response *github.Response
u, response, err = m.client.Users.Get(ctx, user)
status := check(response, err)
if status == retry {
continue
} else if status == fail {
break
}

return nil
}

// MatchByEmail returns the latest GitHub user with the given email.
func (m GitHubMatcher) MatchByEmail(ctx context.Context, email string) (user, name string, err error) {
var numFailures uint64
const maxNumFailures = 8
const (
success = 0
retry = 1
fail = 2
)
check := func(response *github.Response, err error) int {
code := response.Response.StatusCode
if err == nil && code >= 200 && code < 300 {
return success
}

rateLimitHit := false
if val, exists := response.Response.Header["X-Ratelimit-Remaining"]; code == 403 &&
exists && len(val) == 1 && val[0] == "0" {
rateLimitHit = true
}
if rateLimitHit {
t, err := strconv.ParseInt(
response.Response.Header["X-Ratelimit-Reset"][0], 10, 64)
if err != nil {
logrus.Errorf("Bad X-Ratelimit-Reset header: %v",
err)
return fail
}
resetTime := time.Unix(t, 0).Add(time.Second)
logrus.Warnf("rate limit was hit, waiting until %s", resetTime.String())
time.Sleep(resetTime.Sub(time.Now().UTC()))
return retry
}

if err != nil || code >= 500 && code < 600 || code == 408 || code == 429 {
sleepTime := time.Duration((1 << numFailures) * int64(time.Second))
logrus.Warnf("HTTP %d: %s. Sleeping until %s", code, err,
time.Now().UTC().Add(sleepTime))
time.Sleep(sleepTime)
numFailures++
if numFailures > maxNumFailures {
return fail
}
return retry
}
logrus.Warnf("HTTP %d: %s", code, err)
return fail
}

if isNoReplyEmail(email) {
user = userFromEmail(email)
} else {
u, ok := m.emailToUserCache[email]
if ok {
if u == "" {
logrus.Infof("MatchByEmail commit scan cache empty of %s", email)
return
}
user = u.GetLogin()
name = u.GetName()

logrus.Infof("MatchByEmail succeeded with commit scan cache %s", email)
user = u
} else {
logrus.Infof("MatchByEmail no cache %s", email)
return
}
}

if n, ok := m.userToNameCache[user]; ok {
name = n
return
}

for { // api rate limit retry loop
var u *github.User
var response *github.Response
u, response, err = m.client.Users.Get(ctx, user)
status := check(response, err)
if status == retry {
continue
} else if status == fail {
return
}
}()
select {
case <-finished:
user = u.GetLogin()
name = u.GetName()
m.userToNameCache[user] = name
return
case <-ctx.Done():
return "", "", context.Canceled
}
}

Expand Down
6 changes: 6 additions & 0 deletions external/matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ type Matcher interface {
MatchByEmail(ctx context.Context, email string) (user, name string, err error)
}

// CommitScanner defines an external matching service that can scan commit
// hashes before matching emails.
type CommitScanner interface {
ScanCommit(ctx context.Context, repo, email, commit string) error
}

// MatcherConstructor is the Matcher constructor function type.
type MatcherConstructor func(apiURL, token string) (Matcher, error)

Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ module github.com/src-d/eee-identity-matching
require (
github.com/apache/thrift v0.12.0 // indirect
github.com/briandowns/spinner v1.6.1
github.com/davecgh/go-spew v1.1.1
github.com/go-sql-driver/mysql v1.4.1
github.com/golang/snappy v0.0.1 // indirect
github.com/google/go-github v17.0.0+incompatible // indirect
github.com/google/go-github/v28 v28.1.1
github.com/mjibson/esc v0.2.0
github.com/pkg/errors v0.8.1 // indirect
github.com/sirupsen/logrus v1.3.0
Expand All @@ -20,5 +21,4 @@ require (
golang.org/x/text v0.3.0
golang.org/x/tools v0.0.0-20190709211700-7b25e351ac0e
gonum.org/v1/gonum v0.0.0-20190624220246-e34e6b933b2b
gopkg.in/google/go-github.v15 v15.0.0
)
11 changes: 7 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ github.com/golang/protobuf v1.2.0 h1:P3YflyNX/ehuJFLhxviNdFxQPkGK5cDcApsge1SqnvM
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-github v17.0.0+incompatible h1:N0LgJ1j65A7kfXrZnUDaYCs/Sf4rEjNlfyDHW9dolSY=
github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ=
github.com/google/go-github/v28 v28.1.1 h1:kORf5ekX5qwXO2mGzXXOjMe/g6ap8ahVe0sBEulhSxo=
github.com/google/go-github/v28 v28.1.1/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk=
github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck=
github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk=
Expand All @@ -32,6 +32,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME=
github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/smola/go-github/v28 v28.9.0 h1:WFZAtB7cFI/R2fk/ZqT6bwHEDXXYstHoHF/u6/HB5sQ=
github.com/smola/go-github/v28 v28.9.0/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down Expand Up @@ -62,12 +64,14 @@ golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73r
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20181106182150-f42d05182288/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190219183015-4b83411ed2b3 h1:OFrJ/rEn4wUq1PbIHcIdfOGIy3uCTe5XOealV2ngn/w=
golang.org/x/oauth2 v0.0.0-20190219183015-4b83411ed2b3/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4 h1:YUO/7uOKsKeq9UokNS62b8FYywz3ker1l1vDZRCRefw=
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
Expand All @@ -85,8 +89,7 @@ gonum.org/v1/gonum v0.0.0-20190624220246-e34e6b933b2b h1:B1drcdqog/XZuRy27GcSpP9
gonum.org/v1/gonum v0.0.0-20190624220246-e34e6b933b2b/go.mod h1:03dgh78c4UvU1WksguQ/lvJQXbezKQGJSrwwRq5MraQ=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
gopkg.in/google/go-github.v15 v15.0.0 h1:cT2oL8cepTN0y1qjicixn/r4ZRwn6/pkkO1JNoMls2g=
gopkg.in/google/go-github.v15 v15.0.0/go.mod h1:l5hcbHSKRLPHjIKB0rFqYBNFUOFpa6iG6+JdaxRuqMo=
5 changes: 5 additions & 0 deletions matching.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package idmatch
import (
"context"
"fmt"
"math"
"sort"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -206,6 +207,10 @@ func ReducePeople(people People, matcher external.Matcher, blacklist Blacklist,
}
}
mean, std := stat.MeanStdDev(componentsSize, nil)
// If std is NaN, serialization to JSON will fail
if math.IsNaN(std) {
std = 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LOL, this is the fix I had to do independently in #69

}
reporter.Commit("connected component size mean", mean)
reporter.Commit("connected component size std", std)
reporter.Commit("connected component size max", floats.Max(componentsSize))
Expand Down
Loading