Skip to content

Commit

Permalink
Change the rate limit strategy
Browse files Browse the repository at this point in the history
We currently use the token bucket strategy to avoid rate limit in Github API. The
problem is when there're other programs concurrently querying Github with the
same account, this could increase the number of Forbidden status code from
Github API. To avoid this we should drop the token bucket strategy and start
respecting the Github response HTTP headers.

Closes #2
  • Loading branch information
rafaeljusto committed Oct 28, 2015
1 parent 81cf93b commit 0a5af08
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 82 deletions.
57 changes: 0 additions & 57 deletions gddoexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"time"

"github.com/golang/gddo/database"
"github.com/juju/ratelimit"
)

// unused stores the time that an unmodified project is considered unused.
Expand All @@ -23,22 +22,6 @@ const commitsPeriod = 7 * 24 * time.Hour
// a list of packages
const agents = 4

// RateLimit controls the number of requests sent to the Github API for
// authenticated and unauthenticated scenarios using the token bucket
// strategy. For more information on the values, please check:
// https://developer.github.com/v3/#rate-limiting
var RateLimit = struct {
FillInterval time.Duration
Capacity int64
AuthFillInterval time.Duration
AuthCapacity int64
}{
FillInterval: time.Minute,
Capacity: agents,
AuthFillInterval: time.Second,
AuthCapacity: agents,
}

// gddoDB contains all used methods from Database type of
// github.com/golang/gddo/database. This is useful for mocking and building
// tests.
Expand Down Expand Up @@ -103,41 +86,21 @@ func ShouldArchivePackages(packages []database.Package, db gddoDB, auth *GithubA
out := make(chan ArchiveResponse, agents)

go func() {
var bucket *ratelimit.Bucket
if auth == nil {
bucket = ratelimit.NewBucket(RateLimit.FillInterval, RateLimit.Capacity)
} else {
bucket = ratelimit.NewBucket(RateLimit.AuthFillInterval, RateLimit.AuthCapacity)
}

var wg sync.WaitGroup
wg.Add(agents)

in := make(chan database.Package)

for i := 0; i < agents; i++ {
go func() {
// if the go routine retrieve a response from cache, it can run again
// without waiting for a token, as no hit was made in the Github API
wait := true
for p := range in {
if wait {
bucket.Wait(1)
} else {
wait = true
}

archive, cache, err := ShouldArchivePackage(p, db, auth)
out <- ArchiveResponse{
Path: p.Path,
Archive: archive,
Cache: cache,
Error: err,
}

if cache {
wait = false
}
}

wg.Done()
Expand Down Expand Up @@ -223,41 +186,21 @@ func AreFastForkPackages(packages []database.Package, auth *GithubAuth) <-chan F
out := make(chan FastForkResponse, agents)

go func() {
var bucket *ratelimit.Bucket
if auth == nil {
bucket = ratelimit.NewBucket(RateLimit.FillInterval, RateLimit.Capacity)
} else {
bucket = ratelimit.NewBucket(RateLimit.AuthFillInterval, RateLimit.AuthCapacity)
}

var wg sync.WaitGroup
wg.Add(agents)

in := make(chan database.Package)

for i := 0; i < agents; i++ {
go func() {
// if the go routine retrieve a response from cache, it can run again
// without waiting for a token, as no hit was made in the Github API
wait := true
for p := range in {
if wait {
bucket.Wait(1)
} else {
wait = true
}

fastFork, cache, err := IsFastForkPackage(p, auth)
out <- FastForkResponse{
Path: p.Path,
FastFork: fastFork,
Cache: cache,
Error: err,
}

if cache {
wait = false
}
}

wg.Done()
Expand Down
76 changes: 53 additions & 23 deletions gddoexp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"reflect"
"sort"
"strconv"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -243,7 +244,7 @@ func TestShouldArchivePackage(t *testing.T) {
expectedError: gddoexp.NewError("github.com/rafaeljusto/gddoexp", gddoexp.ErrorCodeGithubFetch, fmt.Errorf("i'm a crazy error")),
},
{
description: "it should fail when the HTTP status code from Github API is 403 Forbidden (ratelimit)",
description: "it should fail when the HTTP status code from Github API is 403 Forbidden (ratelimit) without reset HTTP header",
path: "github.com/rafaeljusto/gddoexp",
db: databaseMock{
importerCountMock: func(path string) (int, error) {
Expand All @@ -259,6 +260,57 @@ func TestShouldArchivePackage(t *testing.T) {
},
expectedError: gddoexp.NewError("github.com/rafaeljusto/gddoexp", gddoexp.ErrorCodeGithubForbidden, nil),
},
{
description: "it should fail when the HTTP status code from Github API is 403 Forbidden (ratelimit) with reset HTTP header",
path: "github.com/rafaeljusto/gddoexp",
db: databaseMock{
importerCountMock: func(path string) (int, error) {
return 0, nil
},
},
httpClient: httpClientMock{
getMock: func() func(string) (*http.Response, error) {
var requestNumber int

return func(url string) (*http.Response, error) {
requestNumber++

if url != "https://api.github.com/repos/rafaeljusto/gddoexp" {
return &http.Response{
StatusCode: http.StatusBadRequest,
}, nil
}

switch requestNumber {
case 1:
return &http.Response{
StatusCode: http.StatusForbidden,
Header: http.Header{
"X-Ratelimit-Reset": []string{strconv.FormatInt(time.Now().Add(10*time.Millisecond).Unix(), 10)},
},
}, nil

case 2:
return &http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(bytes.NewBufferString(`{
"created_at": "2010-08-03T21:56:23Z",
"forks_count": 194,
"network_count": 194,
"stargazers_count": 1133,
"updated_at": "` + time.Now().Add(-2*365*24*time.Hour).Format(time.RFC3339) + `"
}`)),
}, nil
}

return &http.Response{
StatusCode: http.StatusInternalServerError,
}, nil
}
}(),
},
expected: true,
},
{
description: "it should fail when the HTTP status code from Github API is 404 Not Found",
path: "github.com/rafaeljusto/gddoexp",
Expand Down Expand Up @@ -552,17 +604,6 @@ func TestShouldArchivePackages(t *testing.T) {
return r.Header.Get("Cache") == "1"
}

rateLimitBkp := gddoexp.RateLimit
defer func() {
gddoexp.RateLimit = rateLimitBkp
}()

// Change rate limit parameters to speed up the test
gddoexp.RateLimit.FillInterval = time.Millisecond
gddoexp.RateLimit.Capacity = 100
gddoexp.RateLimit.AuthFillInterval = time.Millisecond
gddoexp.RateLimit.AuthCapacity = 100

for i, item := range data {
gddoexp.HTTPClient = item.httpClient

Expand Down Expand Up @@ -1146,17 +1187,6 @@ func TestAreFastForkPackages(t *testing.T) {
return r.Header.Get("Cache") == "1"
}

rateLimitBkp := gddoexp.RateLimit
defer func() {
gddoexp.RateLimit = rateLimitBkp
}()

// Change rate limit parameters to speed up the test
gddoexp.RateLimit.FillInterval = time.Millisecond
gddoexp.RateLimit.Capacity = 100
gddoexp.RateLimit.AuthFillInterval = time.Millisecond
gddoexp.RateLimit.AuthCapacity = 100

for i, item := range data {
gddoexp.HTTPClient = item.httpClient

Expand Down
24 changes: 22 additions & 2 deletions github.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@ import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"time"
)

const (
// githubRateLimitResetHeader is the HTTP header label that stores the time
// at which the current rate limit window resets in UTC epoch seconds.
githubRateLimitResetHeader = "X-Ratelimit-Reset"
)

// httpClient contains all used methods to perform requests in Github API.
// This is useful for mocking and building tests.
type httpClient interface {
Expand Down Expand Up @@ -117,8 +124,11 @@ func normalizePath(path string) (string, error) {

// doGithub is the low level function that do actually the work of querying
// Github API and parsing the response. It is also responsible for verifying if
// the response was a cache hit or not.
// the response was a cache hit or not. If Github API answers with Forbidden
// status code, the function will sleep until the next available round to query
// again for the information.
func doGithub(path, url string, obj interface{}) (cache bool, err error) {
query:
rsp, err := HTTPClient.Get(url)
if err != nil {
return false, NewError(path, ErrorCodeGithubFetch, err)
Expand All @@ -133,10 +143,20 @@ func doGithub(path, url string, obj interface{}) (cache bool, err error) {
switch rsp.StatusCode {
case http.StatusOK:
// valid response

case http.StatusForbidden:
return false, NewError(path, ErrorCodeGithubForbidden, nil)
epoch, err := strconv.ParseInt(rsp.Header.Get(githubRateLimitResetHeader), 10, 64)
if err != nil {
return false, NewError(path, ErrorCodeGithubForbidden, nil)
}

resetAt := time.Unix(epoch, 0)
time.Sleep(resetAt.Sub(time.Now()))
goto query

case http.StatusNotFound:
return false, NewError(path, ErrorCodeGithubNotFound, nil)

default:
return false, NewError(path, ErrorCodeGithubStatusCode, nil)
}
Expand Down

0 comments on commit 0a5af08

Please sign in to comment.