Merge pull request #843 from sonatype/s3-scraper-fix

[REVIEW] S3 scraper fix
2 parents d7ec3c3 + 8141731, commit 1a148aa45318140d3459426ce97c40ec9fb586fc, @cstamas committed Apr 30, 2013
@@ -80,6 +80,7 @@ protected RemoteDetectionResult detectRemoteRepository( final ScrapeContext cont
{
String prefix = null;
Page initialPage = page;
+ String initialPageUrl = page.getUrl();
if ( initialPage.getHttpResponse().getStatusLine().getStatusCode() != 200 )
{
// we probably have the NoSuchKey response from S3, usually when repo root is not in bucket root
@@ -93,95 +94,103 @@ protected RemoteDetectionResult detectRemoteRepository( final ScrapeContext cont
return null;
}
// repo.remoteUrl does not have query parameters...
- String fixedUrl =
+ initialPageUrl =
context.getRemoteRepositoryRootUrl().substring( 0,
context.getRemoteRepositoryRootUrl().length() - prefix.length() );
- getLogger().debug( "Retrying URL {} to scrape Amazon S3 hosted repository on remote URL {}", fixedUrl,
+ getLogger().debug( "Retrying URL {} to scrape Amazon S3 hosted repository on remote URL {}", initialPageUrl,
context.getRemoteRepositoryRootUrl() );
- initialPage = Page.getPageFor( context, fixedUrl + "?prefix=" + prefix );
+ initialPage = Page.getPageFor( context, initialPageUrl + "?prefix=" + prefix );
}
final HashSet<String> entries = new HashSet<String>();
- diveIn( context, initialPage, initialPage.getUrl(), prefix, entries );
+ diveIn( context, initialPage, initialPageUrl, prefix, entries );
return new ArrayList<String>( entries );
}
// ==
- protected void diveIn( final ScrapeContext context, final Page page, final String rootUrl, final String prefix,
+ protected void diveIn( final ScrapeContext context, final Page firstPage, final String rootUrl, final String prefix,
final Set<String> entries )
throws IOException
{
- // cancelation
- CancelableUtil.checkInterruption();
-
- // response should be 200 OK, if not, give up
- if ( page.getHttpResponse().getStatusLine().getStatusCode() != 200 )
+ Page page = firstPage;
+ boolean truncated;
+ do
{
- context.stop( "Remote recognized as " + getTargetedServer()
- + ", but cannot be scraped (unexpected response status " + page.getHttpResponse().getStatusLine() + ")" );
- return;
- }
+ // check for truncation (isTruncated elem, this means we need to "page" the bucket to get all entries)
+ truncated = isTruncated( page );
- final Elements root = page.getDocument().getElementsByTag( "ListBucketResult" );
- if ( root.size() != 1 || !root.get( 0 ).attr( "xmlns" ).equals( "http://s3.amazonaws.com/doc/2006-03-01/" ) )
- {
- context.stop( "Remote recognized as " + getTargetedServer()
- + ", but unexpected response was received (not \"ListBucketResult\")." );
- return;
- }
+ // cancelation
+ CancelableUtil.checkInterruption();
- getLogger().debug( "Processing S3 page response from URL {}", page.getUrl() );
- String markerElement = null;
- final Elements elements = page.getDocument().getElementsByTag( "Contents" );
- for ( Element element : elements )
- {
- final Elements keyElements = element.getElementsByTag( "Key" );
- if ( keyElements.isEmpty() )
- {
- continue; // skip it
- }
- final Elements sizeElements = element.getElementsByTag( "Size" );
- if ( sizeElements.isEmpty() )
- {
- continue; // skip it
- }
- final String key = keyElements.get( 0 ).text();
- if ( key.startsWith( "." ) || key.contains( "/." ) )
+ // response should be 200 OK, if not, give up
+ if ( page.getHttpResponse().getStatusLine().getStatusCode() != 200 )
{
- continue; // skip it, it's a ".dot" file
+ context.stop( "Remote recognized as " + getTargetedServer()
+ + ", but cannot be scraped (unexpected response status " + page.getHttpResponse().getStatusLine() + ")" );
+ return;
}
- final long size = Long.parseLong( sizeElements.get( 0 ).text() );
- if ( size > 0 )
+
+ final Elements root = page.getDocument().getElementsByTag( "ListBucketResult" );
+ if ( root.size() != 1 || !root.get( 0 ).attr( "xmlns" ).equals( "http://s3.amazonaws.com/doc/2006-03-01/" ) )
{
- markerElement = key;
- // fix key with prefix
- final String fixedKey = prefix != null ? key.substring( prefix.length() ) : key;
- // we just need prefix from first few path elements (remote scrape depth)
- final String normalizedPath =
- PathUtils.pathFrom( PathUtils.elementsOf( fixedKey ), context.getScrapeDepth() );
- entries.add( normalizedPath );
+ context.stop( "Remote recognized as " + getTargetedServer()
+ + ", but unexpected response was received (not \"ListBucketResult\")." );
+ return;
}
- }
- if ( isTruncated( page ) )
- {
- // cancelation
- CancelableUtil.checkInterruption();
-
- final ArrayList<String> queryParams = new ArrayList<String>();
- if ( prefix != null )
+ getLogger().debug( "Processing S3 page response from URL {}", page.getUrl() );
+ String markerElement = null;
+ final Elements elements = page.getDocument().getElementsByTag( "Contents" );
+ for ( Element element : elements )
{
- queryParams.add( "prefix=" + prefix );
+ final Elements keyElements = element.getElementsByTag( "Key" );
+ if ( keyElements.isEmpty() )
+ {
+ continue; // skip it
+ }
+ final Elements sizeElements = element.getElementsByTag( "Size" );
+ if ( sizeElements.isEmpty() )
+ {
+ continue; // skip it
+ }
+ final String key = keyElements.get( 0 ).text();
+ if ( key.startsWith( "." ) || key.contains( "/." ) )
+ {
+ continue; // skip it, it's a ".dot" file
+ }
+ final long size = Long.parseLong( sizeElements.get( 0 ).text() );
+ if ( size > 0 )
+ {
+ markerElement = key;
+ // fix key with prefix
+ final String fixedKey = prefix != null ? key.substring( prefix.length() ) : key;
+ // we just need prefix from first few path elements (remote scrape depth)
+ final String normalizedPath =
+ PathUtils.pathFrom( PathUtils.elementsOf( fixedKey ), context.getScrapeDepth() );
+ entries.add( normalizedPath );
+ }
}
- if ( markerElement != null )
+
+ if ( truncated )
{
- queryParams.add( "marker=" + markerElement );
+ // cancelation
+ CancelableUtil.checkInterruption();
+
+ final ArrayList<String> queryParams = new ArrayList<String>();
+ if ( prefix != null )
+ {
+ queryParams.add( "prefix=" + prefix );
+ }
+ if ( markerElement != null )
+ {
+ queryParams.add( "marker=" + markerElement );
+ }
+ final String url = appendParameters( rootUrl, queryParams );
+ page = Page.getPageFor( context, url );
}
- final String url = appendParameters( rootUrl, queryParams );
- final Page nextPage = Page.getPageFor( context, url );
- diveIn( context, nextPage, rootUrl, prefix, entries );
}
+ while ( truncated );
}
protected String appendParameters( final String baseUrl, List<String> queryParams )
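
For context on the technique the rewritten diveIn uses: an S3 bucket listing (ListBucketResult) comes back in pages, and while IsTruncated is true the client must re-request the listing with a marker query parameter (the last key it has seen) until a non-truncated page arrives. Below is a minimal, self-contained sketch of that marker-based paging loop using only the JDK; the class name S3ListingPager, its method names, and the plain-HTTP/XML wiring are illustrative assumptions and are not part of the Nexus scraper shown in the diff above.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class S3ListingPager
{
    private final HttpClient client = HttpClient.newHttpClient();

    /**
     * Collects all keys of a (possibly truncated) S3 bucket listing by paging
     * with the "marker" query parameter, mirroring the do/while loop above.
     */
    public List<String> listAllKeys( final String bucketRootUrl, final String prefix )
        throws IOException, InterruptedException
    {
        final List<String> keys = new ArrayList<String>();
        String marker = null;
        boolean truncated;
        do
        {
            // build the listing URL: "prefix" narrows the listing, "marker" continues it
            // (keys are assumed to be URL-safe in this sketch)
            final StringBuilder url = new StringBuilder( bucketRootUrl ).append( "?" );
            if ( prefix != null )
            {
                url.append( "prefix=" ).append( prefix ).append( "&" );
            }
            if ( marker != null )
            {
                url.append( "marker=" ).append( marker );
            }

            final HttpResponse<byte[]> response =
                client.send( HttpRequest.newBuilder( URI.create( url.toString() ) ).GET().build(),
                    HttpResponse.BodyHandlers.ofByteArray() );
            if ( response.statusCode() != 200 )
            {
                throw new IOException( "Unexpected response status " + response.statusCode() );
            }

            final Document doc;
            try
            {
                doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                    .parse( new ByteArrayInputStream( response.body() ) );
            }
            catch ( Exception e )
            {
                throw new IOException( "Could not parse ListBucketResult", e );
            }

            // S3 sets <IsTruncated>true</IsTruncated> when more pages follow
            truncated = "true".equals( textOf( doc, "IsTruncated" ) );

            final NodeList keyNodes = doc.getElementsByTagName( "Key" );
            for ( int i = 0; i < keyNodes.getLength(); i++ )
            {
                final String key = keyNodes.item( i ).getTextContent();
                keys.add( key );
                marker = key; // the last key of this page becomes the marker for the next request
            }
        }
        while ( truncated );
        return keys;
    }

    private static String textOf( final Document doc, final String tag )
    {
        final NodeList nodes = doc.getElementsByTagName( tag );
        return nodes.getLength() == 0 ? null : nodes.item( 0 ).getTextContent();
    }
}

The structural point is the same as in the commit: the next page is fetched only while IsTruncated reports that more entries remain, so the listing completes in a loop instead of by recursion.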