Skip to content

Commit

Permalink
RobotsTxt API supports variable user-agent
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Jun 18, 2017
1 parent 83485af commit 53b3ddb
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/main/java/com/qwazr/crawler/web/WebCrawlThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ private RobotsTxt.Status checkRobotsTxt(CurrentURI currentURI)
robotsTxt = RobotsTxt.download(driver.getProxy(), robotsTxtUserAgent, robotsTxtURI);
robotsTxtMap.put(robotsTxtURI, robotsTxt);
}
- return robotsTxt.getStatus(uri);
+ return robotsTxt.getStatus(uri, robotsTxtUserAgent);
} finally {
timeTracker.next("Robots.txt check");
}
Expand Down
19 changes: 9 additions & 10 deletions src/main/java/com/qwazr/crawler/web/robotstxt/RobotsTxt.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,14 @@ public enum Status {
private static final Logger logger = LoggerFactory.getLogger(RobotsTxt.class);

private final RobotsTxtUserAgentMap userAgentMap;
- private final String userAgent;
private final int httpStatusCode;

- public RobotsTxt(final String userAgent, final InputStream input, final Charset charset) throws IOException {
- this.userAgent = userAgent;
+ public RobotsTxt(final InputStream input, final Charset charset) throws IOException {
this.userAgentMap = RobotsTxtUserAgentMap.of(input, charset);
this.httpStatusCode = 200;
}

- RobotsTxt(final String userAgent, final int statusCode) {
- this.userAgent = userAgent;
+ RobotsTxt(final int statusCode) {
this.userAgentMap = null;
this.httpStatusCode = statusCode;
}
Expand Down Expand Up @@ -102,14 +99,16 @@ public static URI getRobotsURI(final URI uri) throws MalformedURLException, URIS
* @throws MalformedURLException
* @throws URISyntaxException
*/
- public final Status getStatus(final URI uri) throws MalformedURLException, URISyntaxException {
- final Status status = getStatusNoLogs(uri);
+ public final Status getStatus(final URI uri, final String userAgent)
+ throws MalformedURLException, URISyntaxException {
+ final Status status = getStatusNoLogs(uri, userAgent);
if (logger.isInfoEnabled())
logger.info("Check robots.txt returns " + status.name() + " for " + uri);
return status;
}

- private Status getStatusNoLogs(final URI uri) throws MalformedURLException, URISyntaxException {
+ private Status getStatusNoLogs(final URI uri, final String userAgent)
+ throws MalformedURLException, URISyntaxException {
switch (httpStatusCode) {
case 400:
case 404:
Expand Down Expand Up @@ -149,7 +148,7 @@ public static RobotsTxt download(final WebCrawlDefinition.ProxyDefinition proxy,
final ContentType contentType = ContentType.getOrDefault(entity);
final Charset charset = contentType.getCharset();
try (final InputStream is = entity.getContent()) {
- return new RobotsTxt(userAgent, is, charset == null ? CharsetUtils.CharsetUTF8 : charset);
+ return new RobotsTxt(is, charset == null ? CharsetUtils.CharsetUTF8 : charset);
}
} finally {
IOUtils.close((CloseableHttpResponse) response);
Expand All @@ -164,7 +163,7 @@ public static RobotsTxt download(final WebCrawlDefinition.ProxyDefinition proxy,
if (logger.isInfoEnabled())
logger.info("Get wrong status (" + sc + " code for: " + uri);
}
- return new RobotsTxt(userAgent, sc);
+ return new RobotsTxt(sc);
}
}

Expand Down

0 comments on commit 53b3ddb

Please sign in to comment.