Skip to content

Commit

Permalink
Merge branch 'master' into UseDefaultCredentials
Browse files Browse the repository at this point in the history
  • Loading branch information
Evirth committed Sep 12, 2018
2 parents e47539c + 080f9e3 commit 195b959
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 4 deletions.
45 changes: 42 additions & 3 deletions Abot.Tests.Unit/App.config
Expand Up @@ -62,9 +62,48 @@
</log4net>

<abot>
<crawlBehavior maxConcurrentThreads="11" maxPagesToCrawl="33" maxPagesToCrawlPerDomain="333" maxPageSizeInBytes="4444" userAgentString="aaaa" crawlTimeoutSeconds="44" downloadableContentTypes="bbbb" isUriRecrawlingEnabled="true" isExternalPageCrawlingEnabled="true" isExternalPageLinksCrawlingEnabled="true" httpServicePointConnectionLimit="21" httpRequestTimeoutInSeconds="22" httpRequestMaxAutoRedirects="23" isHttpRequestAutoRedirectsEnabled="true" isHttpRequestAutomaticDecompressionEnabled="true" isSendingCookiesEnabled="true" isSslCertificateValidationEnabled="false" isRespectUrlNamedAnchorOrHashbangEnabled="true" minAvailableMemoryRequiredInMb="25" maxMemoryUsageInMb="26" maxMemoryUsageCacheTimeInSeconds="27" maxCrawlDepth="28" maxLinksPerPage="29" isForcedLinkParsingEnabled="true" maxRetryCount="4" minRetryDelayInMilliseconds="4444" />
<politeness isRespectRobotsDotTextEnabled="true" isRespectMetaRobotsNoFollowEnabled="true" isRespectHttpXRobotsTagHeaderNoFollowEnabled="true" isRespectAnchorRelNoFollowEnabled="true" isIgnoreRobotsDotTextIfRootDisallowedEnabled="true" robotsDotTextUserAgentString="zzzz" maxRobotsDotTextCrawlDelayInSeconds="5" minCrawlDelayPerDomainMilliSeconds="55" />
<authorization isAlwaysLogin="true" loginUser="test" loginPassword="dummyPass" useDefaultCredentials="false" />
<crawlBehavior
maxConcurrentThreads="11"
maxPagesToCrawl="33"
maxPagesToCrawlPerDomain="333"
maxPageSizeInBytes="4444"
userAgentString="aaaa"
httpProtocolVersion="1.0"
crawlTimeoutSeconds="44"
downloadableContentTypes="bbbb"
isUriRecrawlingEnabled="true"
isExternalPageCrawlingEnabled="true"
isExternalPageLinksCrawlingEnabled="true"
httpServicePointConnectionLimit="21"
httpRequestTimeoutInSeconds="22"
httpRequestMaxAutoRedirects="23"
isHttpRequestAutoRedirectsEnabled="true"
isHttpRequestAutomaticDecompressionEnabled="true"
isSendingCookiesEnabled="true"
isSslCertificateValidationEnabled="false"
isRespectUrlNamedAnchorOrHashbangEnabled="true"
minAvailableMemoryRequiredInMb="25"
maxMemoryUsageInMb="26"
maxMemoryUsageCacheTimeInSeconds="27"
maxCrawlDepth="28"
maxLinksPerPage="29"
isForcedLinkParsingEnabled="true"
maxRetryCount="4"
minRetryDelayInMilliseconds="4444" />
<politeness
isRespectRobotsDotTextEnabled="true"
isRespectMetaRobotsNoFollowEnabled="true"
isRespectHttpXRobotsTagHeaderNoFollowEnabled="true"
isRespectAnchorRelNoFollowEnabled="true"
isIgnoreRobotsDotTextIfRootDisallowedEnabled="true"
robotsDotTextUserAgentString="zzzz"
maxRobotsDotTextCrawlDelayInSeconds="5"
minCrawlDelayPerDomainMilliSeconds="55" />
<authorization
isAlwaysLogin="true"
loginUser="test"
loginPassword="dummyPass"
useDefaultCredentials="false" />
<extensionValues>
<add key="key1" value="value1" />
<add key="key2" value="value2" />
Expand Down
2 changes: 2 additions & 0 deletions Abot.Tests.Unit/Core/AbotConfigurationSectionHandlerTest.cs
Expand Up @@ -21,6 +21,7 @@ public void GetSetion_FillsConfigValuesFromAppConfigFile()
Assert.AreEqual(333, _uut.CrawlBehavior.MaxPagesToCrawlPerDomain);
Assert.AreEqual(4444, _uut.CrawlBehavior.MaxPageSizeInBytes);
Assert.AreEqual("aaaa", _uut.CrawlBehavior.UserAgentString);
Assert.AreEqual("1.0", _uut.CrawlBehavior.HttpProtocolVersion);
Assert.AreEqual(true, _uut.CrawlBehavior.IsExternalPageCrawlingEnabled);
Assert.AreEqual(true, _uut.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
Assert.AreEqual(21, _uut.CrawlBehavior.HttpServicePointConnectionLimit);
Expand Down Expand Up @@ -71,6 +72,7 @@ public void Convert_CovertsFromSectionObjectToDtoObject()
Assert.AreEqual(result.MaxPagesToCrawlPerDomain, _uut.CrawlBehavior.MaxPagesToCrawlPerDomain);
Assert.AreEqual(result.MaxPageSizeInBytes, _uut.CrawlBehavior.MaxPageSizeInBytes);
Assert.AreEqual(result.UserAgentString, _uut.CrawlBehavior.UserAgentString);
Assert.AreEqual(result.HttpProtocolVersion, HttpProtocolVersion.Version10);
Assert.AreEqual(result.IsExternalPageCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageCrawlingEnabled);
Assert.AreEqual(result.IsExternalPageLinksCrawlingEnabled, _uut.CrawlBehavior.IsExternalPageLinksCrawlingEnabled);
Assert.AreEqual(result.HttpServicePointConnectionLimit, _uut.CrawlBehavior.HttpServicePointConnectionLimit);
Expand Down
1 change: 1 addition & 0 deletions Abot/Abot.csproj
Expand Up @@ -96,6 +96,7 @@
<Compile Include="Core\CompactCrawledUrlRepository.cs" />
<Compile Include="Core\AngleSharpHyperLinkParser.cs" />
<Compile Include="Crawler\RobotsDotTextParseCompletedArgs.cs" />
<Compile Include="Poco\Enums.cs" />
<Compile Include="Poco\HttpWebResponseWrapper.cs" />
<Compile Include="Util\BloomFilter.cs" />
<Compile Include="Util\CachedMemoryMonitor.cs" />
Expand Down
21 changes: 21 additions & 0 deletions Abot/Core/AbotConfigurationSectionHandler.cs
Expand Up @@ -58,6 +58,7 @@ private void Map(CrawlBehaviorElement src, CrawlConfiguration dest)
dest.MaxPagesToCrawlPerDomain = src.MaxPagesToCrawlPerDomain;
dest.MaxPageSizeInBytes = src.MaxPageSizeInBytes;
dest.UserAgentString = src.UserAgentString;
dest.HttpProtocolVersion = GetHttpProtocolVersion(src);
dest.CrawlTimeoutSeconds = src.CrawlTimeoutSeconds;
dest.IsUriRecrawlingEnabled = src.IsUriRecrawlingEnabled;
dest.IsExternalPageCrawlingEnabled = src.IsExternalPageCrawlingEnabled;
Expand Down Expand Up @@ -101,6 +102,20 @@ private void Map(AuthorizationElement src, CrawlConfiguration dest)
dest.UseDefaultCredentials = src.UseDefaultCredentials;
}

/// <summary>
/// Translates the textual httpProtocolVersion setting carried by the
/// crawl behavior config element into its enum counterpart.
/// </summary>
/// <param name="src">Config element whose HttpProtocolVersion string is read.</param>
/// <returns>
/// Version10 for "1.0", Version11 for "1.1", and NotSpecified for any other value.
/// </returns>
private HttpProtocolVersion GetHttpProtocolVersion(CrawlBehaviorElement src)
{
    string configuredVersion = src.HttpProtocolVersion;

    if (configuredVersion == "1.0")
    {
        return HttpProtocolVersion.Version10;
    }

    if (configuredVersion == "1.1")
    {
        return HttpProtocolVersion.Version11;
    }

    // Anything else (including a missing value) is reported as not specified.
    return HttpProtocolVersion.NotSpecified;
}

public static AbotConfigurationSectionHandler LoadFromXml()
{
return ((AbotConfigurationSectionHandler)System.Configuration.ConfigurationManager.GetSection("abot"));
Expand Down Expand Up @@ -231,6 +246,12 @@ public string UserAgentString
get { return (string)this["userAgentString"]; }
}

/// <summary>
/// Optional "httpProtocolVersion" attribute, exposed as the raw string stored in the config element.
/// </summary>
[ConfigurationProperty("httpProtocolVersion", IsRequired = false)]
public string HttpProtocolVersion
{
    get
    {
        object rawValue = this["httpProtocolVersion"];
        return (string)rawValue;
    }
}

[ConfigurationProperty("crawlTimeoutSeconds", IsRequired = false)]
public int CrawlTimeoutSeconds
{
Expand Down
10 changes: 9 additions & 1 deletion Abot/Core/PageRequester.cs
Expand Up @@ -204,7 +204,7 @@ protected virtual HttpWebRequest BuildRequestObject(Uri uri)
request.AllowAutoRedirect = _config.IsHttpRequestAutoRedirectsEnabled;
request.UserAgent = _config.UserAgentString;
request.Accept = "*/*";
request.ProtocolVersion = HttpVersion.Version10; //https://github.com/sjdirect/abot/issues/187
request.ProtocolVersion = GetEquivalentHttpProtocolVersion();

if (_config.HttpRequestMaxAutoRedirects > 0)
request.MaximumAutomaticRedirections = _config.HttpRequestMaxAutoRedirects;
Expand Down Expand Up @@ -238,6 +238,14 @@ protected virtual HttpWebRequest BuildRequestObject(Uri uri)
return request;
}

/// <summary>
/// Picks the System.Net protocol version that matches the configured HttpProtocolVersion.
/// </summary>
/// <returns>
/// HttpVersion.Version10 when the configuration asks for Version10;
/// HttpVersion.Version11 for every other configured value.
/// </returns>
private Version GetEquivalentHttpProtocolVersion()
{
    bool useHttp10 = _config.HttpProtocolVersion == Abot.Poco.HttpProtocolVersion.Version10;

    return useHttp10 ? HttpVersion.Version10 : HttpVersion.Version11;
}

protected virtual void ProcessResponseObject(HttpWebResponse response)
{
if (response != null && _config.IsSendingCookiesEnabled)
Expand Down
5 changes: 5 additions & 0 deletions Abot/Poco/CrawlConfiguration.cs
Expand Up @@ -53,6 +53,11 @@ public CrawlConfiguration()
/// </summary>
public string UserAgentString { get; set; }

/// <summary>
/// The HTTP protocol version to use during HTTP requests, expressed as an
/// HttpProtocolVersion enum value (NotSpecified, Version10 or Version11).
/// In the config file the matching "httpProtocolVersion" strings are "1.0" and "1.1";
/// any other string is mapped to NotSpecified.
/// </summary>
public HttpProtocolVersion HttpProtocolVersion { get; set; }

/// <summary>
/// Maximum seconds before the crawl times out and stops.
/// If zero, this setting has no effect.
Expand Down
9 changes: 9 additions & 0 deletions Abot/Poco/Enums.cs
@@ -0,0 +1,9 @@
namespace Abot.Poco
{
    /// <summary>
    /// HTTP protocol versions that can be selected in the crawl configuration.
    /// </summary>
    public enum HttpProtocolVersion
    {
        /// <summary>No protocol version was chosen.</summary>
        NotSpecified = 0,

        /// <summary>HTTP version 1.0.</summary>
        Version10 = 1,

        /// <summary>HTTP version 1.1.</summary>
        Version11 = 2
    }
}

0 comments on commit 195b959

Please sign in to comment.