From 80b4ff7ffab64dc0c30d045b746e3009cb06907c Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Tue, 2 May 2017 23:11:36 +0200 Subject: [PATCH] Update htmlunit --- pom.xml | 44 +++++++++++++++++-- .../qwazr/crawler/web/WebCrawlerServer.java | 4 +- .../web/driver/AdditionalCapabilities.java | 7 +-- .../crawler/web/driver/BrowserDriver.java | 5 +-- .../web/driver/BrowserDriverBuilder.java | 32 +++++++++----- .../web/driver/HtmlUnitBrowserDriver.java | 9 +--- .../web/driver/HtmlUnitDriverWebClient.java | 24 +--------- .../crawler/web/test/WebCrawlerTest.java | 2 +- 8 files changed, 68 insertions(+), 59 deletions(-) diff --git a/pom.xml b/pom.xml index cf39b33..f2a9558 100644 --- a/pom.xml +++ b/pom.xml @@ -16,8 +16,9 @@ - 3.3.1 - 2.25 + 3.4.0 + 2.26 + 9.4.3.v20170317 @@ -26,6 +27,41 @@ git@github.com:qwazr/crawlers.git + + + + xml-apis + xml-apis + 1.4.01 + + + org.seleniumhq.selenium + selenium-support + ${selenium.version} + + + org.roaringbitmap + RoaringBitmap + 0.6.43 + + + commons-codec + commons-codec + 1.10 + + + org.eclipse.jetty + jetty-io + ${jetty.version} + + + org.eclipse.jetty + jetty-util + ${jetty.version} + + + + com.qwazr @@ -94,7 +130,7 @@ com.codeborne phantomjsdriver - 1.4.1 + 1.4.2 selenium-api @@ -131,7 +167,7 @@ org.apache.maven.plugins maven-shade-plugin - ${shade.version} + 3.0.0 package diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerServer.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerServer.java index f57402d..7fa43ae 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerServer.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerServer.java @@ -42,10 +42,10 @@ private WebCrawlerServer(final ServerConfiguration configuration) throws IOExcep .registerProtocolListener(builder) .registerWebService(builder); final LibraryManager libraryManager = - new LibraryManager(null, configuration.dataDirectory, configuration.getEtcFiles()).registerWebService( + new LibraryManager(configuration.dataDirectory, configuration.getEtcFiles()).registerWebService( builder); final ScriptManager scriptManager = new ScriptManager(executorService, clusterManager, libraryManager, - configuration.dataDirectory).registerWebService(builder); + configuration.dataDirectory).registerWebService(builder); final WebCrawlerManager webCrawlerManager = new WebCrawlerManager(clusterManager, scriptManager, executorService).registerWebService(builder); serviceBuilder = new WebCrawlerServiceBuilder(clusterManager, webCrawlerManager); diff --git a/src/main/java/com/qwazr/crawler/web/driver/AdditionalCapabilities.java b/src/main/java/com/qwazr/crawler/web/driver/AdditionalCapabilities.java index 465efdb..c09c086 100644 --- a/src/main/java/com/qwazr/crawler/web/driver/AdditionalCapabilities.java +++ b/src/main/java/com/qwazr/crawler/web/driver/AdditionalCapabilities.java @@ -52,17 +52,12 @@ interface SaveBinaryFile { void saveBinaryFile(File file) throws IOException; } - interface SetAttribute { - - void setAttribute(WebElement element, String name, String value); - } - interface WebRequest { void request(WebRequestDefinition request); } - interface All extends ResponseHeader, SafeText, InnerHtml, SaveBinaryFile, SetAttribute, WebRequest { + interface All extends ResponseHeader, SafeText, InnerHtml, SaveBinaryFile, WebRequest { } } diff --git a/src/main/java/com/qwazr/crawler/web/driver/BrowserDriver.java b/src/main/java/com/qwazr/crawler/web/driver/BrowserDriver.java index 9ed2493..7ece024 100644 --- a/src/main/java/com/qwazr/crawler/web/driver/BrowserDriver.java +++ b/src/main/java/com/qwazr/crawler/web/driver/BrowserDriver.java @@ -343,10 +343,7 @@ public List findElementsByCssSelector(String cssSelector) { } public void setAttribute(WebElement element, String name, String value) { - if (driver instanceof AdditionalCapabilities.SetAttribute) - ((AdditionalCapabilities.SetAttribute) driver).setAttribute(element, name, value); - else - this.executeScript("arguments[0].setAttribute(arguments[1], arguments[2])", false, element, name, value); + this.executeScript("arguments[0].setAttribute(arguments[1], arguments[2])", false, element, name, value); } @Override diff --git a/src/main/java/com/qwazr/crawler/web/driver/BrowserDriverBuilder.java b/src/main/java/com/qwazr/crawler/web/driver/BrowserDriverBuilder.java index 57c3b01..5824feb 100644 --- a/src/main/java/com/qwazr/crawler/web/driver/BrowserDriverBuilder.java +++ b/src/main/java/com/qwazr/crawler/web/driver/BrowserDriverBuilder.java @@ -21,6 +21,7 @@ import org.openqa.selenium.Proxy; import org.openqa.selenium.WebDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import org.openqa.selenium.remote.BrowserType; import org.openqa.selenium.remote.CapabilityType; import org.openqa.selenium.remote.DesiredCapabilities; @@ -102,8 +103,8 @@ public BrowserDriver build() throws ReflectiveOperationException, SecurityExcept // Setup the language if (crawlDefinition.browser_language != null) { capabilities = checkCapabilities(capabilities); - capabilities - .setCapability(AdditionalCapabilities.QWAZR_BROWSER_LANGUAGE, crawlDefinition.browser_language); + capabilities.setCapability(AdditionalCapabilities.QWAZR_BROWSER_LANGUAGE, + crawlDefinition.browser_language); if (browserType == BrowserDriverEnum.phantomjs) capabilities.setCapability( PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX + "Accept-Language", @@ -115,16 +116,16 @@ public BrowserDriver build() throws ReflectiveOperationException, SecurityExcept capabilities = checkCapabilities(capabilities); if (browserType == BrowserDriverEnum.phantomjs) capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_PAGE_SETTINGS_PREFIX + "loadImages", - crawlDefinition.download_images); + crawlDefinition.download_images); } // Web security if (crawlDefinition.web_security != null) { capabilities = checkCapabilities(capabilities); if (browserType == BrowserDriverEnum.phantomjs) - capabilities - .setCapability(PhantomJSDriverService.PHANTOMJS_PAGE_SETTINGS_PREFIX + "webSecurityEnabled", - crawlDefinition.web_security); + capabilities.setCapability( + PhantomJSDriverService.PHANTOMJS_PAGE_SETTINGS_PREFIX + "webSecurityEnabled", + crawlDefinition.web_security); } // Choose a browser name @@ -143,9 +144,9 @@ public BrowserDriver build() throws ReflectiveOperationException, SecurityExcept capabilities = checkCapabilities(capabilities); capabilities.setJavascriptEnabled(crawlDefinition.javascript_enabled); if (browserType == BrowserDriverEnum.phantomjs) - capabilities - .setCapability(PhantomJSDriverService.PHANTOMJS_PAGE_SETTINGS_PREFIX + "javascriptEnabled", - crawlDefinition.javascript_enabled); + capabilities.setCapability( + PhantomJSDriverService.PHANTOMJS_PAGE_SETTINGS_PREFIX + "javascriptEnabled", + crawlDefinition.javascript_enabled); } @@ -154,15 +155,22 @@ public BrowserDriver build() throws ReflectiveOperationException, SecurityExcept if (browserType == BrowserDriverEnum.phantomjs) { capabilities = checkCapabilities(capabilities); - capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, - new String[] { "--webdriver-loglevel=none", "--ignore-ssl-errors=true", "--ssl-protocol=any" }); + capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new String[] { + "--webdriver-loglevel=none", + "--ignore-ssl-errors=true", + "--ssl-protocol=any" }); + } + + if (browserType == BrowserDriverEnum.html_unit) { + capabilities = checkCapabilities(capabilities); + capabilities.setBrowserName(BrowserType.HTMLUNIT); } final WebDriver driver = browserType.getNewInstance(capabilities); try { final BrowserDriver browserDriver = new BrowserDriver(browserType, driver, proxyDef); browserDriver.setTimeouts(crawlDefinition.implicitly_wait, crawlDefinition.page_load_timeout, - crawlDefinition.script_timeout); + crawlDefinition.script_timeout); if (crawlDefinition.cookies != null) for (Map.Entry cookie : crawlDefinition.cookies.entrySet()) diff --git a/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitBrowserDriver.java b/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitBrowserDriver.java index 480b97b..1442114 100644 --- a/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitBrowserDriver.java +++ b/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitBrowserDriver.java @@ -37,14 +37,13 @@ public class HtmlUnitBrowserDriver extends HtmlUnitDriverWebClient implements AdditionalCapabilities.ResponseHeader, AdditionalCapabilities.SafeText, - AdditionalCapabilities.InnerHtml, AdditionalCapabilities.SaveBinaryFile, AdditionalCapabilities.SetAttribute, - AdditionalCapabilities.WebRequest { + AdditionalCapabilities.InnerHtml, AdditionalCapabilities.SaveBinaryFile, AdditionalCapabilities.WebRequest { protected static final Logger logger = LoggerFactory.getLogger(HtmlUnitBrowserDriver.class); public HtmlUnitBrowserDriver() { } - + public HtmlUnitBrowserDriver(Capabilities capabilities) { super(capabilities); } @@ -156,8 +155,4 @@ public String getTextSafe(WebElement webElement) { } } - @Override - public void setAttribute(WebElement element, String name, String value) { - ((HtmlUnitDriverWebElement) element).getDomElement().setAttribute(name, value); - } } diff --git a/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitDriverWebClient.java b/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitDriverWebClient.java index 14e7c20..38e841b 100644 --- a/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitDriverWebClient.java +++ b/src/main/java/com/qwazr/crawler/web/driver/HtmlUnitDriverWebClient.java @@ -24,15 +24,12 @@ import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.WebWindow; -import com.gargoylesoftware.htmlunit.html.DomElement; import com.gargoylesoftware.htmlunit.util.NameValuePair; import com.qwazr.crawler.web.WebRequestDefinition; import org.openqa.selenium.Capabilities; import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriverException; -import org.openqa.selenium.WebElement; import org.openqa.selenium.htmlunit.HtmlUnitDriver; -import org.openqa.selenium.htmlunit.HtmlUnitWebElement; import java.net.ConnectException; import java.net.MalformedURLException; @@ -79,25 +76,6 @@ public WebClient getWebClient() { return super.getWebClient(); } - WebElement convertNode(DomElement element) { - return newHtmlUnitWebElement(element); - } - - protected WebElement newHtmlUnitWebElement(DomElement element) { - return new HtmlUnitDriverWebElement(this, element); - } - - public class HtmlUnitDriverWebElement extends HtmlUnitWebElement { - - public HtmlUnitDriverWebElement(HtmlUnitDriver parent, DomElement element) { - super(parent, element); - } - - DomElement getDomElement() { - return super.getElement(); - } - } - public void request(final WebRequestDefinition webRequestDef) { final WebRequest request; try { @@ -111,7 +89,7 @@ public void request(final WebRequestDefinition webRequestDef) { window.setEnclosedPage(page); } catch (UnknownHostException e) { window.setEnclosedPage(new UnexpectedPage(new StringWebResponse("Unknown host", request.getUrl()), - getCurrentWindow().getTopWindow())); + getCurrentWindow().getTopWindow())); } catch (ConnectException e) { // This might be expected } catch (SocketTimeoutException e) { diff --git a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java index 8100637..89d1171 100644 --- a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java +++ b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java @@ -101,7 +101,7 @@ public void test400CrawlEvent() throws InterruptedException { final WebCrawlDefinition webCrawl = getNewWebCrawl(); webCrawl.scripts = new HashMap<>(); webCrawl.scripts.put(WebCrawlDefinition.EventEnum.before_crawl, - new WebCrawlDefinition.Script(BeforeCrawl.class.getName())); + new WebCrawlDefinition.Script(BeforeCrawl.class.getName())); remote.runSession(sessionName, webCrawl); crawlWait(sessionName, 3); Assert.assertEquals(4, BeforeCrawl.count.get());