From 69f7330c7ba3911aa3c8023f9244e29a8cf580d1 Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Fri, 9 Jun 2017 01:20:22 +0200 Subject: [PATCH] Introduce FileCrawler --- .../qwazr/crawler/common/CrawlDefinition.java | 131 ++++++++++++++++++ .../qwazr/crawler/common/CrawlManager.java | 87 ++++++++++++ .../CrawlSession.java} | 4 +- .../CrawlSessionImpl.java} | 44 +++--- .../com/qwazr/crawler/common/CrawlStatus.java | 78 +++++++++++ .../com/qwazr/crawler/common/CrawlThread.java | 55 ++++++++ .../crawler/file/FileCrawlDefinition.java | 114 +++++++++++++++ .../qwazr/crawler/file/FileCrawlThread.java | 41 ++++++ .../crawler/file/FileCrawlerManager.java | 39 ++++++ .../qwazr/crawler/web/WebCrawlDefinition.java | 104 ++------------ .../com/qwazr/crawler/web/WebCrawlStatus.java | 77 ---------- .../com/qwazr/crawler/web/WebCrawlThread.java | 46 ++---- .../qwazr/crawler/web/WebCrawlerManager.java | 73 +--------- .../crawler/web/WebCrawlerMultiClient.java | 23 +-- .../crawler/web/WebCrawlerServiceImpl.java | 13 +- .../web/WebCrawlerServiceInterface.java | 13 +- .../crawler/web/WebCrawlerSingleClient.java | 21 +-- .../crawler/web/test/WebCrawlerTest.java | 10 +- 18 files changed, 639 insertions(+), 334 deletions(-) create mode 100644 src/main/java/com/qwazr/crawler/common/CrawlDefinition.java create mode 100644 src/main/java/com/qwazr/crawler/common/CrawlManager.java rename src/main/java/com/qwazr/crawler/{web/CurrentSession.java => common/CrawlSession.java} (97%) rename src/main/java/com/qwazr/crawler/{web/CurrentSessionImpl.java => common/CrawlSessionImpl.java} (73%) create mode 100644 src/main/java/com/qwazr/crawler/common/CrawlStatus.java create mode 100644 src/main/java/com/qwazr/crawler/common/CrawlThread.java create mode 100644 src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java create mode 100644 src/main/java/com/qwazr/crawler/file/FileCrawlThread.java create mode 100644 src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java delete mode 100644 
src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java diff --git a/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java b/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java new file mode 100644 index 0000000..ca77614 --- /dev/null +++ b/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java @@ -0,0 +1,131 @@ +/** + * Copyright 2015-2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package com.qwazr.crawler.common; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; + +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +@JsonInclude(Include.NON_EMPTY) +public class CrawlDefinition implements Cloneable { + + /** + * The global variables shared by all the scripts. + */ + public LinkedHashMap variables = null; + + /** + * A list of scripts paths mapped with the events which fire the scripts. + */ + public Map scripts = null; + + public enum EventEnum { + + /** + * Executed before the crawl session start + */ + before_session, + + /** + * Executed after the crawl session ends + */ + after_session, + + /** + * Executed before an URL is crawled + */ + before_crawl, + + /** + * Executed after an URL has been crawled + */ + after_crawl + } + + @JsonInclude(Include.NON_EMPTY) + public static class Script implements Cloneable { + + /** + * The path to the scripts + */ + public String name = null; + + /** + * The local variables passed to the scripts + */ + public Map variables = null; + + public Script() { + } + + public Script(String name) { + this.name = name; + } + + protected Script(Script src) { + this.name = src.name; + this.variables = src.variables == null ? 
null : new HashMap(src.variables); + } + + @Override + final public Object clone() { + return new Script(this); + } + + public Script addVariable(String name, String value) { + if (variables == null) + variables = new HashMap<>(); + variables.put(name, value); + return this; + } + + } + + public CrawlDefinition() { + } + + protected CrawlDefinition(CrawlDefinition src) { + variables = src.variables == null ? null : new LinkedHashMap<>(src.variables); + if (src.scripts == null) { + scripts = null; + } else { + scripts = new HashMap<>(); + src.scripts.forEach((eventEnum, script) -> scripts.put(eventEnum, new Script(script))); + } + } + + public Object clone() { + return new CrawlDefinition(this); + } + + @JsonIgnore + public Script addScript(final String event, final String name) { + if (scripts == null) + scripts = new LinkedHashMap<>(); + Script script = new Script(name); + scripts.put(EventEnum.valueOf(event), script); + return script; + } + + public Map getScripts() { + return scripts; + } + +} diff --git a/src/main/java/com/qwazr/crawler/common/CrawlManager.java b/src/main/java/com/qwazr/crawler/common/CrawlManager.java new file mode 100644 index 0000000..17aad62 --- /dev/null +++ b/src/main/java/com/qwazr/crawler/common/CrawlManager.java @@ -0,0 +1,87 @@ +/* + * Copyright 2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.qwazr.crawler.common; + +import com.qwazr.cluster.ClusterManager; +import com.qwazr.server.ServerException; +import org.slf4j.Logger; + +import javax.ws.rs.core.Response; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; + +public abstract class CrawlManager, D extends CrawlDefinition> { + + private final ConcurrentHashMap crawlSessionMap; + private final ExecutorService executorService; + private final Logger logger; + public final String myAddress; + public final ClusterManager clusterManager; + + protected CrawlManager(final ClusterManager clusterManager, final ExecutorService executorService, + final Logger logger) { + crawlSessionMap = new ConcurrentHashMap<>(); + this.clusterManager = clusterManager; + this.myAddress = clusterManager.getServiceBuilder().local().getStatus().me; + this.executorService = executorService; + this.logger = logger; + } + + public TreeMap getSessions() { + final TreeMap map = new TreeMap<>(); + crawlSessionMap.forEach((key, crawl) -> map.put(key, crawl.getStatus())); + return map; + } + + public CrawlStatus getSession(final String sessionName) { + final T crawlThread = crawlSessionMap.get(sessionName); + return crawlThread == null ? 
null : crawlThread.getStatus(); + } + + public void abortSession(final String sessionName, final String abortingReason) throws ServerException { + final T crawlThread = crawlSessionMap.get(sessionName); + if (crawlThread == null) + throw new ServerException(Response.Status.NOT_FOUND, "Session not found: " + sessionName); + logger.info("Aborting session: {} - {}", sessionName, abortingReason); + crawlThread.abort(abortingReason); + } + + protected abstract T newCrawlThread(final String sessionName, final D crawlDefinition); + + public CrawlStatus runSession(final String sessionName, final D crawlDefinition) throws ServerException { + + final AtomicBoolean newThread = new AtomicBoolean(false); + + final T crawlThread = crawlSessionMap.computeIfAbsent(sessionName, key -> { + logger.info("Create session: {}", sessionName); + newThread.set(true); + return newCrawlThread(sessionName, crawlDefinition); + }); + + if (!newThread.get()) + throw new ServerException(Response.Status.CONFLICT, "The session already exists: " + sessionName); + executorService.execute(crawlThread); + return crawlThread.getStatus(); + } + + public void removeSession(final T crawlThread) { + logger.info("Remove session: {}", crawlThread.getSessionName()); + crawlSessionMap.remove(crawlThread.getSessionName(), crawlThread); + } + +} diff --git a/src/main/java/com/qwazr/crawler/web/CurrentSession.java b/src/main/java/com/qwazr/crawler/common/CrawlSession.java similarity index 97% rename from src/main/java/com/qwazr/crawler/web/CurrentSession.java rename to src/main/java/com/qwazr/crawler/common/CrawlSession.java index 1cb3105..5cec3d3 100644 --- a/src/main/java/com/qwazr/crawler/web/CurrentSession.java +++ b/src/main/java/com/qwazr/crawler/common/CrawlSession.java @@ -13,13 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
**/ -package com.qwazr.crawler.web; +package com.qwazr.crawler.common; import com.qwazr.utils.TimeTracker; import java.util.Map; -public interface CurrentSession { +public interface CrawlSession { Map getVariables(); diff --git a/src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java b/src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java similarity index 73% rename from src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java rename to src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java index 6530e2d..d2a4b3c 100644 --- a/src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java +++ b/src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java @@ -1,4 +1,4 @@ -/** +/* * Copyright 2015-2017 Emmanuel Keller / QWAZR *

* Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. **/ -package com.qwazr.crawler.web; +package com.qwazr.crawler.common; import com.qwazr.utils.TimeTracker; -import org.apache.commons.collections4.CollectionUtils; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; -class CurrentSessionImpl implements CurrentSession { +public class CrawlSessionImpl implements CrawlSession { - private final WebCrawlDefinition crawlDefinition; + private final T crawlDefinition; private final String name; private final AtomicBoolean abort; private final TimeTracker timeTracker; @@ -37,19 +36,20 @@ class CurrentSessionImpl implements CurrentSession { private volatile Integer currentDepth = null; private volatile String abortingReason = null; - CurrentSessionImpl(WebCrawlDefinition crawlDefinition, String name, TimeTracker timeTracker) { + public CrawlSessionImpl(T crawlDefinition, String name) { this.crawlDefinition = crawlDefinition; - this.timeTracker = timeTracker; + this.timeTracker = new TimeTracker(); this.name = name; abort = new AtomicBoolean(false); this.variables = new ConcurrentHashMap<>(); - if (crawlDefinition.variables != null) - for (Map.Entry entry : crawlDefinition.variables.entrySet()) - if (entry.getKey() != null && entry.getValue() != null) - this.variables.put(entry.getKey(), entry.getValue()); + if (crawlDefinition.variables != null) { + crawlDefinition.variables.forEach((key, value) -> { + if (key != null && value != null) + this.variables.put(key, value); + }); + } } - @Override public Map getVariables() { return variables; } @@ -59,7 +59,10 @@ public Object getVariable(String name) { return variables.get(name); } - @Override + /** + * @param name the name of the variable + * @return the value of the variable + */ public Object setVariable(String name, Object 
value) { if (value == null) return removeVariable(name); @@ -104,7 +107,7 @@ public String getAbortingReason() { return abortingReason; } - synchronized int incIgnoredCount() { + public synchronized int incIgnoredCount() { return ++ignoredCount; } @@ -113,7 +116,7 @@ public Integer getIgnoredCount() { return ignoredCount; } - synchronized int incCrawledCount() { + public synchronized int incCrawledCount() { return ++crawledCount; } @@ -122,7 +125,7 @@ public Integer getCrawledCount() { return crawledCount; } - synchronized int incErrorCount() { + public synchronized int incErrorCount() { return ++errorCount; } @@ -146,20 +149,15 @@ public Integer getCurrentDepth() { return currentDepth; } - synchronized void setCurrentURI(String currentURI, Integer currentDepth) { + public synchronized void setCurrentURI(String currentURI, Integer currentDepth) { this.currentURI = currentURI; this.currentDepth = currentDepth; } - public WebCrawlDefinition getCrawlDefinition() { + public T getCrawlDefinition() { return crawlDefinition; } - public boolean isURLPatterns() { - return crawlDefinition != null && !CollectionUtils.isEmpty(crawlDefinition.inclusion_patterns) - && !CollectionUtils.isEmpty(crawlDefinition.exclusion_patterns); - } - public TimeTracker getTimeTracker() { return timeTracker; } diff --git a/src/main/java/com/qwazr/crawler/common/CrawlStatus.java b/src/main/java/com/qwazr/crawler/common/CrawlStatus.java new file mode 100644 index 0000000..70a6506 --- /dev/null +++ b/src/main/java/com/qwazr/crawler/common/CrawlStatus.java @@ -0,0 +1,78 @@ +/* + * Copyright 2015-2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package com.qwazr.crawler.common; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.qwazr.utils.TimeTracker; + +@JsonInclude(Include.NON_NULL) +public class CrawlStatus { + + final public String entry_url; + final public String node_address; + final public Boolean aborting; + final public String aborting_reason; + final public TimeTracker.Status timer; + final public UrlStatus urls; + + @JsonCreator + public CrawlStatus(@JsonProperty("node_address") String nodeAddress, @JsonProperty("aborting") Boolean aborting, + @JsonProperty("aborting_reason") String abortingReason, @JsonProperty("entry_url") String entryUrl, + @JsonProperty("timer") TimeTracker.Status timer, @JsonProperty("urls") UrlStatus urlStatus) { + this.node_address = nodeAddress; + this.timer = timer; + this.aborting = aborting; + this.aborting_reason = abortingReason; + this.entry_url = entryUrl; + this.urls = urlStatus; + } + + public CrawlStatus(final String nodeAddress, final String entryUrl, final CrawlSession session) { + this(nodeAddress, session.isAborting(), session.getAbortingReason(), entryUrl, + session.getTimeTracker() == null ? 
null : session.getTimeTracker().getStatus(), new UrlStatus(session)); + + } + + @JsonInclude(Include.NON_EMPTY) + public static class UrlStatus { + + final public int crawled; + final public int ignored; + final public int error; + final public String current_uri; + final public Integer current_depth; + + @JsonCreator + UrlStatus(@JsonProperty("crawled") Integer crawled, @JsonProperty("ignored") Integer ignored, + @JsonProperty("error") Integer error, @JsonProperty("current_uri") String currentUri, + @JsonProperty("current_depth") Integer currentDepth) { + this.crawled = crawled == null ? 0 : crawled; + this.ignored = ignored == null ? 0 : ignored; + this.error = error == null ? 0 : error; + this.current_uri = currentUri; + this.current_depth = currentDepth; + } + + private UrlStatus(CrawlSession session) { + this(session.getCrawledCount(), session.getIgnoredCount(), session.getErrorCount(), session.getCurrentURI(), + session.getCurrentDepth()); + } + } + +} diff --git a/src/main/java/com/qwazr/crawler/common/CrawlThread.java b/src/main/java/com/qwazr/crawler/common/CrawlThread.java new file mode 100644 index 0000000..7fff6a9 --- /dev/null +++ b/src/main/java/com/qwazr/crawler/common/CrawlThread.java @@ -0,0 +1,55 @@ +/* + * Copyright 2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.qwazr.crawler.common; + +import org.slf4j.Logger; + +public abstract class CrawlThread implements Runnable { + + protected final M manager; + protected final CrawlSessionImpl session; + protected final Logger logger; + + protected CrawlThread(final M manager, final CrawlSessionImpl session, final Logger logger) { + this.manager = manager; + this.session = session; + this.logger = logger; + } + + public String getSessionName() { + return session.getName(); + } + + protected abstract void runner() throws Exception; + + @Override + final public void run() { + try { + runner(); + } catch (Exception e) { + logger.error(e.getMessage(), e); + } finally { + manager.removeSession(this); + } + } + + public abstract CrawlStatus getStatus(); + + public void abort(final String abortingReason) { + session.abort(abortingReason); + + } +} diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java b/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java new file mode 100644 index 0000000..a82763f --- /dev/null +++ b/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java @@ -0,0 +1,114 @@ +/* + * Copyright 2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package com.qwazr.crawler.file; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; +import com.qwazr.crawler.common.CrawlDefinition; +import com.qwazr.utils.StringUtils; +import com.qwazr.utils.json.JsonMapper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +@JsonInclude(Include.NON_EMPTY) +public class FileCrawlDefinition extends CrawlDefinition { + + /** + * The entry point PATH of the crawl. + */ + public String entry_path = null; + + /** + * A list of regular expression patterns. An URL may not be crawled if it + * matches any pattern. + */ + public List exclusion_patterns = null; + + /** + * Time to wait on successful crawl + */ + public Integer crawl_wait_ms = null; + + public FileCrawlDefinition() { + } + + protected FileCrawlDefinition(FileCrawlDefinition src) { + super(src); + entry_path = src.entry_path; + exclusion_patterns = src.exclusion_patterns == null ? 
null : new ArrayList<>(src.exclusion_patterns); + crawl_wait_ms = src.crawl_wait_ms; + } + + public Object clone() { + return new FileCrawlDefinition(this); + } + + @JsonIgnore + public FileCrawlDefinition setEntryPath(final String entryPath) { + this.entry_path = entryPath; + return this; + } + + @JsonIgnore + public String getEntryPath() { + return this.entry_path; + } + + @JsonIgnore + public FileCrawlDefinition setExclusionPattern(final String exclusionPatternText) throws IOException { + if (exclusionPatternText == null) { + exclusion_patterns = null; + return this; + } + exclusion_patterns = new ArrayList<>(); + StringUtils.linesCollector(exclusionPatternText, false, exclusion_patterns); + return this; + } + + @JsonIgnore + public FileCrawlDefinition addExclusionPattern(final String exclusionPattern) { + if (exclusion_patterns == null) + exclusion_patterns = new ArrayList<>(); + exclusion_patterns.add(exclusionPattern); + return this; + } + + @JsonIgnore + public Collection getExclusionPatterns() { + return exclusion_patterns; + } + + @JsonIgnore + public FileCrawlDefinition setCrawlWaitMs(Integer crawlWaitMs) { + this.crawl_wait_ms = crawlWaitMs; + return this; + } + + @JsonIgnore + public Integer getCrawlWaitMs() { + return crawl_wait_ms; + } + + @JsonIgnore + public static FileCrawlDefinition newInstance(final String json) throws IOException { + return JsonMapper.MAPPER.readValue(json, FileCrawlDefinition.class); + } + +} diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java b/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java new file mode 100644 index 0000000..a6f6a8d --- /dev/null +++ b/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java @@ -0,0 +1,41 @@ +/* + * Copyright 2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package com.qwazr.crawler.file; + +import com.qwazr.crawler.common.CrawlSessionImpl; +import com.qwazr.crawler.common.CrawlStatus; +import com.qwazr.crawler.common.CrawlThread; +import org.slf4j.Logger; + +public class FileCrawlThread extends CrawlThread { + + private final FileCrawlDefinition crawlDefinition; + + public FileCrawlThread(FileCrawlerManager manager, CrawlSessionImpl session, Logger logger) { + super(manager, session, logger); + crawlDefinition = session.getCrawlDefinition(); + } + + @Override + protected void runner() throws Exception { + + } + + @Override + public CrawlStatus getStatus() { + return new CrawlStatus(manager.myAddress, crawlDefinition.entry_path, session); + } +} diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java b/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java new file mode 100644 index 0000000..098500d --- /dev/null +++ b/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java @@ -0,0 +1,39 @@ +/** + * Copyright 2017 Emmanuel Keller / QWAZR + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ +package com.qwazr.crawler.file; + +import com.qwazr.cluster.ClusterManager; +import com.qwazr.crawler.common.CrawlManager; +import com.qwazr.crawler.common.CrawlSessionImpl; +import com.qwazr.crawler.web.WebCrawlerManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.concurrent.ExecutorService; + +public class FileCrawlerManager extends CrawlManager { + + /* Passed to every FileCrawlThread created by this manager. */ private static final Logger LOGGER = LoggerFactory.getLogger(FileCrawlerManager.class); + + protected FileCrawlerManager(ClusterManager clusterManager, ExecutorService executorService, Logger logger) { + super(clusterManager, executorService, logger); + } + + @Override + protected FileCrawlThread newCrawlThread(String sessionName, FileCrawlDefinition crawlDefinition) { + return new FileCrawlThread(this, new CrawlSessionImpl<>(crawlDefinition, sessionName), LOGGER); + } +} diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java b/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java index 0042183..d304c38 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java @@ -1,5 +1,5 @@ /** - * Copyright 2014-2016 Emmanuel Keller / QWAZR + * Copyright 2015-2017 Emmanuel Keller / QWAZR *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.annotation.JsonProperty; +import com.qwazr.crawler.common.CrawlDefinition; import com.qwazr.crawler.web.driver.BrowserDriverEnum; import com.qwazr.utils.StringUtils; import com.qwazr.utils.json.JsonMapper; @@ -26,10 +27,14 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; @JsonInclude(Include.NON_EMPTY) -public class WebCrawlDefinition implements Cloneable { +public class WebCrawlDefinition extends CrawlDefinition { /** * URL called before a crawl session starts @@ -238,82 +243,11 @@ public ProxyDefinition setHttpProxy(String http_proxy) { public Integer page_load_timeout = null; - /** - * The global variables shared by all the scripts. - */ - public LinkedHashMap variables = null; - - /** - * A list of scripts paths mapped with the events which fire the scripts. 
- */ - public Map scripts = null; - - public enum EventEnum { - - /** - * Executed before the crawl session start - */ - before_session, - - /** - * Executed after the crawl session ends - */ - after_session, - - /** - * Executed before an URL is crawled - */ - before_crawl, - - /** - * Executed after an URL has been crawled - */ - after_crawl - } - - @JsonInclude(Include.NON_EMPTY) - public static class Script implements Cloneable { - - /** - * The path to the scripts - */ - public String name = null; - - /** - * The local variables passed to the scripts - */ - public Map variables = null; - - public Script() { - } - - public Script(String name) { - this.name = name; - } - - protected Script(Script src) { - this.name = src.name; - this.variables = src.variables == null ? null : new HashMap(src.variables); - } - - @Override - final public Object clone() { - return new Script(this); - } - - public Script addVariable(String name, String value) { - if (variables == null) - variables = new HashMap<>(); - variables.put(name, value); - return this; - } - - } - public WebCrawlDefinition() { } protected WebCrawlDefinition(WebCrawlDefinition src) { + super(src); pre_url = src.pre_url; entry_url = src.entry_url; entry_request = src.entry_request; @@ -340,13 +274,6 @@ protected WebCrawlDefinition(WebCrawlDefinition src) { script_timeout = src.script_timeout; page_load_timeout = src.page_load_timeout; crawl_wait_ms = src.crawl_wait_ms; - variables = src.variables == null ? 
null : new LinkedHashMap<>(src.variables); - if (src.scripts == null) { - scripts = null; - } else { - scripts = new HashMap<>(); - src.scripts.forEach((eventEnum, script) -> scripts.put(eventEnum, new Script(script))); - } } public Object clone() { @@ -673,19 +600,6 @@ public WebCrawlDefinition setCookies(final Map cookies) { return this; } - @JsonIgnore - public Script addScript(final String event, final String name) { - if (scripts == null) - scripts = new LinkedHashMap<>(); - Script script = new Script(name); - scripts.put(EventEnum.valueOf(event), script); - return script; - } - - public Map getScripts() { - return scripts; - } - @JsonIgnore public String urlEncode(final String value) throws UnsupportedEncodingException { return URLEncoder.encode(value, "UTF-8"); diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java b/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java deleted file mode 100644 index 28c425c..0000000 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright 2014-2016 Emmanuel Keller / QWAZR - *

- * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - **/ -package com.qwazr.crawler.web; - -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.annotation.JsonInclude.Include; -import com.qwazr.utils.TimeTracker; -import com.qwazr.crawler.web.CurrentSession; - -@JsonInclude(Include.NON_NULL) -public class WebCrawlStatus { - - final public String node_address; - final public String entry_url; - final public Boolean aborting; - final public String aborting_reason; - final public UrlStatus urls; - final public TimeTracker.Status timer; - - public WebCrawlStatus() { - node_address = null; - timer = null; - entry_url = null; - aborting = null; - aborting_reason = null; - urls = null; - } - - public WebCrawlStatus(String node_address, String entry_url, CurrentSession session) { - this.node_address = node_address; - this.entry_url = entry_url; - TimeTracker tt = session.getTimeTracker(); - this.timer = tt == null ? 
null : tt.getStatus(); - this.aborting = session.isAborting(); - this.aborting_reason = session.getAbortingReason(); - this.urls = new UrlStatus(session); - } - - @JsonInclude(Include.NON_EMPTY) - public static class UrlStatus { - - final public int crawled; - final public int ignored; - final public int error; - final public String current_uri; - final public Integer current_depth; - - public UrlStatus() { - crawled = 0; - ignored = 0; - error = 0; - current_uri = null; - current_depth = null; - } - - private UrlStatus(CurrentSession session) { - this.crawled = session.getCrawledCount(); - this.ignored = session.getIgnoredCount(); - this.error = session.getErrorCount(); - this.current_uri = session.getCurrentURI(); - this.current_depth = session.getCurrentDepth(); - } - } -} diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java b/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java index 6b5d0e1..82876d6 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java @@ -1,5 +1,5 @@ /** - * Copyright 2014-2016 Emmanuel Keller / QWAZR + * Copyright 2015-2017 Emmanuel Keller / QWAZR *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,13 @@ package com.qwazr.crawler.web; import com.google.common.net.InternetDomainName; +import com.qwazr.crawler.common.CrawlDefinition.EventEnum; +import com.qwazr.crawler.common.CrawlDefinition.Script; +import com.qwazr.crawler.common.CrawlSessionImpl; +import com.qwazr.crawler.common.CrawlStatus; +import com.qwazr.crawler.common.CrawlThread; import com.qwazr.crawler.web.driver.BrowserDriver; import com.qwazr.crawler.web.driver.BrowserDriverBuilder; -import com.qwazr.crawler.web.WebCrawlDefinition.EventEnum; -import com.qwazr.crawler.web.WebCrawlDefinition.Script; import com.qwazr.scripts.ScriptRunThread; import com.qwazr.server.ServerException; import com.qwazr.utils.RegExpUtils; @@ -55,12 +58,10 @@ import java.util.regex.Matcher; import java.util.regex.PatternSyntaxException; -public class WebCrawlThread implements Runnable { +public class WebCrawlThread extends CrawlThread { private static final Logger LOGGER = LoggerFactory.getLogger(WebCrawlThread.class); - private final WebCrawlerManager webCrawlerManager; - private final CurrentSessionImpl session; private final WebCrawlDefinition crawlDefinition; private final InternetDomainName internetDomainName; @@ -78,10 +79,9 @@ public class WebCrawlThread implements Runnable { WebCrawlThread(final WebCrawlerManager webCrawlerManager, final String sessionName, final WebCrawlDefinition crawlDefinition) throws ServerException { - timeTracker = new TimeTracker(); - this.webCrawlerManager = webCrawlerManager; - this.session = new CurrentSessionImpl(crawlDefinition, sessionName, timeTracker); + super(webCrawlerManager, new CrawlSessionImpl<>(crawlDefinition, sessionName), LOGGER); this.crawlDefinition = crawlDefinition; + this.timeTracker = session.getTimeTracker(); if (crawlDefinition.browser_type == null) throw new ServerException(Status.NOT_ACCEPTABLE, "The browser_type is 
missing"); if (crawlDefinition.entry_url == null && crawlDefinition.entry_request == null) @@ -115,16 +115,9 @@ public class WebCrawlThread implements Runnable { } } - String getSessionName() { - return session.getName(); - } - - WebCrawlStatus getStatus() { - return new WebCrawlStatus(webCrawlerManager.myAddress, crawlDefinition.entry_url, session); - } - - void abort(String reason) { - session.abort(reason); + @Override + public CrawlStatus getStatus() { + return new CrawlStatus(manager.myAddress, crawlDefinition.entry_url, session); } /** @@ -510,7 +503,7 @@ private boolean script(EventEnum event, CurrentURI currentURI) objects.put("driver", driver); if (currentURI != null) objects.put("current", currentURI); - final ScriptRunThread scriptRunThread = webCrawlerManager.scriptManager.runSync(script.name, objects); + final ScriptRunThread scriptRunThread = manager.scriptManager.runSync(script.name, objects); if (scriptRunThread.getException() != null) throw new ServerException(scriptRunThread.getException()); return true; @@ -519,7 +512,7 @@ private boolean script(EventEnum event, CurrentURI currentURI) } } - private void runner() + protected void runner() throws URISyntaxException, IOException, ScriptException, ServerException, ReflectiveOperationException, NoSuchAlgorithmException, KeyStoreException, KeyManagementException, InterruptedException { try { @@ -547,17 +540,6 @@ else if (crawlDefinition.entry_request != null) } } - @Override - final public void run() { - try { - runner(); - } catch (Exception e) { - LOGGER.error(e.getMessage(), e); - } finally { - webCrawlerManager.removeSession(this); - } - } - private abstract class CrawlProvider { protected final URI uri; diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java index 1ba9653..d73fffb 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java @@ 
-16,43 +16,29 @@ package com.qwazr.crawler.web; import com.qwazr.cluster.ClusterManager; +import com.qwazr.crawler.common.CrawlManager; import com.qwazr.scripts.ScriptManager; import com.qwazr.server.ApplicationBuilder; import com.qwazr.server.GenericServer; -import com.qwazr.server.ServerException; -import com.qwazr.utils.LockUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.ws.rs.core.Response.Status; import java.io.IOException; import java.net.URISyntaxException; -import java.util.HashMap; -import java.util.Map; -import java.util.TreeMap; import java.util.concurrent.ExecutorService; -public class WebCrawlerManager { +public class WebCrawlerManager extends CrawlManager { private static final Logger LOGGER = LoggerFactory.getLogger(WebCrawlerManager.class); - final String myAddress; - final ClusterManager clusterManager; final ScriptManager scriptManager; - private final ExecutorService executorService; - - private final LockUtils.ReadWriteLock rwlSessionMap = new LockUtils.ReadWriteLock(); - private final HashMap crawlSessionMap; private WebCrawlerServiceImpl service; public WebCrawlerManager(final ClusterManager clusterManager, final ScriptManager scriptManager, final ExecutorService executor) throws IOException, URISyntaxException { + super(clusterManager, executor, LOGGER); this.scriptManager = scriptManager; - this.clusterManager = clusterManager; - myAddress = clusterManager.getServiceBuilder().local().getStatus().me; - this.executorService = executor; - crawlSessionMap = new HashMap<>(); service = new WebCrawlerServiceImpl(this); } @@ -70,56 +56,9 @@ public WebCrawlerServiceInterface getService() { return service; } - TreeMap getSessions() { - return rwlSessionMap.read(() -> { - final TreeMap map = new TreeMap<>(); - for (Map.Entry entry : crawlSessionMap.entrySet()) - map.put(entry.getKey(), entry.getValue().getStatus()); - return map; - }); - } - - WebCrawlStatus getSession(final String sessionName) { - return 
rwlSessionMap.read(() -> { - final WebCrawlThread crawlThread = crawlSessionMap.get(sessionName); - if (crawlThread == null) - return null; - return crawlThread.getStatus(); - }); - } - - void abortSession(final String sessionName, final String abortingReason) throws ServerException { - rwlSessionMap.readEx(() -> { - final WebCrawlThread crawlThread = crawlSessionMap.get(sessionName); - if (crawlThread == null) - throw new ServerException(Status.NOT_FOUND, "Session not found: " + sessionName); - if (LOGGER.isInfoEnabled()) - LOGGER.info("Aborting session: " + sessionName + " - " + abortingReason); - crawlThread.abort(abortingReason); - }); - } - - WebCrawlStatus runSession(final String sessionName, final WebCrawlDefinition crawlJson) throws ServerException { - return rwlSessionMap.writeEx(() -> { - if (crawlSessionMap.containsKey(sessionName)) - throw new ServerException(Status.CONFLICT, "The session already exists: " + sessionName); - if (LOGGER.isInfoEnabled()) - LOGGER.info("Create session: " + sessionName); - - WebCrawlThread crawlThread = new WebCrawlThread(this, sessionName, crawlJson); - crawlSessionMap.put(sessionName, crawlThread); - executorService.execute(crawlThread); - return crawlThread.getStatus(); - }); - } - - void removeSession(final WebCrawlThread crawlThread) { - rwlSessionMap.writeEx(() -> { - final String sessionName = crawlThread.getSessionName(); - if (LOGGER.isInfoEnabled()) - LOGGER.info("Remove session: " + sessionName); - crawlSessionMap.remove(sessionName, crawlThread); - }); + @Override + protected WebCrawlThread newCrawlThread(String sessionName, WebCrawlDefinition crawlDef) { + return new WebCrawlThread(this, sessionName, crawlDef); } } diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java index 9d68547..0f1aa02 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java +++ 
b/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java @@ -1,5 +1,5 @@ -/** - * Copyright 2014-2016 Emmanuel Keller / QWAZR +/* + * Copyright 2015-2017 Emmanuel Keller / QWAZR *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ **/ package com.qwazr.crawler.web; -import com.qwazr.utils.ExceptionUtils; -import com.qwazr.utils.http.HttpResponseEntityException; -import com.qwazr.server.client.JsonMultiClientAbstract; +import com.qwazr.crawler.common.CrawlStatus; import com.qwazr.server.RemoteService; import com.qwazr.server.ServerException; +import com.qwazr.server.client.JsonMultiClientAbstract; +import com.qwazr.utils.ExceptionUtils; +import com.qwazr.utils.http.HttpResponseEntityException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,13 +45,13 @@ protected WebCrawlerSingleClient newClient(final RemoteService remote) { } @Override - public TreeMap getSessions(String group) { + public TreeMap getSessions(String group) { // We merge the result of all the nodes - TreeMap globalSessions = new TreeMap(); + TreeMap globalSessions = new TreeMap(); for (WebCrawlerSingleClient client : this) { try { - TreeMap localSessions = client.getSessions(group); + TreeMap localSessions = client.getSessions(group); if (localSessions == null) continue; globalSessions.putAll(localSessions); @@ -62,7 +63,7 @@ public TreeMap getSessions(String group) { } @Override - public WebCrawlStatus getSession(String session_name, String group) { + public CrawlStatus getSession(String session_name, String group) { for (WebCrawlerSingleClient client : this) { try { @@ -93,7 +94,7 @@ public Response abortSession(String session_name, String reason, String group) { } @Override - public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { + public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { final ExceptionUtils.Holder exceptionHolder = new ExceptionUtils.Holder(logger); for (WebCrawlerSingleClient client : this) { try { @@ -112,7 +113,7 @@ public WebCrawlStatus 
runSession(final String session_name, final WebCrawlDefini } @Override - public WebCrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException { + public CrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException { return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition)); } diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java index 30f8ace..986b3d3 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java @@ -15,6 +15,7 @@ **/ package com.qwazr.crawler.web; +import com.qwazr.crawler.common.CrawlStatus; import com.qwazr.server.AbstractServiceImpl; import com.qwazr.server.ServerException; import org.slf4j.Logger; @@ -45,7 +46,7 @@ public void init() { } @Override - public TreeMap getSessions(final String group) { + public TreeMap getSessions(final String group) { // Read the sessions in the local node if (!webrawlerManager.clusterManager.isGroup(group)) return new TreeMap<>(); @@ -53,10 +54,10 @@ public TreeMap getSessions(final String group) { } @Override - public WebCrawlStatus getSession(final String session_name, final String group) { + public CrawlStatus getSession(final String session_name, final String group) { try { - final WebCrawlStatus status = - webrawlerManager.clusterManager.isGroup(group) ? webrawlerManager.getSession(session_name) : null; + final CrawlStatus status = webrawlerManager.clusterManager.isGroup(group) ? 
webrawlerManager.getSession( + session_name) : null; if (status != null) return status; throw new ServerException(Status.NOT_FOUND, "Session not found"); @@ -78,7 +79,7 @@ public Response abortSession(final String session_name, final String reason, fin } @Override - public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { + public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { try { return webrawlerManager.runSession(session_name, crawlDefinition); } catch (ServerException e) { @@ -86,7 +87,7 @@ public WebCrawlStatus runSession(final String session_name, final WebCrawlDefini } } - public WebCrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException { + public CrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException { return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition)); } diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java index 3428cce..4c61401 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java @@ -16,6 +16,7 @@ package com.qwazr.crawler.web; import com.fasterxml.jackson.core.type.TypeReference; +import com.qwazr.crawler.common.CrawlStatus; import com.qwazr.server.ServiceInterface; import javax.annotation.security.RolesAllowed; @@ -40,12 +41,12 @@ public interface WebCrawlerServiceInterface extends ServiceInterface { @GET @Path("/sessions") @Produces(ServiceInterface.APPLICATION_JSON_UTF8) - TreeMap getSessions(@QueryParam("group") String group); + TreeMap getSessions(@QueryParam("group") String group); @GET @Path("/sessions/{session_name}") @Produces(ServiceInterface.APPLICATION_JSON_UTF8) - WebCrawlStatus getSession(@PathParam("session_name") String 
session_name, @QueryParam("group") String group); + CrawlStatus getSession(@PathParam("session_name") String session_name, @QueryParam("group") String group); @DELETE @Path("/sessions/{session_name}") @@ -56,12 +57,12 @@ Response abortSession(@PathParam("session_name") String session_name, @QueryPara @Path("/sessions/{session_name}") @Consumes(ServiceInterface.APPLICATION_JSON_UTF8) @Produces(ServiceInterface.APPLICATION_JSON_UTF8) - WebCrawlStatus runSession(@PathParam("session_name") String session_name, WebCrawlDefinition crawlDefinition); + CrawlStatus runSession(@PathParam("session_name") String session_name, WebCrawlDefinition crawlDefinition); - WebCrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException; + CrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException; - TypeReference> TreeMapStringCrawlTypeRef = - new TypeReference>() { + TypeReference> TreeMapStringCrawlTypeRef = + new TypeReference>() { }; } diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java index 65a08f4..a4dfa78 100644 --- a/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java +++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java @@ -1,5 +1,5 @@ -/** - * Copyright 2014-2016 Emmanuel Keller / QWAZR +/* + * Copyright 2015-2017 Emmanuel Keller / QWAZR *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ **/ package com.qwazr.crawler.web; +import com.qwazr.crawler.common.CrawlStatus; import com.qwazr.server.RemoteService; import com.qwazr.server.client.JsonClientAbstract; import com.qwazr.utils.UBuilder; @@ -31,19 +32,19 @@ class WebCrawlerSingleClient extends JsonClientAbstract implements WebCrawlerSer } @Override - public TreeMap getSessions(final String group) { - final UBuilder uriBuilder = - RemoteService.getNewUBuilder(remote, "/crawler/web/sessions").setParameter("group", group); + public TreeMap getSessions(final String group) { + final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions").setParameter("group", + group); HttpRequest request = HttpRequest.Get(uriBuilder.buildNoEx()); return executeJson(request, null, null, TreeMapStringCrawlTypeRef, valid200Json); } @Override - public WebCrawlStatus getSession(final String session_name, final String group) { + public CrawlStatus getSession(final String session_name, final String group) { final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions/", session_name) .setParameter("group", group); HttpRequest request = HttpRequest.Get(uriBuilder.buildNoEx()); - return executeJson(request, null, null, WebCrawlStatus.class, valid200Json); + return executeJson(request, null, null, CrawlStatus.class, valid200Json); } @Override @@ -57,14 +58,14 @@ public Response abortSession(final String session_name, final String reason, fin } @Override - public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { + public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) { final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions/", session_name); final HttpRequest request = 
HttpRequest.Post(uriBuilder.buildNoEx()); - return executeJson(request, crawlDefinition, null, WebCrawlStatus.class, valid200202Json); + return executeJson(request, crawlDefinition, null, CrawlStatus.class, valid200202Json); } @Override - public WebCrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException { + public CrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException { return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition)); } diff --git a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java index 89d1171..f59b6f3 100644 --- a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java +++ b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java @@ -15,8 +15,8 @@ **/ package com.qwazr.crawler.web.test; +import com.qwazr.crawler.common.CrawlStatus; import com.qwazr.crawler.web.WebCrawlDefinition; -import com.qwazr.crawler.web.WebCrawlStatus; import com.qwazr.crawler.web.WebCrawlerServer; import com.qwazr.crawler.web.WebCrawlerServiceBuilder; import com.qwazr.crawler.web.WebCrawlerServiceInterface; @@ -31,8 +31,8 @@ import org.junit.Test; import org.junit.runners.MethodSorters; -import java.util.TreeMap; import java.util.HashMap; +import java.util.TreeMap; import java.util.concurrent.TimeUnit; @FixMethodOrder(MethodSorters.NAME_ASCENDING) @@ -62,7 +62,7 @@ public void test100startServer() throws Exception { @Test public void test200emptySessions() { - TreeMap sessions = remote.getSessions(null); + TreeMap sessions = remote.getSessions(null); Assert.assertNotNull(sessions); Assert.assertTrue(sessions.isEmpty()); } @@ -81,7 +81,7 @@ private WebCrawlDefinition getNewWebCrawl() { private void crawlWait(final String sessionName, final int crawlCount) throws InterruptedException { WaitFor.of().timeOut(TimeUnit.MINUTES, 2).until(() -> { - WebCrawlStatus status = 
ErrorWrapper.bypass(() -> remote.getSession(sessionName, null), 404); + CrawlStatus status = ErrorWrapper.bypass(() -> remote.getSession(sessionName, null), 404); if (status == null) return false; return status.urls.crawled == crawlCount; @@ -101,7 +101,7 @@ public void test400CrawlEvent() throws InterruptedException { final WebCrawlDefinition webCrawl = getNewWebCrawl(); webCrawl.scripts = new HashMap<>(); webCrawl.scripts.put(WebCrawlDefinition.EventEnum.before_crawl, - new WebCrawlDefinition.Script(BeforeCrawl.class.getName())); + new WebCrawlDefinition.Script(BeforeCrawl.class.getName())); remote.runSession(sessionName, webCrawl); crawlWait(sessionName, 3); Assert.assertEquals(4, BeforeCrawl.count.get());