diff --git a/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java b/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java
new file mode 100644
index 0000000..ca77614
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/common/CrawlDefinition.java
@@ -0,0 +1,131 @@
+/**
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+package com.qwazr.crawler.common;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
+
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+@JsonInclude(Include.NON_EMPTY)
+public class CrawlDefinition implements Cloneable {
+ // Common crawl settings: the shared variables and the scripts fired on crawl events.
+ /**
+ * The global variables shared by all the scripts.
+ */
+ public LinkedHashMap variables = null;
+
+ /**
+ * The scripts to run, keyed by the event which fires each of them.
+ */
+ public Map scripts = null;
+ // The crawl events a script can be attached to.
+ public enum EventEnum {
+
+ /**
+ * Executed before the crawl session starts
+ */
+ before_session,
+
+ /**
+ * Executed after the crawl session ends
+ */
+ after_session,
+
+ /**
+ * Executed before a URL is crawled
+ */
+ before_crawl,
+
+ /**
+ * Executed after a URL has been crawled
+ */
+ after_crawl
+ }
+ /** A script reference: its path and the local variables passed to it. */
+ @JsonInclude(Include.NON_EMPTY)
+ public static class Script implements Cloneable {
+
+ /**
+ * The path to the script
+ */
+ public String name = null;
+
+ /**
+ * The local variables passed to the script
+ */
+ public Map variables = null;
+ /** Creates an empty script reference; name and variables stay null. */
+ public Script() {
+ }
+ /** Creates a reference to the script at the given path, without local variables. */
+ public Script(String name) {
+ this.name = name;
+ }
+ /** Copy constructor: keeps the name and copies the variables into a new HashMap (the values are not cloned). */
+ protected Script(Script src) {
+ this.name = src.name;
+ this.variables = src.variables == null ? null : new HashMap(src.variables);
+ }
+ /** @return a copy built by the copy constructor, not by Object.clone() */
+ @Override
+ final public Object clone() {
+ return new Script(this);
+ }
+ /** Adds a local variable, creating the map on first use, and returns this script for chaining. */
+ public Script addVariable(String name, String value) {
+ if (variables == null)
+ variables = new HashMap<>();
+ variables.put(name, value);
+ return this;
+ }
+
+ }
+ /** Creates an empty definition; variables and scripts stay null. */
+ public CrawlDefinition() {
+ }
+ /** Copy constructor: copies the variables into a new map and rebuilds each Script through its copy constructor. */
+ protected CrawlDefinition(CrawlDefinition src) {
+ variables = src.variables == null ? null : new LinkedHashMap<>(src.variables);
+ if (src.scripts == null) {
+ scripts = null;
+ } else {
+ scripts = new HashMap<>(); // NOTE(review): addScript creates a LinkedHashMap; this copy does not keep insertion order - confirm intended
+ src.scripts.forEach((eventEnum, script) -> scripts.put(eventEnum, new Script(script)));
+ }
+ }
+ /** @return a copy built by the copy constructor, not by Object.clone() */
+ public Object clone() {
+ return new CrawlDefinition(this);
+ }
+ /** Registers a new script for the event with the given name (resolved with EventEnum.valueOf) and returns that script. */
+ @JsonIgnore
+ public Script addScript(final String event, final String name) {
+ if (scripts == null)
+ scripts = new LinkedHashMap<>();
+ Script script = new Script(name);
+ scripts.put(EventEnum.valueOf(event), script);
+ return script;
+ }
+ /** @return the event-to-script map, or null when none has been set */
+ public Map getScripts() {
+ return scripts;
+ }
+
+}
diff --git a/src/main/java/com/qwazr/crawler/common/CrawlManager.java b/src/main/java/com/qwazr/crawler/common/CrawlManager.java
new file mode 100644
index 0000000..17aad62
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/common/CrawlManager.java
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.qwazr.crawler.common;
+
+import com.qwazr.cluster.ClusterManager;
+import com.qwazr.server.ServerException;
+import org.slf4j.Logger;
+
+import javax.ws.rs.core.Response;
+import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+public abstract class CrawlManager, D extends CrawlDefinition> { // Registry of the running crawl sessions, keyed by session name.
+ private final ConcurrentHashMap crawlSessionMap;
+ private final ExecutorService executorService;
+ private final Logger logger;
+ public final String myAddress;
+ public final ClusterManager clusterManager;
+ /** Starts with an empty registry; myAddress is read from the status of the local cluster service. */
+ protected CrawlManager(final ClusterManager clusterManager, final ExecutorService executorService,
+ final Logger logger) {
+ crawlSessionMap = new ConcurrentHashMap<>();
+ this.clusterManager = clusterManager;
+ this.myAddress = clusterManager.getServiceBuilder().local().getStatus().me;
+ this.executorService = executorService;
+ this.logger = logger;
+ }
+ /** @return a new map holding the status of every registered session, sorted by session name */
+ public TreeMap getSessions() {
+ final TreeMap map = new TreeMap<>();
+ crawlSessionMap.forEach((key, crawl) -> map.put(key, crawl.getStatus()));
+ return map;
+ }
+ /** @return the status of the named session, or null when no such session is registered */
+ public CrawlStatus getSession(final String sessionName) {
+ final T crawlThread = crawlSessionMap.get(sessionName);
+ return crawlThread == null ? null : crawlThread.getStatus();
+ }
+ /** Asks the named session to abort; throws a NOT_FOUND ServerException when the session is not registered. */
+ public void abortSession(final String sessionName, final String abortingReason) throws ServerException {
+ final T crawlThread = crawlSessionMap.get(sessionName);
+ if (crawlThread == null)
+ throw new ServerException(Response.Status.NOT_FOUND, "Session not found: " + sessionName);
+ logger.info("Aborting session: {} - {}", sessionName, abortingReason);
+ crawlThread.abort(abortingReason);
+ }
+ /** Builds the crawl thread which runSession registers and then submits to the executor. */
+ protected abstract T newCrawlThread(final String sessionName, final D crawlDefinition);
+ /** Registers and starts a session; CONFLICT when the name is taken. A thread the executor refuses is unregistered again. */
+ public CrawlStatus runSession(final String sessionName, final D crawlDefinition) throws ServerException {
+ final AtomicBoolean newThread = new AtomicBoolean(false);
+ final T crawlThread = crawlSessionMap.computeIfAbsent(sessionName, key -> {
+ logger.info("Create session: {}", sessionName);
+ newThread.set(true);
+ return newCrawlThread(sessionName, crawlDefinition);
+ });
+ if (!newThread.get())
+ throw new ServerException(Response.Status.CONFLICT, "The session already exists: " + sessionName);
+ try {
+ executorService.execute(crawlThread);
+ } catch (RuntimeException e) {
+ crawlSessionMap.remove(sessionName, crawlThread); // run() will never fire its cleanup, so free the name here
+ throw e;
+ }
+ return crawlThread.getStatus();
+ }
+ /** Unregisters the thread, but only if it is still the one mapped to its session name. */
+ public void removeSession(final T crawlThread) {
+ logger.info("Remove session: {}", crawlThread.getSessionName());
+ crawlSessionMap.remove(crawlThread.getSessionName(), crawlThread);
+ }
+}
diff --git a/src/main/java/com/qwazr/crawler/web/CurrentSession.java b/src/main/java/com/qwazr/crawler/common/CrawlSession.java
similarity index 97%
rename from src/main/java/com/qwazr/crawler/web/CurrentSession.java
rename to src/main/java/com/qwazr/crawler/common/CrawlSession.java
index 1cb3105..5cec3d3 100644
--- a/src/main/java/com/qwazr/crawler/web/CurrentSession.java
+++ b/src/main/java/com/qwazr/crawler/common/CrawlSession.java
@@ -13,13 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
**/
-package com.qwazr.crawler.web;
+package com.qwazr.crawler.common;
import com.qwazr.utils.TimeTracker;
import java.util.Map;
-public interface CurrentSession {
+public interface CrawlSession {
Map getVariables();
diff --git a/src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java b/src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java
similarity index 73%
rename from src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java
rename to src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java
index 6530e2d..d2a4b3c 100644
--- a/src/main/java/com/qwazr/crawler/web/CurrentSessionImpl.java
+++ b/src/main/java/com/qwazr/crawler/common/CrawlSessionImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
* Copyright 2015-2017 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,18 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
**/
-package com.qwazr.crawler.web;
+package com.qwazr.crawler.common;
import com.qwazr.utils.TimeTracker;
-import org.apache.commons.collections4.CollectionUtils;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
-class CurrentSessionImpl implements CurrentSession {
+public class CrawlSessionImpl implements CrawlSession {
- private final WebCrawlDefinition crawlDefinition;
+ private final T crawlDefinition;
private final String name;
private final AtomicBoolean abort;
private final TimeTracker timeTracker;
@@ -37,19 +36,20 @@ class CurrentSessionImpl implements CurrentSession {
private volatile Integer currentDepth = null;
private volatile String abortingReason = null;
- CurrentSessionImpl(WebCrawlDefinition crawlDefinition, String name, TimeTracker timeTracker) {
+ public CrawlSessionImpl(T crawlDefinition, String name) {
this.crawlDefinition = crawlDefinition;
- this.timeTracker = timeTracker;
+ this.timeTracker = new TimeTracker();
this.name = name;
abort = new AtomicBoolean(false);
this.variables = new ConcurrentHashMap<>();
- if (crawlDefinition.variables != null)
- for (Map.Entry entry : crawlDefinition.variables.entrySet())
- if (entry.getKey() != null && entry.getValue() != null)
- this.variables.put(entry.getKey(), entry.getValue());
+ if (crawlDefinition.variables != null) {
+ crawlDefinition.variables.forEach((key, value) -> {
+ if (key != null && value != null)
+ this.variables.put(key, value);
+ });
+ }
}
- @Override
public Map getVariables() {
return variables;
}
@@ -59,7 +59,10 @@ public Object getVariable(String name) {
return variables.get(name);
}
- @Override
+ /**
+ * @param name the name of the variable
+ * @return the value of the variable
+ */
public Object setVariable(String name, Object value) {
if (value == null)
return removeVariable(name);
@@ -104,7 +107,7 @@ public String getAbortingReason() {
return abortingReason;
}
- synchronized int incIgnoredCount() {
+ public synchronized int incIgnoredCount() {
return ++ignoredCount;
}
@@ -113,7 +116,7 @@ public Integer getIgnoredCount() {
return ignoredCount;
}
- synchronized int incCrawledCount() {
+ public synchronized int incCrawledCount() {
return ++crawledCount;
}
@@ -122,7 +125,7 @@ public Integer getCrawledCount() {
return crawledCount;
}
- synchronized int incErrorCount() {
+ public synchronized int incErrorCount() {
return ++errorCount;
}
@@ -146,20 +149,15 @@ public Integer getCurrentDepth() {
return currentDepth;
}
- synchronized void setCurrentURI(String currentURI, Integer currentDepth) {
+ public synchronized void setCurrentURI(String currentURI, Integer currentDepth) {
this.currentURI = currentURI;
this.currentDepth = currentDepth;
}
- public WebCrawlDefinition getCrawlDefinition() {
+ public T getCrawlDefinition() {
return crawlDefinition;
}
- public boolean isURLPatterns() {
- return crawlDefinition != null && !CollectionUtils.isEmpty(crawlDefinition.inclusion_patterns)
- && !CollectionUtils.isEmpty(crawlDefinition.exclusion_patterns);
- }
-
public TimeTracker getTimeTracker() {
return timeTracker;
}
diff --git a/src/main/java/com/qwazr/crawler/common/CrawlStatus.java b/src/main/java/com/qwazr/crawler/common/CrawlStatus.java
new file mode 100644
index 0000000..70a6506
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/common/CrawlStatus.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+package com.qwazr.crawler.common;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.qwazr.utils.TimeTracker;
+
+@JsonInclude(Include.NON_NULL)
+public class CrawlStatus {
+ // Immutable status report of a crawl session; all fields are final and set by the constructors below.
+ final public String entry_url;
+ final public String node_address;
+ final public Boolean aborting;
+ final public String aborting_reason;
+ final public TimeTracker.Status timer;
+ final public UrlStatus urls;
+ /** Jackson creator: each argument is bound to the JSON property named in its @JsonProperty annotation. */
+ @JsonCreator
+ public CrawlStatus(@JsonProperty("node_address") String nodeAddress, @JsonProperty("aborting") Boolean aborting,
+ @JsonProperty("aborting_reason") String abortingReason, @JsonProperty("entry_url") String entryUrl,
+ @JsonProperty("timer") TimeTracker.Status timer, @JsonProperty("urls") UrlStatus urlStatus) {
+ this.node_address = nodeAddress;
+ this.timer = timer;
+ this.aborting = aborting;
+ this.aborting_reason = abortingReason;
+ this.entry_url = entryUrl;
+ this.urls = urlStatus;
+ }
+ /** Builds the status from a session; the timer is null when the session has no time tracker. */
+ public CrawlStatus(final String nodeAddress, final String entryUrl, final CrawlSession session) {
+ this(nodeAddress, session.isAborting(), session.getAbortingReason(), entryUrl,
+ session.getTimeTracker() == null ? null : session.getTimeTracker().getStatus(), new UrlStatus(session));
+
+ }
+ /** Crawl counters (crawled, ignored, error) together with the current URI and depth. */
+ @JsonInclude(Include.NON_EMPTY)
+ public static class UrlStatus {
+
+ final public int crawled;
+ final public int ignored;
+ final public int error;
+ final public String current_uri;
+ final public Integer current_depth;
+ /** Jackson creator: a null counter becomes 0; current_uri and current_depth are stored as given. */
+ @JsonCreator
+ UrlStatus(@JsonProperty("crawled") Integer crawled, @JsonProperty("ignored") Integer ignored,
+ @JsonProperty("error") Integer error, @JsonProperty("current_uri") String currentUri,
+ @JsonProperty("current_depth") Integer currentDepth) {
+ this.crawled = crawled == null ? 0 : crawled;
+ this.ignored = ignored == null ? 0 : ignored;
+ this.error = error == null ? 0 : error;
+ this.current_uri = currentUri;
+ this.current_depth = currentDepth;
+ }
+ /** Reads the counters, the current URI and the current depth from the session. */
+ private UrlStatus(CrawlSession session) {
+ this(session.getCrawledCount(), session.getIgnoredCount(), session.getErrorCount(), session.getCurrentURI(),
+ session.getCurrentDepth());
+ }
+ }
+
+}
diff --git a/src/main/java/com/qwazr/crawler/common/CrawlThread.java b/src/main/java/com/qwazr/crawler/common/CrawlThread.java
new file mode 100644
index 0000000..7fff6a9
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/common/CrawlThread.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.qwazr.crawler.common;
+
+import org.slf4j.Logger;
+
+public abstract class CrawlThread implements Runnable {
+ // Base Runnable of a crawl session: run() calls runner(), logs a thrown Exception, then unregisters from the manager.
+ protected final M manager;
+ protected final CrawlSessionImpl session;
+ protected final Logger logger;
+ /** Stores the owning manager, the session state and the logger used by run() to report failures. */
+ protected CrawlThread(final M manager, final CrawlSessionImpl session, final Logger logger) {
+ this.manager = manager;
+ this.session = session;
+ this.logger = logger;
+ }
+ /** @return the name of the session held by this thread */
+ public String getSessionName() {
+ return session.getName();
+ }
+ /** The crawl itself, supplied by subclasses; an Exception it throws is logged by run(). */
+ protected abstract void runner() throws Exception;
+ /** Runs the crawl, logs an Exception if one escapes, and always removes this thread from the manager. */
+ @Override
+ final public void run() {
+ try {
+ runner();
+ } catch (Exception e) {
+ logger.error(e.getMessage(), e);
+ } finally {
+ manager.removeSession(this);
+ }
+ }
+ /** @return the current status of the crawl session, as built by the subclass */
+ public abstract CrawlStatus getStatus();
+ /** Forwards the abort request and its reason to the session. */
+ public void abort(final String abortingReason) {
+ session.abort(abortingReason);
+
+ }
+}
diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java b/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java
new file mode 100644
index 0000000..a82763f
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/file/FileCrawlDefinition.java
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+package com.qwazr.crawler.file;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonInclude.Include;
+import com.qwazr.crawler.common.CrawlDefinition;
+import com.qwazr.utils.StringUtils;
+import com.qwazr.utils.json.JsonMapper;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+@JsonInclude(Include.NON_EMPTY)
+public class FileCrawlDefinition extends CrawlDefinition {
+ // Adds an entry path, exclusion patterns and a wait time to the common CrawlDefinition settings.
+ /**
+ * The entry point PATH of the crawl.
+ */
+ public String entry_path = null;
+
+ /**
+ * A list of regular expression patterns. A URL may not be crawled if it
+ * matches any pattern.
+ */
+ public List exclusion_patterns = null;
+
+ /**
+ * Time to wait, in milliseconds, after a successful crawl
+ */
+ public Integer crawl_wait_ms = null;
+ /** Creates an empty definition; every setting stays null. */
+ public FileCrawlDefinition() {
+ }
+ /** Copy constructor: copies the inherited settings, the entry path, the patterns (into a new list) and the wait time. */
+ protected FileCrawlDefinition(FileCrawlDefinition src) {
+ super(src);
+ entry_path = src.entry_path;
+ exclusion_patterns = src.exclusion_patterns == null ? null : new ArrayList<>(src.exclusion_patterns);
+ crawl_wait_ms = src.crawl_wait_ms;
+ }
+ /** @return a copy built by the copy constructor */
+ public Object clone() {
+ return new FileCrawlDefinition(this);
+ }
+ /** Sets the entry path and returns this definition for chaining. */
+ @JsonIgnore
+ public FileCrawlDefinition setEntryPath(final String entryPath) {
+ this.entry_path = entryPath;
+ return this;
+ }
+ /** @return the entry path, or null when none has been set */
+ @JsonIgnore
+ public String getEntryPath() {
+ return this.entry_path;
+ }
+ /** Replaces the patterns with what StringUtils.linesCollector reads from the text (presumably one per line); null clears them. */
+ @JsonIgnore
+ public FileCrawlDefinition setExclusionPattern(final String exclusionPatternText) throws IOException {
+ if (exclusionPatternText == null) {
+ exclusion_patterns = null;
+ return this;
+ }
+ exclusion_patterns = new ArrayList<>();
+ StringUtils.linesCollector(exclusionPatternText, false, exclusion_patterns);
+ return this;
+ }
+ /** Appends one exclusion pattern, creating the list on first use, and returns this definition for chaining. */
+ @JsonIgnore
+ public FileCrawlDefinition addExclusionPattern(final String exclusionPattern) {
+ if (exclusion_patterns == null)
+ exclusion_patterns = new ArrayList<>();
+ exclusion_patterns.add(exclusionPattern);
+ return this;
+ }
+ /** @return the exclusion patterns, or null when none has been set */
+ @JsonIgnore
+ public Collection getExclusionPatterns() {
+ return exclusion_patterns;
+ }
+ /** Sets the wait time in milliseconds and returns this definition for chaining. */
+ @JsonIgnore
+ public FileCrawlDefinition setCrawlWaitMs(Integer crawlWaitMs) {
+ this.crawl_wait_ms = crawlWaitMs;
+ return this;
+ }
+ /** @return the wait time in milliseconds, or null when none has been set */
+ @JsonIgnore
+ public Integer getCrawlWaitMs() {
+ return crawl_wait_ms;
+ }
+ /** Reads a FileCrawlDefinition from a JSON string with JsonMapper.MAPPER. */
+ @JsonIgnore
+ public static FileCrawlDefinition newInstance(final String json) throws IOException {
+ return JsonMapper.MAPPER.readValue(json, FileCrawlDefinition.class);
+ }
+
+}
diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java b/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java
new file mode 100644
index 0000000..a6f6a8d
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/file/FileCrawlThread.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+package com.qwazr.crawler.file;
+
+import com.qwazr.crawler.common.CrawlSessionImpl;
+import com.qwazr.crawler.common.CrawlStatus;
+import com.qwazr.crawler.common.CrawlThread;
+import org.slf4j.Logger;
+
+public class FileCrawlThread extends CrawlThread {
+ // Crawl thread bound to a FileCrawlDefinition; runner() is an empty stub here, so no crawling is performed yet.
+ private final FileCrawlDefinition crawlDefinition;
+ /** Passes everything to CrawlThread and keeps a direct reference to the session's crawl definition. */
+ public FileCrawlThread(FileCrawlerManager manager, CrawlSessionImpl session, Logger logger) {
+ super(manager, session, logger);
+ crawlDefinition = session.getCrawlDefinition();
+ }
+ /** Not implemented yet: the body is empty. */
+ @Override
+ protected void runner() throws Exception {
+
+ }
+ /** @return a status built from the manager address, the entry path of the definition and the session */
+ @Override
+ public CrawlStatus getStatus() {
+ return new CrawlStatus(manager.myAddress, crawlDefinition.entry_path, session);
+ }
+}
diff --git a/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java b/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java
new file mode 100644
index 0000000..098500d
--- /dev/null
+++ b/src/main/java/com/qwazr/crawler/file/FileCrawlerManager.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2017 Emmanuel Keller / QWAZR
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ **/
+package com.qwazr.crawler.file;
+
+import com.qwazr.cluster.ClusterManager;
+import com.qwazr.crawler.common.CrawlManager;
+import com.qwazr.crawler.common.CrawlSessionImpl;
+import com.qwazr.crawler.web.WebCrawlerManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.concurrent.ExecutorService;
+
+public class FileCrawlerManager extends CrawlManager {
+ /** Logger handed to every FileCrawlThread; named after this class so file crawl logs are not attributed to WebCrawlerManager. */
+ private static final Logger LOGGER = LoggerFactory.getLogger(FileCrawlerManager.class);
+ /** Passes the cluster manager, the executor and the logger through to CrawlManager. */
+ protected FileCrawlerManager(ClusterManager clusterManager, ExecutorService executorService, Logger logger) {
+ super(clusterManager, executorService, logger);
+ }
+ /** Wraps the definition in a new CrawlSessionImpl and binds it to a new FileCrawlThread. */
+ @Override
+ protected FileCrawlThread newCrawlThread(String sessionName, FileCrawlDefinition crawlDefinition) {
+ return new FileCrawlThread(this, new CrawlSessionImpl<>(crawlDefinition, sessionName), LOGGER);
+ }
+}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java b/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java
index 0042183..d304c38 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java
@@ -1,5 +1,5 @@
/**
- * Copyright 2014-2016 Emmanuel Keller / QWAZR
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
+import com.qwazr.crawler.common.CrawlDefinition;
import com.qwazr.crawler.web.driver.BrowserDriverEnum;
import com.qwazr.utils.StringUtils;
import com.qwazr.utils.json.JsonMapper;
@@ -26,10 +27,14 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
@JsonInclude(Include.NON_EMPTY)
-public class WebCrawlDefinition implements Cloneable {
+public class WebCrawlDefinition extends CrawlDefinition {
/**
* URL called before a crawl session starts
@@ -238,82 +243,11 @@ public ProxyDefinition setHttpProxy(String http_proxy) {
public Integer page_load_timeout = null;
- /**
- * The global variables shared by all the scripts.
- */
- public LinkedHashMap variables = null;
-
- /**
- * A list of scripts paths mapped with the events which fire the scripts.
- */
- public Map scripts = null;
-
- public enum EventEnum {
-
- /**
- * Executed before the crawl session start
- */
- before_session,
-
- /**
- * Executed after the crawl session ends
- */
- after_session,
-
- /**
- * Executed before an URL is crawled
- */
- before_crawl,
-
- /**
- * Executed after an URL has been crawled
- */
- after_crawl
- }
-
- @JsonInclude(Include.NON_EMPTY)
- public static class Script implements Cloneable {
-
- /**
- * The path to the scripts
- */
- public String name = null;
-
- /**
- * The local variables passed to the scripts
- */
- public Map variables = null;
-
- public Script() {
- }
-
- public Script(String name) {
- this.name = name;
- }
-
- protected Script(Script src) {
- this.name = src.name;
- this.variables = src.variables == null ? null : new HashMap(src.variables);
- }
-
- @Override
- final public Object clone() {
- return new Script(this);
- }
-
- public Script addVariable(String name, String value) {
- if (variables == null)
- variables = new HashMap<>();
- variables.put(name, value);
- return this;
- }
-
- }
-
public WebCrawlDefinition() {
}
protected WebCrawlDefinition(WebCrawlDefinition src) {
+ super(src);
pre_url = src.pre_url;
entry_url = src.entry_url;
entry_request = src.entry_request;
@@ -340,13 +274,6 @@ protected WebCrawlDefinition(WebCrawlDefinition src) {
script_timeout = src.script_timeout;
page_load_timeout = src.page_load_timeout;
crawl_wait_ms = src.crawl_wait_ms;
- variables = src.variables == null ? null : new LinkedHashMap<>(src.variables);
- if (src.scripts == null) {
- scripts = null;
- } else {
- scripts = new HashMap<>();
- src.scripts.forEach((eventEnum, script) -> scripts.put(eventEnum, new Script(script)));
- }
}
public Object clone() {
@@ -673,19 +600,6 @@ public WebCrawlDefinition setCookies(final Map cookies) {
return this;
}
- @JsonIgnore
- public Script addScript(final String event, final String name) {
- if (scripts == null)
- scripts = new LinkedHashMap<>();
- Script script = new Script(name);
- scripts.put(EventEnum.valueOf(event), script);
- return script;
- }
-
- public Map getScripts() {
- return scripts;
- }
-
@JsonIgnore
public String urlEncode(final String value) throws UnsupportedEncodingException {
return URLEncoder.encode(value, "UTF-8");
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java b/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java
deleted file mode 100644
index 28c425c..0000000
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlStatus.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Copyright 2014-2016 Emmanuel Keller / QWAZR
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- **/
-package com.qwazr.crawler.web;
-
-import com.fasterxml.jackson.annotation.JsonInclude;
-import com.fasterxml.jackson.annotation.JsonInclude.Include;
-import com.qwazr.utils.TimeTracker;
-import com.qwazr.crawler.web.CurrentSession;
-
-@JsonInclude(Include.NON_NULL)
-public class WebCrawlStatus {
-
- final public String node_address;
- final public String entry_url;
- final public Boolean aborting;
- final public String aborting_reason;
- final public UrlStatus urls;
- final public TimeTracker.Status timer;
-
- public WebCrawlStatus() {
- node_address = null;
- timer = null;
- entry_url = null;
- aborting = null;
- aborting_reason = null;
- urls = null;
- }
-
- public WebCrawlStatus(String node_address, String entry_url, CurrentSession session) {
- this.node_address = node_address;
- this.entry_url = entry_url;
- TimeTracker tt = session.getTimeTracker();
- this.timer = tt == null ? null : tt.getStatus();
- this.aborting = session.isAborting();
- this.aborting_reason = session.getAbortingReason();
- this.urls = new UrlStatus(session);
- }
-
- @JsonInclude(Include.NON_EMPTY)
- public static class UrlStatus {
-
- final public int crawled;
- final public int ignored;
- final public int error;
- final public String current_uri;
- final public Integer current_depth;
-
- public UrlStatus() {
- crawled = 0;
- ignored = 0;
- error = 0;
- current_uri = null;
- current_depth = null;
- }
-
- private UrlStatus(CurrentSession session) {
- this.crawled = session.getCrawledCount();
- this.ignored = session.getIgnoredCount();
- this.error = session.getErrorCount();
- this.current_uri = session.getCurrentURI();
- this.current_depth = session.getCurrentDepth();
- }
- }
-}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java b/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java
index 6b5d0e1..82876d6 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlThread.java
@@ -1,5 +1,5 @@
/**
- * Copyright 2014-2016 Emmanuel Keller / QWAZR
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,10 +16,13 @@
package com.qwazr.crawler.web;
import com.google.common.net.InternetDomainName;
+import com.qwazr.crawler.common.CrawlDefinition.EventEnum;
+import com.qwazr.crawler.common.CrawlDefinition.Script;
+import com.qwazr.crawler.common.CrawlSessionImpl;
+import com.qwazr.crawler.common.CrawlStatus;
+import com.qwazr.crawler.common.CrawlThread;
import com.qwazr.crawler.web.driver.BrowserDriver;
import com.qwazr.crawler.web.driver.BrowserDriverBuilder;
-import com.qwazr.crawler.web.WebCrawlDefinition.EventEnum;
-import com.qwazr.crawler.web.WebCrawlDefinition.Script;
import com.qwazr.scripts.ScriptRunThread;
import com.qwazr.server.ServerException;
import com.qwazr.utils.RegExpUtils;
@@ -55,12 +58,10 @@
import java.util.regex.Matcher;
import java.util.regex.PatternSyntaxException;
-public class WebCrawlThread implements Runnable {
+public class WebCrawlThread extends CrawlThread {
private static final Logger LOGGER = LoggerFactory.getLogger(WebCrawlThread.class);
- private final WebCrawlerManager webCrawlerManager;
- private final CurrentSessionImpl session;
private final WebCrawlDefinition crawlDefinition;
private final InternetDomainName internetDomainName;
@@ -78,10 +79,9 @@ public class WebCrawlThread implements Runnable {
WebCrawlThread(final WebCrawlerManager webCrawlerManager, final String sessionName,
final WebCrawlDefinition crawlDefinition) throws ServerException {
- timeTracker = new TimeTracker();
- this.webCrawlerManager = webCrawlerManager;
- this.session = new CurrentSessionImpl(crawlDefinition, sessionName, timeTracker);
+ super(webCrawlerManager, new CrawlSessionImpl<>(crawlDefinition, sessionName), LOGGER);
this.crawlDefinition = crawlDefinition;
+ this.timeTracker = session.getTimeTracker();
if (crawlDefinition.browser_type == null)
throw new ServerException(Status.NOT_ACCEPTABLE, "The browser_type is missing");
if (crawlDefinition.entry_url == null && crawlDefinition.entry_request == null)
@@ -115,16 +115,9 @@ public class WebCrawlThread implements Runnable {
}
}
- String getSessionName() {
- return session.getName();
- }
-
- WebCrawlStatus getStatus() {
- return new WebCrawlStatus(webCrawlerManager.myAddress, crawlDefinition.entry_url, session);
- }
-
- void abort(String reason) {
- session.abort(reason);
+ @Override
+ public CrawlStatus getStatus() { // Reports this crawl's state as a newly built CrawlStatus.
+ return new CrawlStatus(manager.myAddress, crawlDefinition.entry_url, session); // built from the manager's address, the entry URL and the session
}
/**
@@ -510,7 +503,7 @@ private boolean script(EventEnum event, CurrentURI currentURI)
objects.put("driver", driver);
if (currentURI != null)
objects.put("current", currentURI);
- final ScriptRunThread scriptRunThread = webCrawlerManager.scriptManager.runSync(script.name, objects);
+ final ScriptRunThread scriptRunThread = manager.scriptManager.runSync(script.name, objects);
if (scriptRunThread.getException() != null)
throw new ServerException(scriptRunThread.getException());
return true;
@@ -519,7 +512,7 @@ private boolean script(EventEnum event, CurrentURI currentURI)
}
}
- private void runner()
+ protected void runner()
throws URISyntaxException, IOException, ScriptException, ServerException, ReflectiveOperationException,
NoSuchAlgorithmException, KeyStoreException, KeyManagementException, InterruptedException {
try {
@@ -547,17 +540,6 @@ else if (crawlDefinition.entry_request != null)
}
}
- @Override
- final public void run() {
- try {
- runner();
- } catch (Exception e) {
- LOGGER.error(e.getMessage(), e);
- } finally {
- webCrawlerManager.removeSession(this);
- }
- }
-
private abstract class CrawlProvider {
protected final URI uri;
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java
index 1ba9653..d73fffb 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerManager.java
@@ -16,43 +16,29 @@
package com.qwazr.crawler.web;
import com.qwazr.cluster.ClusterManager;
+import com.qwazr.crawler.common.CrawlManager;
import com.qwazr.scripts.ScriptManager;
import com.qwazr.server.ApplicationBuilder;
import com.qwazr.server.GenericServer;
-import com.qwazr.server.ServerException;
-import com.qwazr.utils.LockUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import javax.ws.rs.core.Response.Status;
import java.io.IOException;
import java.net.URISyntaxException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
-public class WebCrawlerManager {
+public class WebCrawlerManager extends CrawlManager {
private static final Logger LOGGER = LoggerFactory.getLogger(WebCrawlerManager.class);
- final String myAddress;
- final ClusterManager clusterManager;
final ScriptManager scriptManager;
- private final ExecutorService executorService;
-
- private final LockUtils.ReadWriteLock rwlSessionMap = new LockUtils.ReadWriteLock();
- private final HashMap crawlSessionMap;
private WebCrawlerServiceImpl service;
public WebCrawlerManager(final ClusterManager clusterManager, final ScriptManager scriptManager,
final ExecutorService executor) throws IOException, URISyntaxException {
+ super(clusterManager, executor, LOGGER);
this.scriptManager = scriptManager;
- this.clusterManager = clusterManager;
- myAddress = clusterManager.getServiceBuilder().local().getStatus().me;
- this.executorService = executor;
- crawlSessionMap = new HashMap<>();
service = new WebCrawlerServiceImpl(this);
}
@@ -70,56 +56,9 @@ public WebCrawlerServiceInterface getService() {
return service;
}
- TreeMap getSessions() {
- return rwlSessionMap.read(() -> {
- final TreeMap map = new TreeMap<>();
- for (Map.Entry entry : crawlSessionMap.entrySet())
- map.put(entry.getKey(), entry.getValue().getStatus());
- return map;
- });
- }
-
- WebCrawlStatus getSession(final String sessionName) {
- return rwlSessionMap.read(() -> {
- final WebCrawlThread crawlThread = crawlSessionMap.get(sessionName);
- if (crawlThread == null)
- return null;
- return crawlThread.getStatus();
- });
- }
-
- void abortSession(final String sessionName, final String abortingReason) throws ServerException {
- rwlSessionMap.readEx(() -> {
- final WebCrawlThread crawlThread = crawlSessionMap.get(sessionName);
- if (crawlThread == null)
- throw new ServerException(Status.NOT_FOUND, "Session not found: " + sessionName);
- if (LOGGER.isInfoEnabled())
- LOGGER.info("Aborting session: " + sessionName + " - " + abortingReason);
- crawlThread.abort(abortingReason);
- });
- }
-
- WebCrawlStatus runSession(final String sessionName, final WebCrawlDefinition crawlJson) throws ServerException {
- return rwlSessionMap.writeEx(() -> {
- if (crawlSessionMap.containsKey(sessionName))
- throw new ServerException(Status.CONFLICT, "The session already exists: " + sessionName);
- if (LOGGER.isInfoEnabled())
- LOGGER.info("Create session: " + sessionName);
-
- WebCrawlThread crawlThread = new WebCrawlThread(this, sessionName, crawlJson);
- crawlSessionMap.put(sessionName, crawlThread);
- executorService.execute(crawlThread);
- return crawlThread.getStatus();
- });
- }
-
- void removeSession(final WebCrawlThread crawlThread) {
- rwlSessionMap.writeEx(() -> {
- final String sessionName = crawlThread.getSessionName();
- if (LOGGER.isInfoEnabled())
- LOGGER.info("Remove session: " + sessionName);
- crawlSessionMap.remove(sessionName, crawlThread);
- });
+ @Override
+ protected WebCrawlThread newCrawlThread(String sessionName, WebCrawlDefinition crawlDef) { // Creates the web crawl thread for the given session name and definition.
+ return new WebCrawlThread(this, sessionName, crawlDef); // the constructor throws ServerException, e.g. when browser_type is missing
}
}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java
index 9d68547..0f1aa02 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerMultiClient.java
@@ -1,5 +1,5 @@
-/**
- * Copyright 2014-2016 Emmanuel Keller / QWAZR
+/*
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
**/
package com.qwazr.crawler.web;
-import com.qwazr.utils.ExceptionUtils;
-import com.qwazr.utils.http.HttpResponseEntityException;
-import com.qwazr.server.client.JsonMultiClientAbstract;
+import com.qwazr.crawler.common.CrawlStatus;
import com.qwazr.server.RemoteService;
import com.qwazr.server.ServerException;
+import com.qwazr.server.client.JsonMultiClientAbstract;
+import com.qwazr.utils.ExceptionUtils;
+import com.qwazr.utils.http.HttpResponseEntityException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,13 +45,13 @@ protected WebCrawlerSingleClient newClient(final RemoteService remote) {
}
@Override
- public TreeMap getSessions(String group) {
+ public TreeMap getSessions(String group) {
// We merge the result of all the nodes
- TreeMap globalSessions = new TreeMap();
+ TreeMap globalSessions = new TreeMap();
for (WebCrawlerSingleClient client : this) {
try {
- TreeMap localSessions = client.getSessions(group);
+ TreeMap localSessions = client.getSessions(group);
if (localSessions == null)
continue;
globalSessions.putAll(localSessions);
@@ -62,7 +63,7 @@ public TreeMap getSessions(String group) {
}
@Override
- public WebCrawlStatus getSession(String session_name, String group) {
+ public CrawlStatus getSession(String session_name, String group) {
for (WebCrawlerSingleClient client : this) {
try {
@@ -93,7 +94,7 @@ public Response abortSession(String session_name, String reason, String group) {
}
@Override
- public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
+ public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
final ExceptionUtils.Holder exceptionHolder = new ExceptionUtils.Holder(logger);
for (WebCrawlerSingleClient client : this) {
try {
@@ -112,7 +113,7 @@ public WebCrawlStatus runSession(final String session_name, final WebCrawlDefini
}
@Override
- public WebCrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException {
+ public CrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException {
return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition));
}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java
index 30f8ace..986b3d3 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceImpl.java
@@ -15,6 +15,7 @@
**/
package com.qwazr.crawler.web;
+import com.qwazr.crawler.common.CrawlStatus;
import com.qwazr.server.AbstractServiceImpl;
import com.qwazr.server.ServerException;
import org.slf4j.Logger;
@@ -45,7 +46,7 @@ public void init() {
}
@Override
- public TreeMap getSessions(final String group) {
+ public TreeMap getSessions(final String group) {
// Read the sessions in the local node
if (!webrawlerManager.clusterManager.isGroup(group))
return new TreeMap<>();
@@ -53,10 +54,10 @@ public TreeMap getSessions(final String group) {
}
@Override
- public WebCrawlStatus getSession(final String session_name, final String group) {
+ public CrawlStatus getSession(final String session_name, final String group) {
try {
- final WebCrawlStatus status =
- webrawlerManager.clusterManager.isGroup(group) ? webrawlerManager.getSession(session_name) : null;
+ final CrawlStatus status = webrawlerManager.clusterManager.isGroup(group) ? webrawlerManager.getSession(
+ session_name) : null;
if (status != null)
return status;
throw new ServerException(Status.NOT_FOUND, "Session not found");
@@ -78,7 +79,7 @@ public Response abortSession(final String session_name, final String reason, fin
}
@Override
- public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
+ public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
try {
return webrawlerManager.runSession(session_name, crawlDefinition);
} catch (ServerException e) {
@@ -86,7 +87,7 @@ public WebCrawlStatus runSession(final String session_name, final WebCrawlDefini
}
}
- public WebCrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException {
+ public CrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException {
return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition));
}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java
index 3428cce..4c61401 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerServiceInterface.java
@@ -16,6 +16,7 @@
package com.qwazr.crawler.web;
import com.fasterxml.jackson.core.type.TypeReference;
+import com.qwazr.crawler.common.CrawlStatus;
import com.qwazr.server.ServiceInterface;
import javax.annotation.security.RolesAllowed;
@@ -40,12 +41,12 @@ public interface WebCrawlerServiceInterface extends ServiceInterface {
@GET
@Path("/sessions")
@Produces(ServiceInterface.APPLICATION_JSON_UTF8)
- TreeMap getSessions(@QueryParam("group") String group);
+ TreeMap getSessions(@QueryParam("group") String group);
@GET
@Path("/sessions/{session_name}")
@Produces(ServiceInterface.APPLICATION_JSON_UTF8)
- WebCrawlStatus getSession(@PathParam("session_name") String session_name, @QueryParam("group") String group);
+ CrawlStatus getSession(@PathParam("session_name") String session_name, @QueryParam("group") String group);
@DELETE
@Path("/sessions/{session_name}")
@@ -56,12 +57,12 @@ Response abortSession(@PathParam("session_name") String session_name, @QueryPara
@Path("/sessions/{session_name}")
@Consumes(ServiceInterface.APPLICATION_JSON_UTF8)
@Produces(ServiceInterface.APPLICATION_JSON_UTF8)
- WebCrawlStatus runSession(@PathParam("session_name") String session_name, WebCrawlDefinition crawlDefinition);
+ CrawlStatus runSession(@PathParam("session_name") String session_name, WebCrawlDefinition crawlDefinition);
- WebCrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException;
+ CrawlStatus runSession(String session_name, String jsonCrawlDefinition) throws IOException;
- TypeReference> TreeMapStringCrawlTypeRef =
- new TypeReference>() {
+ TypeReference> TreeMapStringCrawlTypeRef =
+ new TypeReference>() {
};
}
diff --git a/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java b/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java
index 65a08f4..a4dfa78 100644
--- a/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java
+++ b/src/main/java/com/qwazr/crawler/web/WebCrawlerSingleClient.java
@@ -1,5 +1,5 @@
-/**
- * Copyright 2014-2016 Emmanuel Keller / QWAZR
+/*
+ * Copyright 2015-2017 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
**/
package com.qwazr.crawler.web;
+import com.qwazr.crawler.common.CrawlStatus;
import com.qwazr.server.RemoteService;
import com.qwazr.server.client.JsonClientAbstract;
import com.qwazr.utils.UBuilder;
@@ -31,19 +32,19 @@ class WebCrawlerSingleClient extends JsonClientAbstract implements WebCrawlerSer
}
@Override
- public TreeMap getSessions(final String group) {
- final UBuilder uriBuilder =
- RemoteService.getNewUBuilder(remote, "/crawler/web/sessions").setParameter("group", group);
+ public TreeMap getSessions(final String group) {
+ final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions").setParameter("group",
+ group);
HttpRequest request = HttpRequest.Get(uriBuilder.buildNoEx());
return executeJson(request, null, null, TreeMapStringCrawlTypeRef, valid200Json);
}
@Override
- public WebCrawlStatus getSession(final String session_name, final String group) {
+ public CrawlStatus getSession(final String session_name, final String group) {
final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions/", session_name)
.setParameter("group", group);
HttpRequest request = HttpRequest.Get(uriBuilder.buildNoEx());
- return executeJson(request, null, null, WebCrawlStatus.class, valid200Json);
+ return executeJson(request, null, null, CrawlStatus.class, valid200Json);
}
@Override
@@ -57,14 +58,14 @@ public Response abortSession(final String session_name, final String reason, fin
}
@Override
- public WebCrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
+ public CrawlStatus runSession(final String session_name, final WebCrawlDefinition crawlDefinition) {
final UBuilder uriBuilder = RemoteService.getNewUBuilder(remote, "/crawler/web/sessions/", session_name);
final HttpRequest request = HttpRequest.Post(uriBuilder.buildNoEx());
- return executeJson(request, crawlDefinition, null, WebCrawlStatus.class, valid200202Json);
+ return executeJson(request, crawlDefinition, null, CrawlStatus.class, valid200202Json);
}
@Override
- public WebCrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException {
+ public CrawlStatus runSession(final String session_name, final String jsonCrawlDefinition) throws IOException {
return runSession(session_name, WebCrawlDefinition.newInstance(jsonCrawlDefinition));
}
diff --git a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java
index 89d1171..f59b6f3 100644
--- a/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java
+++ b/src/test/java/com/qwazr/crawler/web/test/WebCrawlerTest.java
@@ -15,8 +15,8 @@
**/
package com.qwazr.crawler.web.test;
+import com.qwazr.crawler.common.CrawlStatus;
import com.qwazr.crawler.web.WebCrawlDefinition;
-import com.qwazr.crawler.web.WebCrawlStatus;
import com.qwazr.crawler.web.WebCrawlerServer;
import com.qwazr.crawler.web.WebCrawlerServiceBuilder;
import com.qwazr.crawler.web.WebCrawlerServiceInterface;
@@ -31,8 +31,8 @@
import org.junit.Test;
import org.junit.runners.MethodSorters;
-import java.util.TreeMap;
import java.util.HashMap;
+import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
@@ -62,7 +62,7 @@ public void test100startServer() throws Exception {
@Test
public void test200emptySessions() {
- TreeMap sessions = remote.getSessions(null);
+ TreeMap sessions = remote.getSessions(null);
Assert.assertNotNull(sessions);
Assert.assertTrue(sessions.isEmpty());
}
@@ -81,7 +81,7 @@ private WebCrawlDefinition getNewWebCrawl() {
private void crawlWait(final String sessionName, final int crawlCount) throws InterruptedException {
WaitFor.of().timeOut(TimeUnit.MINUTES, 2).until(() -> {
- WebCrawlStatus status = ErrorWrapper.bypass(() -> remote.getSession(sessionName, null), 404);
+ CrawlStatus status = ErrorWrapper.bypass(() -> remote.getSession(sessionName, null), 404);
if (status == null)
return false;
return status.urls.crawled == crawlCount;
@@ -101,7 +101,7 @@ public void test400CrawlEvent() throws InterruptedException {
final WebCrawlDefinition webCrawl = getNewWebCrawl();
webCrawl.scripts = new HashMap<>();
webCrawl.scripts.put(WebCrawlDefinition.EventEnum.before_crawl,
- new WebCrawlDefinition.Script(BeforeCrawl.class.getName()));
+ new WebCrawlDefinition.Script(BeforeCrawl.class.getName()));
remote.runSession(sessionName, webCrawl);
crawlWait(sessionName, 3);
Assert.assertEquals(4, BeforeCrawl.count.get());