Skip to content

Commit

Permalink
Introduce FileCrawler
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Jun 8, 2017
1 parent b87c83b commit 69f7330
Show file tree
Hide file tree
Showing 18 changed files with 639 additions and 334 deletions.
131 changes: 131 additions & 0 deletions src/main/java/com/qwazr/crawler/common/CrawlDefinition.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/**
* Copyright 2015-2017 Emmanuel Keller / QWAZR
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package com.qwazr.crawler.common;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Base definition of a crawl: a set of global variables plus the scripts to run
 * on the crawl life-cycle events.
 * <p>
 * The public fields are mutable and may be {@code null} when nothing has been defined.
 * Copies are made with the protected copy constructor (used by {@link #clone()}).
 */
@JsonInclude(Include.NON_EMPTY)
public class CrawlDefinition implements Cloneable {

    /**
     * The global variables shared by all the scripts.
     */
    public LinkedHashMap<String, String> variables = null;

    /**
     * The scripts to execute, mapped by the event which fires each script.
     */
    public Map<EventEnum, Script> scripts = null;

    /**
     * The crawl life-cycle events a script can be attached to.
     */
    public enum EventEnum {

        /**
         * Executed before the crawl session starts
         */
        before_session,

        /**
         * Executed after the crawl session ends
         */
        after_session,

        /**
         * Executed before a URL is crawled
         */
        before_crawl,

        /**
         * Executed after a URL has been crawled
         */
        after_crawl
    }

    /**
     * A script reference: the path of the script and the local variables passed to it.
     */
    @JsonInclude(Include.NON_EMPTY)
    public static class Script implements Cloneable {

        /**
         * The path to the script
         */
        public String name = null;

        /**
         * The local variables passed to the script
         */
        public Map<String, String> variables = null;

        /**
         * Creates an empty script definition (no name, no variables).
         */
        public Script() {
        }

        /**
         * Creates a script definition without any variable.
         *
         * @param name the path to the script
         */
        public Script(String name) {
            this.name = name;
        }

        /**
         * Copy constructor. The variables map is copied, so later changes to the
         * copy do not affect {@code src}.
         *
         * @param src the script to copy, must not be null
         */
        protected Script(Script src) {
            this.name = src.name;
            // A LinkedHashMap copy keeps the iteration order of the source map, whatever its type.
            this.variables = src.variables == null ? null : new LinkedHashMap<>(src.variables);
        }

        /**
         * @return a copy of this script built with the copy constructor
         */
        @Override
        final public Object clone() {
            return new Script(this);
        }

        /**
         * Adds (or replaces) a local variable, creating the variables map on first use.
         *
         * @param name  the name of the variable
         * @param value the value of the variable
         * @return this script, to allow call chaining
         */
        public Script addVariable(String name, String value) {
            if (variables == null)
                variables = new HashMap<>();
            variables.put(name, value);
            return this;
        }

    }

    /**
     * Creates an empty crawl definition (no variables, no scripts).
     */
    public CrawlDefinition() {
    }

    /**
     * Copy constructor. The variables map, the scripts map and every script are
     * copied, so later changes to the copy do not affect {@code src}.
     *
     * @param src the definition to copy, must not be null
     */
    protected CrawlDefinition(CrawlDefinition src) {
        variables = src.variables == null ? null : new LinkedHashMap<>(src.variables);
        if (src.scripts == null) {
            scripts = null;
        } else {
            // LinkedHashMap, as in addScript: the copy keeps the iteration order of the source.
            final Map<EventEnum, Script> copy = new LinkedHashMap<>();
            // An event mapped to a null script is kept as is instead of failing the whole copy.
            src.scripts.forEach(
                    (eventEnum, script) -> copy.put(eventEnum, script == null ? null : new Script(script)));
            scripts = copy;
        }
    }

    /**
     * @return a copy of this definition built with the copy constructor
     */
    @Override
    public Object clone() {
        return new CrawlDefinition(this);
    }

    /**
     * Registers a script for the given event, creating the scripts map on first use.
     * A script previously registered for the same event is replaced.
     *
     * @param event the name of the event, which must match an {@link EventEnum} constant
     * @param name  the path to the script
     * @return the newly created script, so that variables can be added to it
     * @throws IllegalArgumentException if {@code event} is not the name of an {@link EventEnum} constant
     * @throws NullPointerException     if {@code event} is null
     */
    @JsonIgnore
    public Script addScript(final String event, final String name) {
        if (scripts == null)
            scripts = new LinkedHashMap<>();
        Script script = new Script(name);
        scripts.put(EventEnum.valueOf(event), script);
        return script;
    }

    /**
     * @return the scripts map itself (not a copy), or null if no script has been defined
     */
    public Map<EventEnum, Script> getScripts() {
        return scripts;
    }

}
87 changes: 87 additions & 0 deletions src/main/java/com/qwazr/crawler/common/CrawlManager.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright 2017 Emmanuel Keller / QWAZR
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.qwazr.crawler.common;

import com.qwazr.cluster.ClusterManager;
import com.qwazr.server.ServerException;
import org.slf4j.Logger;

import javax.ws.rs.core.Response;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Keeps track of the running crawl sessions, indexed by session name, and submits
 * the crawl threads to the executor service.
 *
 * @param <T> the type of crawl thread created by {@link #newCrawlThread(String, CrawlDefinition)}
 * @param <D> the type of crawl definition accepted by this manager
 */
public abstract class CrawlManager<T extends CrawlThread<?>, D extends CrawlDefinition> {

    private final ConcurrentHashMap<String, T> crawlSessionMap;
    private final ExecutorService executorService;
    private final Logger logger;
    public final String myAddress;
    public final ClusterManager clusterManager;

    /**
     * @param clusterManager  the cluster manager, also used to resolve {@link #myAddress}
     * @param executorService the executor which runs the crawl threads
     * @param logger          the logger used to trace the session life-cycle
     */
    protected CrawlManager(final ClusterManager clusterManager, final ExecutorService executorService,
            final Logger logger) {
        crawlSessionMap = new ConcurrentHashMap<>();
        this.clusterManager = clusterManager;
        this.myAddress = clusterManager.getServiceBuilder().local().getStatus().me;
        this.executorService = executorService;
        this.logger = logger;
    }

    /**
     * @return a new map, sorted by session name, holding the current status of every registered session
     */
    public TreeMap<String, CrawlStatus> getSessions() {
        final TreeMap<String, CrawlStatus> map = new TreeMap<>();
        crawlSessionMap.forEach((key, crawl) -> map.put(key, crawl.getStatus()));
        return map;
    }

    /**
     * @param sessionName the name of the session
     * @return the status of the session, or null if no session is registered under this name
     */
    public CrawlStatus getSession(final String sessionName) {
        final T crawlThread = crawlSessionMap.get(sessionName);
        return crawlThread == null ? null : crawlThread.getStatus();
    }

    /**
     * Asks the crawl thread of the given session to abort.
     *
     * @param sessionName    the name of the session
     * @param abortingReason the reason passed to the crawl thread
     * @throws ServerException with status NOT_FOUND if no session is registered under this name
     */
    public void abortSession(final String sessionName, final String abortingReason) throws ServerException {
        final T crawlThread = crawlSessionMap.get(sessionName);
        if (crawlThread == null)
            throw new ServerException(Response.Status.NOT_FOUND, "Session not found: " + sessionName);
        logger.info("Aborting session: {} - {}", sessionName, abortingReason);
        crawlThread.abort(abortingReason);
    }

    /**
     * Builds the crawl thread for a new session. Called by {@link #runSession(String, CrawlDefinition)}
     * only when no session is registered yet under the given name.
     *
     * @param sessionName     the name of the session
     * @param crawlDefinition the definition of the crawl
     * @return the crawl thread to register and execute
     */
    protected abstract T newCrawlThread(final String sessionName, final D crawlDefinition);

    /**
     * Creates a new session, registers it and submits its crawl thread to the executor.
     *
     * @param sessionName     the name of the session
     * @param crawlDefinition the definition of the crawl
     * @return the status of the newly created session
     * @throws ServerException with status CONFLICT if a session is already registered under this name
     */
    public CrawlStatus runSession(final String sessionName, final D crawlDefinition) throws ServerException {

        // Set by the mapping function, which only runs when the session did not exist.
        final AtomicBoolean newThread = new AtomicBoolean(false);

        final T crawlThread = crawlSessionMap.computeIfAbsent(sessionName, key -> {
            logger.info("Create session: {}", sessionName);
            newThread.set(true);
            return newCrawlThread(sessionName, crawlDefinition);
        });

        if (!newThread.get())
            throw new ServerException(Response.Status.CONFLICT, "The session already exists: " + sessionName);
        try {
            executorService.execute(crawlThread);
        } catch (RuntimeException e) {
            // The task was not accepted (e.g. the executor rejected it): unregister the session,
            // otherwise it would stay in the map without ever running and block any retry
            // with a CONFLICT.
            crawlSessionMap.remove(sessionName, crawlThread);
            throw e;
        }
        return crawlThread.getStatus();
    }

    /**
     * Unregisters a session. The entry is removed only if the session name is still
     * mapped to the given crawl thread.
     *
     * @param crawlThread the crawl thread of the session to remove
     */
    public void removeSession(final T crawlThread) {
        logger.info("Remove session: {}", crawlThread.getSessionName());
        crawlSessionMap.remove(crawlThread.getSessionName(), crawlThread);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package com.qwazr.crawler.web;
package com.qwazr.crawler.common;

import com.qwazr.utils.TimeTracker;

import java.util.Map;

public interface CurrentSession {
public interface CrawlSession {

Map<String, Object> getVariables();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/**
/*
* Copyright 2015-2017 Emmanuel Keller / QWAZR
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -13,18 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package com.qwazr.crawler.web;
package com.qwazr.crawler.common;

import com.qwazr.utils.TimeTracker;
import org.apache.commons.collections4.CollectionUtils;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;

class CurrentSessionImpl implements CurrentSession {
public class CrawlSessionImpl<T extends CrawlDefinition> implements CrawlSession {

private final WebCrawlDefinition crawlDefinition;
private final T crawlDefinition;
private final String name;
private final AtomicBoolean abort;
private final TimeTracker timeTracker;
Expand All @@ -37,19 +36,20 @@ class CurrentSessionImpl implements CurrentSession {
private volatile Integer currentDepth = null;
private volatile String abortingReason = null;

CurrentSessionImpl(WebCrawlDefinition crawlDefinition, String name, TimeTracker timeTracker) {
public CrawlSessionImpl(T crawlDefinition, String name) {
this.crawlDefinition = crawlDefinition;
this.timeTracker = timeTracker;
this.timeTracker = new TimeTracker();
this.name = name;
abort = new AtomicBoolean(false);
this.variables = new ConcurrentHashMap<>();
if (crawlDefinition.variables != null)
for (Map.Entry<String, String> entry : crawlDefinition.variables.entrySet())
if (entry.getKey() != null && entry.getValue() != null)
this.variables.put(entry.getKey(), entry.getValue());
if (crawlDefinition.variables != null) {
crawlDefinition.variables.forEach((key, value) -> {
if (key != null && value != null)
this.variables.put(key, value);
});
}
}

@Override
public Map<String, Object> getVariables() {
return variables;
}
Expand All @@ -59,7 +59,10 @@ public Object getVariable(String name) {
return variables.get(name);
}

@Override
/**
* @param name the name of the variable
* @return the value of the variable
*/
public Object setVariable(String name, Object value) {
if (value == null)
return removeVariable(name);
Expand Down Expand Up @@ -104,7 +107,7 @@ public String getAbortingReason() {
return abortingReason;
}

synchronized int incIgnoredCount() {
public synchronized int incIgnoredCount() {
return ++ignoredCount;
}

Expand All @@ -113,7 +116,7 @@ public Integer getIgnoredCount() {
return ignoredCount;
}

synchronized int incCrawledCount() {
public synchronized int incCrawledCount() {
return ++crawledCount;
}

Expand All @@ -122,7 +125,7 @@ public Integer getCrawledCount() {
return crawledCount;
}

synchronized int incErrorCount() {
public synchronized int incErrorCount() {
return ++errorCount;
}

Expand All @@ -146,20 +149,15 @@ public Integer getCurrentDepth() {
return currentDepth;
}

synchronized void setCurrentURI(String currentURI, Integer currentDepth) {
public synchronized void setCurrentURI(String currentURI, Integer currentDepth) {
this.currentURI = currentURI;
this.currentDepth = currentDepth;
}

public WebCrawlDefinition getCrawlDefinition() {
public T getCrawlDefinition() {
return crawlDefinition;
}

public boolean isURLPatterns() {
return crawlDefinition != null && !CollectionUtils.isEmpty(crawlDefinition.inclusion_patterns)
&& !CollectionUtils.isEmpty(crawlDefinition.exclusion_patterns);
}

public TimeTracker getTimeTracker() {
return timeTracker;
}
Expand Down
Loading

0 comments on commit 69f7330

Please sign in to comment.