Skip to content

Commit

Permalink
Move UBuilder from utils to crawler, fix casting issues
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Aug 30, 2017
1 parent 41aa47d commit d64ec31
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 18 deletions.
32 changes: 18 additions & 14 deletions src/main/java/com/qwazr/crawler/common/CrawlDefinition.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,14 @@ public static abstract class AbstractBuilder<D extends CrawlDefinition, B extend
protected LinkedHashSet<String> inclusionPatterns;
protected LinkedHashSet<String> exclusionPatterns;

protected AbstractBuilder() {
private final Class<B> builderClass;

protected AbstractBuilder(final Class<B> builderClass) {
this.builderClass = builderClass;
}

protected AbstractBuilder(D src) {
protected AbstractBuilder(final Class<B> builderClass, D src) {
this(builderClass);
variables = src.variables == null ? null : new LinkedHashMap<>(src.variables);
scripts = src.scripts == null ? null : new LinkedHashMap<>(src.scripts);
inclusionPatterns = src.inclusionPatterns == null || src.inclusionPatterns.isEmpty() ?
Expand All @@ -140,72 +144,72 @@ public B variable(final String name, final String value) {
if (variables == null)
variables = new LinkedHashMap<>();
variables.put(name, value);
return (B) this;
return builderClass.cast(this);
}

public B script(final EventEnum event, final ScriptDefinition script) {
if (scripts == null)
scripts = new LinkedHashMap<>();
scripts.put(event, script);
return (B) this;
return builderClass.cast(this);
}

public B setInclusionPatterns(final Collection<String> inclusionPatterns) {
this.inclusionPatterns = inclusionPatterns == null || inclusionPatterns.isEmpty() ?
null :
new LinkedHashSet<>(inclusionPatterns);
return (B) this;
return builderClass.cast(this);
}

public B setInclusionPatterns(final String inclusionPatternText) throws IOException {
if (StringUtils.isBlank(inclusionPatternText)) {
inclusionPatterns = null;
return (B) this;
return builderClass.cast(this);
}
if (inclusionPatterns != null)
inclusionPatterns.clear();
else
inclusionPatterns = new LinkedHashSet<>();
StringUtils.linesCollector(inclusionPatternText, false, inclusionPatterns);
return (B) this;
return builderClass.cast(this);
}

public B addInclusionPattern(final String inclusionPattern) {
if (StringUtils.isBlank(inclusionPattern))
return (B) this;
return builderClass.cast(this);
if (inclusionPatterns == null)
inclusionPatterns = new LinkedHashSet<>();
inclusionPatterns.add(inclusionPattern);
return (B) this;
return builderClass.cast(this);
}

public B setExclusionPatterns(final Collection<String> exclusionPatterns) {
this.exclusionPatterns = exclusionPatterns == null || exclusionPatterns.isEmpty() ?
null :
new LinkedHashSet<>(exclusionPatterns);
return (B) this;
return builderClass.cast(this);
}

public B setExclusionPatterns(final String exclusionPatternText) throws IOException {
if (StringUtils.isBlank(exclusionPatternText)) {
exclusionPatterns = null;
return (B) this;
return builderClass.cast(this);
}
if (exclusionPatterns != null)
exclusionPatterns.clear();
else
exclusionPatterns = new LinkedHashSet<>();
StringUtils.linesCollector(exclusionPatternText, false, exclusionPatterns);
return (B) this;
return builderClass.cast(this);
}

public B addExclusionPattern(final String exclusionPattern) {
if (StringUtils.isBlank(exclusionPattern))
return (B) this;
return builderClass.cast(this);
if (exclusionPatterns == null)
exclusionPatterns = new LinkedHashSet<>();
exclusionPatterns.add(exclusionPattern);
return (B) this;
return builderClass.cast(this);
}

public abstract D build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,11 @@ public static class Builder extends AbstractBuilder<FileCrawlDefinition, Builder
private Integer crawlWaitMs;

protected Builder() {
super(Builder.class);
}

protected Builder(FileCrawlDefinition crawlDefinition) {
super(crawlDefinition);
super(Builder.class, crawlDefinition);
entryPath = crawlDefinition.entryPath;
maxDepth = crawlDefinition.maxDepth;
crawlWaitMs = crawlDefinition.crawlWaitMs;
Expand Down
53 changes: 53 additions & 0 deletions src/main/java/com/qwazr/crawler/web/UBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright 2015-2017 Emmanuel Keller / QWAZR
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.qwazr.crawler.web;

import com.qwazr.utils.RegExpUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URIBuilder;

import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;

/**
 * {@link URIBuilder} extension adding two regex-driven clean-up operations:
 * dropping query parameters and stripping fragments of the path.
 * <p>
 * Both operations are no-ops when no matcher is supplied.
 */
public class UBuilder extends URIBuilder {

    /**
     * Creates a builder initialised from the given URI.
     *
     * @param uri the URI handed to the {@link URIBuilder} constructor
     */
    public UBuilder(URI uri) {
        super(uri);
    }

    /**
     * Rebuilds the query string, keeping only the parameters for which
     * {@code RegExpUtils.anyMatch} reports no match.
     * <p>
     * Each parameter is tested as the text {@code name + "=" + value}.
     * NOTE(review): a parameter without a value is therefore tested as
     * {@code "name=null"} -- confirm this is what the patterns expect.
     *
     * @param matcherList the matchers to test each parameter against; when
     *                    {@code null} or empty the query is left untouched
     */
    public final void removeMatchingParameters(final Collection<Matcher> matcherList) {
        if (hasNoMatchers(matcherList)) {
            return;
        }
        // Assumes getQueryParams() hands back a list that clearParameters()
        // does not empty -- TODO confirm against the HttpClient version in use.
        final List<NameValuePair> previous = getQueryParams();
        if (previous == null || previous.isEmpty()) {
            return;
        }
        clearParameters();
        for (final NameValuePair pair : previous) {
            final String name = pair.getName();
            final String value = pair.getValue();
            final boolean matched = RegExpUtils.anyMatch(name + "=" + value, matcherList);
            if (!matched) {
                addParameter(name, value);
            }
        }
    }

    /**
     * Replaces the path with the result of
     * {@code RegExpUtils.removeAllMatches(path, matcherList)}.
     *
     * @param matcherList the matchers passed to {@code removeAllMatches}; when
     *                    {@code null} or empty the path is left untouched
     */
    public final void cleanPath(final Collection<Matcher> matcherList) {
        if (hasNoMatchers(matcherList)) {
            return;
        }
        final String currentPath = getPath();
        if (currentPath == null || currentPath.isEmpty()) {
            return;
        }
        final String cleanedPath = RegExpUtils.removeAllMatches(currentPath, matcherList);
        setPath(cleanedPath);
    }

    /** Tells whether the given matcher collection is {@code null} or empty. */
    private static boolean hasNoMatchers(final Collection<Matcher> matchers) {
        return matchers == null || matchers.isEmpty();
    }
}
3 changes: 2 additions & 1 deletion src/main/java/com/qwazr/crawler/web/WebCrawlDefinition.java
Original file line number Diff line number Diff line change
Expand Up @@ -456,10 +456,11 @@ public static class Builder extends AbstractBuilder<WebCrawlDefinition, Builder>
private Integer pageLoadTimeout;

protected Builder() {
super(Builder.class);
}

protected Builder(WebCrawlDefinition src) {
super(src);
super(Builder.class, src);
this.preUrl = src.preUrl;
this.entryUrl = src.entryUrl;
this.entryRequest = src.entryRequest;
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/com/qwazr/crawler/web/WebCrawlThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import com.qwazr.utils.LoggerUtils;
import com.qwazr.utils.RegExpUtils;
import com.qwazr.utils.TimeTracker;
import com.qwazr.utils.UBuilder;
import org.apache.commons.lang3.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
Expand Down Expand Up @@ -117,7 +116,7 @@ public class WebCrawlThread extends CrawlThread<WebCrawlDefinition, WebCrawlStat
* @return
*/
private URI checkLink(final URI uri) {
UBuilder uriBuilder = new UBuilder(uri);
final UBuilder uriBuilder = new UBuilder(uri);
if (crawlDefinition.removeFragments != null && crawlDefinition.removeFragments)
uriBuilder.setFragment(null);
if (parametersMatcherList != null && !parametersMatcherList.isEmpty())
Expand Down

0 comments on commit d64ec31

Please sign in to comment.