Skip to content

Commit

Permalink
Merge pull request #125 from sul-dlss/resume
Browse files Browse the repository at this point in the history
Added ability to resume download. Added support for Java 1.8.
  • Loading branch information
justinlittman committed Dec 17, 2018
2 parents 6f15979 + 6feeddb commit ae826c1
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 24 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
language: java

jdk:
- openjdk7
- openjdk8

script: ./gradlew check

Expand Down
13 changes: 10 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,22 @@ dependencies {
compile 'org.apache.httpcomponents:httpclient:4.5.3'
compile 'org.apache.httpcomponents:httpcore:4.4.6'

// JAXB
compile 'javax.xml.bind:jaxb-api:2.3.0'
compile 'com.sun.xml.bind:jaxb-core:2.3.0'
compile 'com.sun.xml.bind:jaxb-impl:2.3.0'

//Unit testing framework.
testCompile 'junit:junit:4.12'

//For creating mock objects in unit tests
testCompile "org.mockito:mockito-core:2.+"
// See https://github.com/powermock/powermock/wiki/Mockito#supported-versions
testCompile 'org.mockito:mockito-core:2.23.0+'

// need PowerMock for mocking constructors (local objects within methods)
testCompile "org.powermock:powermock-module-junit4:1.7+"
testCompile "org.powermock:powermock-api-mockito2:1.7+"
testCompile "org.powermock:powermock-module-junit4:2.0.0-RC.4"
testCompile "org.powermock:powermock-api-mockito2:2.0.0-RC.4"

}

sourceSets {
Expand Down
46 changes: 34 additions & 12 deletions src/edu/stanford/dlss/was/WasapiDownloader.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ WasapiConnection getWasapiConn() throws IOException {
}

// package level method for testing
@SuppressWarnings("checkstyle:MethodLength")
@SuppressWarnings({"checkstyle:MethodLength", "checkstyle:CyclomaticComplexity"})
void downloadAndValidateFile(WasapiFile file) throws NoSuchAlgorithmException {
String fullFilePath = prepareOutputLocation(file);
if (fullFilePath == null) {
Expand All @@ -68,7 +68,11 @@ void downloadAndValidateFile(WasapiFile file) throws NoSuchAlgorithmException {
int numRetries = Integer.parseInt(settings.retries());
int attempts = 0;
boolean checksumValidated = false;
do {
if (tryResume(fullFilePath, file)) {
checksumValidated = true;
System.out.println("file already retrieved: " + file.getLocations()[0]);
}
while (attempts <= numRetries && !checksumValidated) {
attempts++;
try {
boolean downloadSuccess = getWasapiConn().downloadQuery(file.getLocations()[0], fullFilePath);
Expand All @@ -90,12 +94,27 @@ void downloadAndValidateFile(WasapiFile file) throws NoSuchAlgorithmException {
System.err.println("WARNING: exception downloading file (will retry): " + file.getLocations()[0]);
e.printStackTrace(System.err);
}
} while (attempts <= numRetries && !checksumValidated);
}

if (attempts == numRetries + 1) // RE-tries, not number of attempts
System.err.println("file not retrieved or unable to validate checksum: " + file.getLocations()[0]);
}

/**
 * Decides whether a previously downloaded copy of {@code file} can be kept
 * instead of downloading it again.
 *
 * <p>A copy is kept only when resuming is enabled in the settings, something
 * already exists at {@code fullFilePath}, and that copy passes checksum
 * validation. An existing copy that fails validation is deleted so that the
 * caller can download a fresh one.
 *
 * @param fullFilePath local path where the file is (or will be) stored
 * @param file         WASAPI description of the file, including its checksums
 * @return {@code true} if the existing local copy validated and the download can be skipped
 * @throws NoSuchAlgorithmException propagated from checksum validation
 */
boolean tryResume(String fullFilePath, WasapiFile file) throws NoSuchAlgorithmException {
    if (!settings.shouldResume())
        return false;

    File fullFile = new File(fullFilePath);
    if (!fullFile.exists())
        return false;

    if (checksumValidate(settings.checksumAlgorithm(), file, fullFilePath))
        return true;

    // The existing copy is partial or corrupt: remove it before the re-download.
    // File.delete() reports failure only through its return value, so check it
    // rather than silently leaving the bad file in place.
    if (!fullFile.delete() && fullFile.exists())
        System.err.println("WARNING: unable to delete file that failed checksum validation: " + fullFilePath);
    return false;
}

// package level method for testing
String prepareOutputLocation(WasapiFile file) {
String outputPath = settings.outputBaseDir() + "AIT_" + file.getCollectionId() +
Expand All @@ -105,22 +124,25 @@ String prepareOutputLocation(WasapiFile file) {
}

// package level method for testing
boolean checksumValidate(String algorithm, WasapiFile file, String fullFilePath)
throws NoSuchAlgorithmException, IOException {
boolean checksumValidate(String algorithm, WasapiFile file, String fullFilePath) throws NoSuchAlgorithmException {
String checksum = file.getChecksums().get(algorithm);
if (checksum == null) {
System.err.println("No checksum of type: " + algorithm + " available: " + file.getChecksums().toString());
return false;
}

if ("md5".equals(algorithm))
return WasapiValidator.validateMd5(checksum, fullFilePath);
else if ("sha1".equals(algorithm))
return WasapiValidator.validateSha1(checksum, fullFilePath);
else {
System.err.println("Unsupported checksum algorithm: " + algorithm + ". Options are 'md5' or 'sha1'");
return false;
try {
if ("md5".equals(algorithm))
return WasapiValidator.validateMd5(checksum, fullFilePath);
else if ("sha1".equals(algorithm))
return WasapiValidator.validateSha1(checksum, fullFilePath);
else {
System.err.println("Unsupported checksum algorithm: " + algorithm + ". Options are 'md5' or 'sha1'");
}
} catch (IOException e) {
// Somethings wrong, so fail validate
}
return false;
}

private List<Integer> desiredCrawlIds(WasapiCrawlSelector crawlSelector) {
Expand Down
14 changes: 11 additions & 3 deletions src/edu/stanford/dlss/was/WasapiDownloaderSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import org.apache.commons.validator.routines.IntegerValidator;
import org.apache.commons.validator.routines.UrlValidator;

@SuppressWarnings({"checkstyle:ClassDataAbstractionCoupling", "checkstyle:ClassFanOutComplexity", "checkstyle:LineLength"})
@SuppressWarnings({"checkstyle:ClassDataAbstractionCoupling", "checkstyle:ClassFanOutComplexity", "checkstyle:LineLength", "checkstyle:MethodCount"})
public class WasapiDownloaderSettings {
// to add a new setting:
// * add a String constant for the setting/arg name
Expand All @@ -47,6 +47,7 @@ public class WasapiDownloaderSettings {
public static final String PASSWORD_PARAM_NAME = "password";
public static final String RETRIES_PARAM_NAME = "retries";
public static final String USERNAME_PARAM_NAME = "username";
public static final String RESUME_PARAM_NAME = "resume";

protected PrintStream errStream = System.err;
protected Properties settings;
Expand All @@ -70,7 +71,8 @@ public class WasapiDownloaderSettings {
buildArgOption(OUTPUT_BASE_DIR_PARAM_NAME, "destination directory for downloaded files (expects ending slash)"),
buildArgOption(PASSWORD_PARAM_NAME, "password for WASAPI server login"),
buildArgOption(RETRIES_PARAM_NAME, "how many times to retry a download for each file (retries + 1 = total tries)"),
buildArgOption(USERNAME_PARAM_NAME, "username for WASAPI server login")
buildArgOption(USERNAME_PARAM_NAME, "username for WASAPI server login"),
Option.builder().longOpt(RESUME_PARAM_NAME).desc("Skip files that have been successfully downloaded").build()
};

static {
Expand Down Expand Up @@ -161,6 +163,9 @@ public String username() {
return settings.getProperty(USERNAME_PARAM_NAME);
}

/**
 * Reports whether the resume option is switched on.
 *
 * @return {@code true} when a {@code resume} property is present in the
 *         settings, whatever its value; {@code false} when it is absent
 */
public boolean shouldResume() {
    String resumeSetting = settings.getProperty(RESUME_PARAM_NAME);
    return resumeSetting != null;
}

public String getHelpAndSettingsMessage() {
if (helpAndSettingsMessage == null)
Expand Down Expand Up @@ -326,7 +331,10 @@ private void addParsedArgsToSettings(CommandLine parsedArgs) {
}

if (parsedArgs.hasOption(HELP_PARAM_NAME))
settings.setProperty(HELP_PARAM_NAME, "true");
settings.setProperty(HELP_PARAM_NAME, Boolean.TRUE.toString());

if (parsedArgs.hasOption(RESUME_PARAM_NAME))
settings.setProperty(RESUME_PARAM_NAME, Boolean.TRUE.toString());
}

private void parseArgsIntoSettings(String[] args) throws ParseException {
Expand Down
6 changes: 4 additions & 2 deletions test/edu/stanford/dlss/was/TestWasapiDownloaderSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void constructor_readsPropertiesFileAndArgs() throws SettingsLoadExceptio
// the below args array would come from something like:
// wasapi-downloader -h --collectionId 123 --crawlId=456 --crawlStartAfter 2014-03-14 --crawlStartBefore=2017-03-14
// The Apache CLI parser should be able to handle all of those different styles of argument without any trouble.
String[] args = { "-h", "--collectionId", "123", "--crawlId=456", "--crawlStartAfter", "2014-03-14", "--crawlStartBefore=2017-03-14" };
String[] args = { "-h", "--collectionId", "123", "--crawlId=456", "--crawlStartAfter", "2014-03-14", "--crawlStartBefore=2017-03-14", "--resume" };
WasapiDownloaderSettings settings = new WasapiDownloaderSettings(WasapiDownloader.SETTINGS_FILE_LOCATION, args);

assertEquals("baseurl value should have come from settings file", "https://example.org/", settings.baseUrlString());
Expand All @@ -43,6 +43,7 @@ public void constructor_readsPropertiesFileAndArgs() throws SettingsLoadExceptio
assertEquals("crawlStartAfter value should have come from args", "2014-03-14", settings.crawlStartAfter());
assertEquals("crawlStartBefore value should have come from args", "2017-03-14", settings.crawlStartBefore());
assertTrue("shouldDisplayHelp value should have come from args", settings.shouldDisplayHelp());
assertTrue("shouldResume value should have come from args", settings.shouldResume());
}

@Test
Expand All @@ -51,7 +52,7 @@ public void getHelpAndSettingsMessage_containsUsageAndSettingsInfo() throws Sett
//TODO: if settings validation flags possibly nonsensical/redundant combos like crawlId and crawlIdLowerBound,
// then this test might have to be broken up a bit.
String[] args = { "-h", "--collectionId", "123", "--crawlId=456", "--crawlIdLowerBound=400",
"--crawlStartAfter", "2014-03-14", "--crawlStartBefore=2017-03-14", "--filename=filename.warc.gz" };
"--crawlStartAfter", "2014-03-14", "--crawlStartBefore=2017-03-14", "--filename=filename.warc.gz", "--resume" };
WasapiDownloaderSettings settings = new WasapiDownloaderSettings(WasapiDownloader.SETTINGS_FILE_LOCATION, args);

String helpAndSettingsMsg = settings.getHelpAndSettingsMessage();
Expand All @@ -73,6 +74,7 @@ public void getHelpAndSettingsMessage_containsUsageAndSettingsInfo() throws Sett
assertThat("helpAndSettingsMsg lists password arg", helpAndSettingsMsg, containsString("--password <arg>"));
assertThat("helpAndSettingsMsg lists retries arg", helpAndSettingsMsg, containsString("--retries <arg>"));
assertThat("helpAndSettingsMsg lists username arg", helpAndSettingsMsg, containsString("--username <arg>"));
assertThat("helpAndSettingsMsg lists resume flag", helpAndSettingsMsg, containsString("--resume"));

// values
assertThat("helpAndSettingsMsg lists accountId value", helpAndSettingsMsg, containsString("accountId : 1"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,22 @@
import java.io.IOException;
import java.io.PrintStream;
import java.security.NoSuchAlgorithmException;
import java.io.File;

import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpResponseException;
import org.hamcrest.core.StringStartsWith;
import org.junit.*;
import org.mockito.Mockito;
import org.junit.runner.RunWith;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;

/**
* WasapiDownloader tests for downloadAndValidateFile() method
*/
@RunWith(PowerMockRunner.class)
@PrepareForTest({WasapiDownloader.class, WasapiConnection.class})
@SuppressWarnings({"TypeName", "MethodLength"})
public class TestWasapiDownloader_DownloadAndValidateFile {

Expand Down Expand Up @@ -306,6 +312,117 @@ public void downloadAndValidateFile_IOException_handled() throws Exception{
assertThat("SYSERR should have stacktrace", errContent.toString(), containsString("java.io.IOException: reason"));
}

/**
 * When tryResume() reports that the local copy is already valid, the file must
 * not be downloaded again and only the "already retrieved" message is printed.
 */
@Test
public void downloadAndValidateFile_resume_skip() throws Exception {
    WasapiFile wfile = new WasapiFile();
    String firstLocation = "out there";
    String[] locations = new String[]{firstLocation};
    wfile.setLocations(locations);
    String fullFilePath = "somewhere";

    WasapiConnection mockConn = Mockito.mock(WasapiConnection.class);

    WasapiDownloader downloaderSpy = Mockito.spy(new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null));
    Mockito.doReturn(fullFilePath).when(downloaderSpy).prepareOutputLocation(wfile);
    Mockito.doReturn(true).when(downloaderSpy).tryResume(fullFilePath, wfile);
    Mockito.doReturn(mockConn).when(downloaderSpy).getWasapiConn();

    // Capture console output only for the duration of the call under test and
    // always restore the real streams, so a redirected System.out/System.err
    // does not leak into whatever runs next in the same JVM.
    PrintStream originalOut = System.out;
    PrintStream originalErr = System.err;
    ByteArrayOutputStream outContent = new ByteArrayOutputStream();
    ByteArrayOutputStream errContent = new ByteArrayOutputStream();
    try {
        System.setOut(new PrintStream(outContent));
        System.setErr(new PrintStream(errContent));
        downloaderSpy.downloadAndValidateFile(wfile);
    } finally {
        System.setOut(originalOut);
        System.setErr(originalErr);
    }

    verify(mockConn, never()).downloadQuery(firstLocation, fullFilePath);
    verify(downloaderSpy, times(1)).tryResume(fullFilePath, wfile);
    // println() terminates the line with the platform separator, not necessarily "\n"
    assertEquals("Wrong SYSOUT output", "file already retrieved: " + firstLocation + System.lineSeparator(), outContent.toString());
    assertEquals("No SYSERR output for success", "", errContent.toString());
}

// Platform-specific path separator used to build fixture paths.
private static final char SEP = File.separatorChar;
// Relative path to a small WARC fixture file used by the tryResume tests
// (relative to the working directory the tests run in).
private static final String FIXTURE_WARC_PATH = "test" + SEP + "fixtures" + SEP + "small-file.warc.gz";

/**
 * With resuming enabled but nothing present at the target path, tryResume()
 * must return false without attempting checksum validation.
 */
@Test
public void tryResume_no_file() throws Exception {
    WasapiFile wfile = new WasapiFile();
    String firstLocation = "out there";
    String[] locations = new String[]{firstLocation};
    wfile.setLocations(locations);
    String fullFilePath = "somewhere";
    assertFalse("Precondition: no file at target path", new File(fullFilePath).exists());

    WasapiDownloaderSettings settingsSpy = Mockito.spy(defaultSettings());

    WasapiDownloader downloaderSpy = Mockito.spy(new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null));
    downloaderSpy.settings = settingsSpy;

    // Resume must be enabled here: otherwise tryResume() returns before it ever
    // looks at the file system and the missing-file branch is never exercised.
    Mockito.doReturn(true).when(settingsSpy).shouldResume();

    assertFalse("Should not resume when file does not exist", downloaderSpy.tryResume(fullFilePath, wfile));
    verify(downloaderSpy, never()).checksumValidate(defaultSettings().checksumAlgorithm(), wfile, fullFilePath);
}

/**
 * With the settings as loaded from the settings file and no arguments,
 * tryResume() must return false and skip checksum validation even though a
 * file exists at the target path.
 */
@Test
public void tryResume_not_resume() throws Exception {
    String remoteLocation = "out there";
    WasapiFile wfile = new WasapiFile();
    wfile.setLocations(new String[]{remoteLocation});
    String existingFilePath = FIXTURE_WARC_PATH;

    WasapiDownloader spiedDownloader =
        Mockito.spy(new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null));

    boolean resumed = spiedDownloader.tryResume(existingFilePath, wfile);

    assertFalse("Should not resume when not set to resume", resumed);
    String algorithm = defaultSettings().checksumAlgorithm();
    verify(spiedDownloader, never()).checksumValidate(algorithm, wfile, existingFilePath);
}

/**
 * With resuming enabled and an existing file whose checksum does not match,
 * tryResume() must return false and delete the file.
 */
@Test
public void tryResume_validate_fails() throws Exception {
    WasapiFile wfile = new WasapiFile();
    String firstLocation = "out there";
    String[] locations = new String[]{firstLocation};
    wfile.setLocations(locations);
    // Create a tempfile
    File fullFile = File.createTempFile("test", ".warc.gz");
    // Safety net: if the test fails before tryResume() removes the file,
    // do not leave it behind in the temp directory.
    fullFile.deleteOnExit();
    String fullFilePath = fullFile.getAbsolutePath();
    assertTrue("Warc exists before resume", fullFile.exists());

    // Resolve the algorithm once, before any stubbing is in progress.
    String algorithm = defaultSettings().checksumAlgorithm();
    WasapiDownloaderSettings settingsSpy = Mockito.spy(defaultSettings());

    WasapiDownloader downloaderSpy = Mockito.spy(new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null));
    downloaderSpy.settings = settingsSpy;

    Mockito.doReturn(true).when(settingsSpy).shouldResume();
    Mockito.doReturn(false).when(downloaderSpy).checksumValidate(algorithm, wfile, fullFilePath);

    assertFalse("Should not resume when file exists and checksums do not match", downloaderSpy.tryResume(fullFilePath, wfile));
    verify(downloaderSpy, times(1)).checksumValidate(algorithm, wfile, fullFilePath);
    assertFalse("Warc deleted", fullFile.exists());
}

/**
 * With resuming enabled and an existing file whose checksum validates,
 * tryResume() must return true after validating exactly once.
 */
@Test
public void tryResume_success() throws Exception {
    String remoteLocation = "out there";
    WasapiFile wfile = new WasapiFile();
    wfile.setLocations(new String[]{remoteLocation});
    String existingFilePath = FIXTURE_WARC_PATH;

    String algorithm = defaultSettings().checksumAlgorithm();
    WasapiDownloaderSettings spiedSettings = Mockito.spy(defaultSettings());
    Mockito.doReturn(true).when(spiedSettings).shouldResume();

    WasapiDownloader spiedDownloader =
        Mockito.spy(new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null));
    spiedDownloader.settings = spiedSettings;
    Mockito.doReturn(true).when(spiedDownloader).checksumValidate(algorithm, wfile, existingFilePath);

    boolean resumed = spiedDownloader.tryResume(existingFilePath, wfile);

    assertTrue("Should resume when file exists and checksums match", resumed);
    verify(spiedDownloader, times(1)).checksumValidate(algorithm, wfile, existingFilePath);
}

/**
 * Builds a fresh settings object from the downloader's settings file with no
 * command line arguments.
 *
 * @return a new settings instance on every call
 * @throws SettingsLoadException propagated from the settings constructor
 */
private WasapiDownloaderSettings defaultSettings() throws SettingsLoadException {
    String[] noArgs = null;
    return new WasapiDownloaderSettings(WasapiDownloader.SETTINGS_FILE_LOCATION, noArgs);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
import org.mockito.Mockito;
import org.powermock.api.mockito.PowerMockito;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.core.classloader.annotations.PowerMockIgnore;
import org.powermock.modules.junit4.PowerMockRunner;

@RunWith(PowerMockRunner.class)
@PrepareForTest({WasapiDownloader.class, WasapiValidator.class})
@PowerMockIgnore({"com.sun.org.apache.xerces.*", "javax.xml.*", "org.xml.*", "org.w3c.dom.*"})
@SuppressWarnings("TypeName")
/**
* Tests for WasapiDownloader that require PowerMock
Expand Down Expand Up @@ -176,7 +178,7 @@ public void checksumValidate_md5_calls_wasapiValidator_validateMd5() throws Sett
WasapiDownloader wd = new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null);
wd.checksumValidate("md5", wfile, anyString());

PowerMockito.verifyStatic();
PowerMockito.verifyStatic(WasapiValidator.class);
WasapiValidator.validateMd5(expectedChecksum, "");
}

Expand All @@ -194,7 +196,7 @@ public void checksumValidate_sha1_calls_wasapiValidator_validateSha1() throws Se
WasapiDownloader wd = new WasapiDownloader(WasapiDownloader.SETTINGS_FILE_LOCATION, null);
wd.checksumValidate("sha1", wfile, anyString());

PowerMockito.verifyStatic();
PowerMockito.verifyStatic(WasapiValidator.class);
WasapiValidator.validateSha1(expectedChecksum, "");
}
}

0 comments on commit ae826c1

Please sign in to comment.