Skip to content

Commit

Permalink
propagate attachment processing
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark-Kerzner-2 committed Dec 31, 2013
1 parent 1db6afc commit 8efdf38
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 50 deletions.
Expand Up @@ -47,7 +47,7 @@ public AttachmentFileProcessor(String singleFileName, Context context, LuceneInd
* @throws InterruptedException
*/
@Override
public void process(boolean isAttachment, File parent) throws IOException, InterruptedException {
public void process(boolean hasAttachments, File parent) throws IOException, InterruptedException {
String emailPath = getSingleFileName();
String emailName = new File(emailPath).getName();
// if the file already has an extension - then it is an attachment
Expand All @@ -58,8 +58,8 @@ public void process(boolean isAttachment, File parent) throws IOException, Inter
System.out.println("Warning: Processing " + emailName
+ ". expected no-extension emails");
}
// TODO use isAttachmenet and parent in the call below
processFileEntry(emailPath, emailName);
// TODO use isAttachment and parent in the call below
processFileEntry(emailPath, emailName, hasAttachments, parent);
}

@Override
Expand Down
@@ -1,138 +1,186 @@
package org.freeeed.main;

import com.google.common.annotations.VisibleForTesting;
import org.apache.tika.metadata.Metadata;

/**
 * Holds the specific document metadata of interest in discovery. For field definitions it uses a number of keys
 * defined in Tika metadata interfaces, as well as some custom keys.
 *
 * <p>All accessors below are thin wrappers over the inherited key/value storage: each getter reads one key with
 * {@code get(key)} and each setter writes one key with {@code set(key, value)}.
 *
 * @author mark
 */
public class DocumentMetadata extends Metadata {

    // Custom keys used only inside this class.
    private static final String DOCUMENT_ORIGINAL_PATH = "document_original_path";
    private static final String DOCUMENT_TEXT = "text";
    private static final String HAS_ATTACHMENTS = "has_attachments";
    private static final String PROCESSING_EXCEPTION = "processing_exception";
    private static final String MASTER_DUPLICATE = "master_duplicate";
    private static final String CUSTODIAN = "Custodian";
    private static final String LINK_NATIVE = "native_link";
    private static final String LINK_TEXT = "text_link";
    private static final String LINK_EXCEPTION = "exception_link";

    // TODO the following group of fields hides fields inherited from interfaces. Decide what to do with it.
    public static final String SUBJECT = "subject";
    public static final String MESSAGE_FROM = "Message-From";
    public static final String MESSAGE_CREATION_DATE = "Creation-Date";
    public static final String MESSAGE_TO = "Message-To";
    public static final String MESSAGE_CC = "Message-Cc";

    public static final String DATE = "date";
    public static final String DATE_RECEIVED = "Date Received";
    public static final String TIME_RECEIVED = "Time Received";

    public static final String DATE_SENT = "Date Sent";
    public static final String TIME_SENT = "Time Sent";

    /**
     * @return the value stored under the original-path key.
     */
    public String getOriginalPath() {
        return get(DOCUMENT_ORIGINAL_PATH);
    }

    /**
     * @param originalPath value to store under the original-path key.
     */
    public void setOriginalPath(String originalPath) {
        set(DOCUMENT_ORIGINAL_PATH, originalPath);
    }

    /**
     * @return the value stored under the document-text key.
     */
    public String getDocumentText() {
        return get(DOCUMENT_TEXT);
    }

    /**
     * @param documentText value to store under the document-text key.
     */
    public void setDocumentText(String documentText) {
        set(DOCUMENT_TEXT, documentText);
    }

    /**
     * @return the value stored under {@link #SUBJECT}.
     */
    public String getMessageSubject() {
        return get(SUBJECT);
    }

    /**
     * @param subject value to store under {@link #SUBJECT}.
     */
    public void setMessageSubject(String subject) {
        set(SUBJECT, subject);
    }

    /**
     * @return the value stored under {@link #MESSAGE_FROM}.
     */
    public String getMessageFrom() {
        return get(MESSAGE_FROM);
    }

    /**
     * @param messageFrom value to store under {@link #MESSAGE_FROM}.
     */
    public void setMessageFrom(String messageFrom) {
        set(MESSAGE_FROM, messageFrom);
    }

    /**
     * @return the value stored under {@link #MESSAGE_CREATION_DATE}.
     */
    public String getMessageCreationDate() {
        return get(MESSAGE_CREATION_DATE);
    }

    /**
     * @param messageCreationDate value to store under {@link #MESSAGE_CREATION_DATE}.
     */
    public void setMessageCreationDate(String messageCreationDate) {
        set(MESSAGE_CREATION_DATE, messageCreationDate);
    }

    /**
     * @return the value stored under {@link #MESSAGE_TO}.
     */
    public String getMessageTo() {
        return get(MESSAGE_TO);
    }

    /**
     * @param messageTo value to store under {@link #MESSAGE_TO}.
     */
    public void setMessageTo(String messageTo) {
        set(MESSAGE_TO, messageTo);
    }

    /**
     * @return the value stored under {@link #MESSAGE_CC}.
     */
    public String getMessageCC() {
        return get(MESSAGE_CC);
    }

    /**
     * @param messageCC value to store under {@link #MESSAGE_CC}.
     */
    public void setMessageCC(String messageCC) {
        set(MESSAGE_CC, messageCC);
    }

    /**
     * @return the value stored under {@link #DATE}.
     */
    public String getMessageDate() {
        return get(DATE);
    }

    /**
     * @param date value to store under {@link #DATE}.
     */
    public void setMessageDate(String date) {
        set(DATE, date);
    }

    /**
     * @return the value stored under {@link #DATE_RECEIVED}.
     */
    public String getMessageDateReceived() {
        return get(DATE_RECEIVED);
    }

    /**
     * @param m value to store under {@link #DATE_RECEIVED}.
     */
    public void setMessageDateReceived(String m) {
        set(DATE_RECEIVED, m);
    }

    /**
     * @return the value stored under {@link #TIME_RECEIVED}.
     */
    public String getMessageTimeReceived() {
        return get(TIME_RECEIVED);
    }

    /**
     * @param s value to store under {@link #TIME_RECEIVED}.
     */
    public void setMessageTimeReceived(String s) {
        set(TIME_RECEIVED, s);
    }

    /**
     * @return the value stored under {@link #DATE_SENT}.
     */
    public String getMessageDateSent() {
        return get(DATE_SENT);
    }

    /**
     * @param s value to store under {@link #DATE_SENT}.
     */
    public void setMessageDateSent(String s) {
        set(DATE_SENT, s);
    }

    /**
     * @return the value stored under {@link #TIME_SENT}.
     */
    public String getMessageTimeSent() {
        return get(TIME_SENT);
    }

    /**
     * @param s value to store under {@link #TIME_SENT}.
     */
    public void setMessageTimeSent(String s) {
        set(TIME_SENT, s);
    }

    /**
     * Similar to super.add(), but returns this object so that calls can be chained (fluent interface pattern).
     *
     * @param key key in the hashmap to be added.
     * @param value value in the hashmap to be added.
     * @return this instance, to allow call chaining.
     */
    public DocumentMetadata addField(String key, String value) {
        this.add(key, value);
        return this;
    }

    /**
     * Does the document have attachments?
     *
     * @return true if yes, false if no.
     */
    public boolean hasAttachments() {
        return isPropertyTrue(HAS_ATTACHMENTS);
    }

    /**
     * Set a flag to indicate if the document has attachments.
     *
     * @param b true if it has attachments, false if it does not.
     */
    public void setHasAttachments(boolean b) {
        setProperty(HAS_ATTACHMENTS, b);
    }

    /**
     * Return true or false for a specific boolean property. A property counts as true only when its stored value
     * is the string "true" (compared ignoring case). Anything else, such as the key being absent or a value of
     * "no", "yes" or "false", results in false.
     *
     * @param propertyKey the key we are checking
     * @return true if the property is present and its value is "true", and false otherwise
     */
    private boolean isPropertyTrue(String propertyKey) {
        // parseBoolean already maps null to false, so no explicit null check is required.
        return Boolean.parseBoolean(get(propertyKey));
    }

    /**
     * Convenience function to set boolean properties as strings.
     *
     * @param propertyKey key to set.
     * @param b for true, set "true", for false, remove the key from the underlying map.
     */
    private void setProperty(String propertyKey, boolean b) {
        if (b) {
            set(propertyKey, Boolean.TRUE.toString());
        } else {
            remove(propertyKey);
        }
    }
}
Expand Up @@ -51,7 +51,7 @@ public EmlFileProcessor(String singleFileName, Context context, LuceneIndex luce
* @throws InterruptedException
*/
@Override
public void process(boolean isParent, File parent) throws IOException, InterruptedException {
public void process(boolean hasAttachments, File parent) throws IOException, InterruptedException {
String emailPath = getSingleFileName();
String emailName = new File(emailPath).getName();
// TODO this is a little more complex, there are attachments without extensions
Expand All @@ -63,7 +63,7 @@ public void process(boolean isParent, File parent) throws IOException, Interrupt

logger.debug("Processing eml file with path: " + emailPath + ", name: " + emailName);

processFileEntry(emailPath, emailName);
processFileEntry(emailPath, emailName, hasAttachments, parent);
}

@Override
Expand Down
Expand Up @@ -109,7 +109,7 @@ public FileProcessor(Context context, LuceneIndex luceneIndex) {
* @throws IOException
* @throws InterruptedException
*/
abstract public void process(boolean isParent, File parent) throws IOException, InterruptedException;
abstract public void process(boolean hasAttachments, File parent) throws IOException, InterruptedException;

/**
* Cull, then emit responsive files
Expand All @@ -119,7 +119,7 @@ public FileProcessor(Context context, LuceneIndex luceneIndex) {
* @throws IOException
* @throws InterruptedException
*/
protected void processFileEntry(String tempFile, String originalFileName)
protected void processFileEntry(String tempFile, String originalFileName, boolean hasAttachments, File parent)
throws IOException, InterruptedException {
Project project = Project.getProject();
project.incrementCurrentMapCount();
Expand All @@ -135,8 +135,9 @@ protected void processFileEntry(String tempFile, String originalFileName)
// Document metadata, derived from Tika metadata class
DocumentMetadata metadata = new DocumentMetadata();
try {
metadata.set(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH,
getOriginalDocumentPath(tempFile, originalFileName));
metadata.setOriginalPath(getOriginalDocumentPath(tempFile, originalFileName));
metadata.setHasAttachments(hasAttachments);

// extract file contents with Tika
// Tika metadata class contains references to metadata and file text
extractMetadata(tempFile, metadata, originalFileName);
Expand Down Expand Up @@ -405,8 +406,8 @@ private static String parseQueryString(String queryString) {
* Extracts document metadata. Text is part of it. Forensics information is
* part of it.
*
* @param tempFile
* @return DocumentMetadata
* @param tempFile (temporary) file from which we extract metadata.
* @param metadata DocumentMetadata container receiving metadata.
*/
private void extractMetadata(String tempFile, DocumentMetadata metadata, String originalFileName) {
DocumentParser.getInstance().parse(tempFile, metadata, originalFileName);
Expand Down
Expand Up @@ -99,10 +99,10 @@ public void process() throws IOException, Exception {
* @throws IOException on any problem reading those emails from the directory
* @throws InterruptedException on any MR problem (throws by Context)
*/
private void collectEmails(String emailDir, boolean isParent, File parent) throws IOException, InterruptedException {
private void collectEmails(String emailDir, boolean hasAttachments, File parent) throws IOException, InterruptedException {
if (new File(emailDir).isFile()) {
EmlFileProcessor fileProcessor = new EmlFileProcessor(emailDir, context, luceneIndex);
fileProcessor.process(isParent, parent);
fileProcessor.process(hasAttachments, parent);
} else {
File files[] = new File(emailDir).listFiles();
Arrays.sort(files, new MailWithAttachmentsComparator());
Expand All @@ -117,12 +117,12 @@ private void collectEmails(String emailDir, boolean isParent, File parent) throw
} else {
logger.debug("File {} is attachment to {}", file.getName(), parentFile.getName());
}
boolean hasAttachments = hasAttachments(f, files);
if (hasAttachments) {
boolean isParent = hasAttachments(f, files);
if (isParent) {
logger.debug("File {} has attachments", file.getName());
parentFile = file;
}
collectEmails(file.getPath(), hasAttachments, parentFile);
collectEmails(file.getPath(), isParent, parentFile);
}
}
}
Expand Down
Expand Up @@ -182,7 +182,7 @@ private void processArchivesRecursively(TFile tfile)
if (originalFileName.startsWith(getZipFileName())) {
originalFileName = originalFileName.substring(getZipFileName().length() + 1);
}
processFileEntry(tempFile, originalFileName);
processFileEntry(tempFile, originalFileName, false, null);
}
} catch (Exception e) {
Metadata metadata = new Metadata();
Expand All @@ -202,7 +202,7 @@ private void processZipEntry(ZipInputStream zipInputStream, ZipEntry zipEntry) t
} else if (NSFProcessor.isNSF(tempFile)) {
new NSFProcessor(tempFile, getContext(), getLuceneIndex()).process();
} else {
processFileEntry(tempFile, zipEntry.getName());
processFileEntry(tempFile, zipEntry.getName(), false, null);
}
}

Expand Down
Expand Up @@ -23,11 +23,8 @@
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Test;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.BeforeClass;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down
Expand Up @@ -12,7 +12,6 @@
import org.apache.commons.io.FileUtils;
import org.freeeed.services.Project;
import org.freeeed.util.CsvMetadataParser;
import org.junit.Test;

import com.google.common.io.Files;
import java.io.IOException;
Expand Down

0 comments on commit 8efdf38

Please sign in to comment.