Skip to content

Commit

Permalink
updated
Browse files Browse the repository at this point in the history
  • Loading branch information
simsong committed Oct 6, 2014
1 parent 04c028a commit 6f800ec
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 52 deletions.
2 changes: 1 addition & 1 deletion configure.ac
Expand Up @@ -150,7 +150,7 @@ AM_CONDITIONAL([LIGHTGREP_ENABLED], [test "yes" = "$lightgrep"])
AC_ARG_ENABLE([flexscanners],
AS_HELP_STRING([--disable-flexscanners], [disable FLEX-based scanners]),
[],
[AC_DEFINE(FLEXSCANNERS_ENABLED, 1, [Use FLEX-based scanners]), flexscanners='yes'])
[AC_DEFINE(USE_FLEXSCANNERS, 1, [Use FLEX-based scanners]) flexscanners='yes'])
AM_CONDITIONAL([FLEXSCANNERS_ENABLED], [test "yes" = "$flexscanners"])


Expand Down
4 changes: 1 addition & 3 deletions python/bulk_extractor_reader.py
Expand Up @@ -230,7 +230,6 @@ def peak_memory(self):

def open(self,fname,mode='r'):
"""Opens a named file in the bulk report. Default is text mode.
Returns .bulk_extractor_reader as a pointer to self.
"""
# zipfile always opens in Binary mode, but generates an error
# if the 'b' is present, so remove it if present.
Expand All @@ -241,7 +240,6 @@ def open(self,fname,mode='r'):
mode = mode.replace("b","")+"b"
fn = os.path.join(self.dname,fname)
f = open(fn,mode=mode)
f.bulk_extractor_reader = self
return f

def count_lines(self,fname):
Expand Down Expand Up @@ -308,7 +306,7 @@ def read_histogram(self,fn):
def read_features(self,fname):
"""Just read the features out of a feature file"""
"""Usage: for (pos,feature,context) in br.read_features("fname")"""
for line in self.open(fname):
for line in self.open(fname,"rb"):
r = parse_feature_line(line)
if r:
yield r
Expand Down
15 changes: 1 addition & 14 deletions python/cda2_tool.py
Expand Up @@ -2,21 +2,8 @@
# coding=UTF-8
#
# Cross Drive Analysis tool for bulk extractor.
# V4
# Features of this program:
# --netmap -- makes a map of which computers exchanged packets (ethernet maps)
# --makestop -- Creates a stoplist of features that are on more than a fraction of the drives
# --threshold -- sets the fraction of drives necessary for a feature to be ignored
# --idfeatures -- specifies which feature files are used for identity operations
#
# reads multiple bulk_extractor histogram files and outputs:
# stoplist.txt - list of email addresses on more than 1/3 of the disks
# targets.txt - list of email addresses not on stoplist and the # of drives on which they appear.
#
# Version 1.3 - Complete rewrite; eliminates driveids and featureids, since strings
# in Python are hashable (and thus integers). Also uses bulk_extractor_reader

__version__='1.3.1'
__version__='2.0.0'
import os.path,sys

#if sys.version_info < (3,2):
Expand Down
7 changes: 5 additions & 2 deletions src/Makefile.am
Expand Up @@ -77,8 +77,11 @@ bulk_scanners = \
scan_winprefetch.cpp \
scan_wordlist.cpp \
scan_xor.cpp \
scan_zip.cpp \
$(flex_scanners)
scan_zip.cpp

if FLEXSCANNERS_ENABLED
bulk_scanners += $(flex_scanners)
endif

if LIGHTGREP_ENABLED
bulk_scanners += $(lightgrep_scanners)
Expand Down
2 changes: 1 addition & 1 deletion src/be13_api
Submodule be13_api updated from 1fdb96 to f88ae1
10 changes: 6 additions & 4 deletions src/bulk_extractor_scanners.cpp
Expand Up @@ -37,18 +37,20 @@

/* An array of the built-in scanners */
scanner_t *scanners_builtin[] = {
scan_accts,
scan_base16,
scan_base64,
scan_kml,
scan_email,
scan_httplogs,
scan_gps,
scan_net,
scan_find,
scan_wordlist,
scan_aes,
scan_json,
#if defined(USE_FLEXSCANNERS)
scan_accts,
scan_base16,
scan_email,
scan_gps,
#endif
#if defined(HAVE_LIBLIGHTGREP) && defined(USE_LIGHTGREP)
scan_accts_lg,
scan_base16_lg,
Expand Down
17 changes: 17 additions & 0 deletions src/scan_accts.flex
Expand Up @@ -249,6 +249,23 @@ DATEFORMAT ({DATEA}|{DATEB}|{DATEC}|{DATED})
s.pos += yyleng;
}

[ \t\n][+][1-9]([0-9]{10,18})/[^0-9] {
/* Experimental Regex to find phone numbers beginning with a + */
/* Phone numbers can be a maximum of 15 digits */
accts_scanner &s = *yyaccts_get_extra(yyscanner);
s.telephone_recorder->write_buf(SBUF,s.pos+1,yyleng-1);
s.pos += yyleng;
}

[ \t\n][0]([1-9][0-9]{9,15})/[^0-9] {
/* Experimental Regex to find phone numbers beginning with a 0 */
accts_scanner &s = *yyaccts_get_extra(yyscanner);
s.telephone_recorder->write_buf(SBUF,s.pos+1,yyleng-1);
s.pos += yyleng;
}



[^0-9]([0-9]{6}-){7}([0-9]{6})/[\r\n] {
accts_scanner &s = *yyaccts_get_extra(yyscanner);
s.alert_recorder->write(SBUF.pos0+s.pos,yytext+1,"Possible BitLocker Recovery Key (ASCII).");
Expand Down
103 changes: 77 additions & 26 deletions src/scan_hashdb.cpp
Expand Up @@ -53,13 +53,6 @@ static uint32_t hashdb_import_max_duplicates=0; // import only
enum mode_type_t {MODE_NONE, MODE_SCAN, MODE_IMPORT};
static mode_type_t mode = MODE_NONE;

// internal helper functions
static void do_import(const class scanner_params &sp,
const recursion_control_block &rcb);
static void do_scan(const class scanner_params &sp,
const recursion_control_block &rcb);
inline bool is_empty_block(const uint8_t *buf);

// global state

// hashdb directory, import only
Expand All @@ -73,6 +66,59 @@ typedef md5_generator hash_generator;
typedef hashdb_t__<hash_t> hashdb_t;
hashdb_t* hashdb;

static void do_import(const class scanner_params &sp,
const recursion_control_block &rcb);
static void do_scan(const class scanner_params &sp,
const recursion_control_block &rcb);


// rules for determining if a sector should be ignored
// Heuristic test for a "ramp" sector: counts adjacent 32-bit words where
// word[i] + 1 == word[i+4], i.e. an incrementing sequence (typical of
// filesystem allocation structures rather than meaningful content).
// Returns true when more than hashdb_block_size/8 such pairs are found.
static bool ramp_sector(const sbuf_t &sbuf)
{
    // Guard: sbuf.pagesize is unsigned, so pagesize-8 would underflow to a
    // huge value for tiny buffers and the loop would read out of bounds.
    if (sbuf.pagesize < 8) return false;
    uint32_t count = 0;
    for(size_t i=0;i<sbuf.pagesize-8;i+=4){
        if (sbuf.get32u(i)+1 == sbuf.get32u(i+4)) {
            count += 1;
        }
    }
    return count > hashdb_block_size/8;
}

// Heuristic test for low-entropy sectors via a histogram of big-endian
// 32-bit words. Returns true (sector should be ignored) when fewer than
// three distinct word values appear, or when any single value accounts
// for more than hashdb_block_size/16 of the words.
static bool hist_sector(const sbuf_t &sbuf)
{
    // Guard: pagesize is unsigned; pagesize-4 underflows for buffers of
    // fewer than 4 bytes. Such a buffer yields an empty histogram, which
    // the size()<3 rule classifies as ignorable — return true directly.
    if (sbuf.pagesize < 4) return true;
    std::map<uint32_t,uint32_t> hist;
    for(size_t i=0;i<sbuf.pagesize-4;i+=4){
        hist[sbuf.get32uBE(i)] += 1;
    }
    if (hist.size() < 3) return true;
    for (std::map<uint32_t,uint32_t>::const_iterator it = hist.begin();it != hist.end(); it++){
        if ((it->second) > hashdb_block_size/16){
            return true;
        }
    }
    return false;
}

// Returns true when every byte of the sector is ASCII whitespace.
static bool whitespace_sector(const sbuf_t &sbuf)
{
    for(size_t i=0;i<sbuf.pagesize;i++){
        // Cast to unsigned char: isspace() is undefined for negative values
        // other than EOF. Harmless if sbuf_t::operator[] already returns
        // uint8_t — TODO(review): confirm the element type in sbuf_t.
        if (!isspace(static_cast<unsigned char>(sbuf[i]))) return false;
    }
    return true;
}

// detect if block is empty
// Returns true when the block consists of a single repeated fill byte,
// i.e. every byte of the hashdb_block_size-byte block equals buf[0].
inline bool is_empty_block(const uint8_t *buf) {
    const uint8_t fill = buf[0];
    size_t idx = 1;
    while (idx < hashdb_block_size) {
        if (buf[idx] != fill) {
            return false;
        }
        ++idx;
    }
    return true;
}



extern "C"
void scan_hashdb(const class scanner_params &sp,
const recursion_control_block &rcb) {
Expand Down Expand Up @@ -285,7 +331,7 @@ void scan_hashdb(const class scanner_params &sp,
case scanner_params::PHASE_SCAN: {
switch(mode) {
case MODE_IMPORT:
do_import(sp, rcb);
do_import(sp, rcb);
return;

case MODE_SCAN:
Expand Down Expand Up @@ -333,6 +379,8 @@ static void do_import(const class scanner_params &sp,
}

// get count of blocks to process
// BRUCE - This is not a very efficient way to divide...

size_t count = sbuf.bufsize / hashdb_import_sector_size;
while ((count * hashdb_import_sector_size) +
(hashdb_block_size - hashdb_import_sector_size) > sbuf.pagesize) {
Expand All @@ -344,6 +392,11 @@ static void do_import(const class scanner_params &sp,
new std::vector<hashdb_t::import_element_t>;

// compose the filename based on the forensic path
// BRUCE - I don't like the way that "4" is hard-coded. I know that's the length of
// the unicode character, but hard-coding magic numbers is not good. If we need to do this,
// there should be methods in sbuf_t for separating the filename from the path.
// And what is a "map file delimiter" ?

std::string path_without_map_file_delimiter =
(sbuf.pos0.path.size() > 4) ?
std::string(sbuf.pos0.path, 0, sbuf.pos0.path.size() - 4) : "";
Expand All @@ -369,9 +422,9 @@ static void do_import(const class scanner_params &sp,
size_t offset = i * hashdb_import_sector_size;

// calculate the hash for this sector-aligned hash block
hash_t hash = hash_generator::hash_buf(
sbuf.buf + offset,
hashdb_block_size);
hash_t hash = hash_generator::hash_buf(
sbuf.buf + offset,
hashdb_block_size);

// ignore empty blocks
if (hashdb_ignore_empty_blocks && is_empty_block(sbuf.buf + offset)) {
Expand Down Expand Up @@ -413,6 +466,7 @@ static void do_scan(const class scanner_params &sp,
}

// get count of blocks to process
// BRUCE --- This is a poor way to do a division...
size_t count = sbuf.bufsize / hashdb_scan_sector_size;
while ((count * hashdb_scan_sector_size) +
(hashdb_block_size - hashdb_scan_sector_size) > sbuf.pagesize) {
Expand All @@ -423,7 +477,8 @@ static void do_scan(const class scanner_params &sp,
std::vector<hash_t>* scan_input = new std::vector<hash_t>;

// allocate space on heap for the offset lookup table
std::vector<uint32_t>* offset_lookup_table = new std::vector<uint32_t>;
// BRUCE - offset_lookup_table should be a vector of size_t, not uint32_t.
std::vector<size_t>* offset_lookup_table = new std::vector<size_t>;

// get the cryptograph hash values of all the blocks along
// sector boundaries from sbuf
Expand Down Expand Up @@ -466,18 +521,23 @@ static void do_scan(const class scanner_params &sp,
// as (pos0, hash_string, count_string)

// pos0
pos0_t pos0 = sbuf.pos0 + offset_lookup_table->at(it->first);
size_t offset = offset_lookup_table->at(it->first);
pos0_t pos0 = sbuf.pos0 + offset;

// hash_string
std::string hash_string = scan_input->at(it->first).hexdigest();

// count
std::stringstream ss;
ss << it->second;
std::string count_string = ss.str();
ss << it->second; // count

// Construct an sbuf from the sector and subject it to the other tests
const sbuf_t s = sbuf_t(sbuf,offset,hashdb_block_size);
if (ramp_sector(s)) ss << " R";
if (hist_sector(s)) ss << " H";
if (whitespace_sector(s)) ss << " W";

// record the feature
identified_blocks_recorder->write(pos0, hash_string, count_string);
identified_blocks_recorder->write(pos0, hash_string, ss.str());
}

// clean up
Expand All @@ -486,15 +546,6 @@ static void do_scan(const class scanner_params &sp,
delete scan_output;
}

// detect if block is empty
inline bool is_empty_block(const uint8_t *buf) {
for (size_t i=1; i<hashdb_block_size; i++) {
if (buf[i] != buf[0]) {
return false;
}
}
return true;
}

#endif

0 comments on commit 6f800ec

Please sign in to comment.