Permalink
Browse files

Use libmagic instead of spawning a process to run `file`

  • Loading branch information...
1 parent ae92e3f commit 6bed45d6fb7c080ae5c163c12b4eb8749a3492ac @robinluckey robinluckey committed Mar 8, 2012
Showing with 96 additions and 71 deletions.
  1. +2 −1 README.md
  2. +3 −3 build
  3. +91 −67 src/detector.c
View
@@ -63,7 +63,8 @@ Ohcount source code is available as a Git repository:
Building Ohcount
----------------
-You will need ragel 6.3 or higher, bash, pcre, gcc (version 4.1.2 or greater) and SWIG to build ohcount. Once you have them, go to the top directory of ohcount and run
+You will need ragel 6.3 or higher, bash, pcre, libmagic, gcc (version 4.1.2 or greater)
+and SWIG to build ohcount. Once you have them, go to the top directory of ohcount and run
```
./build
View
@@ -96,15 +96,15 @@ build_ohcount()
build_parser_o
echo "Building Ohcount"
mkdir -p bin/
- sh -c "$cc src/ohcount.c $files -o bin/ohcount -lpcre" || exit 1
+ sh -c "$cc src/ohcount.c $files -o bin/ohcount -lpcre -lmagic" || exit 1
}
build_test_suite()
{
build_hash_headers
build_parser_o
echo "Building test suite"
- sh -c "$cc test/unit/all_tests.c $files -o test/unit/run_tests -lpcre" \
+ sh -c "$cc test/unit/all_tests.c $files -o test/unit/run_tests -lpcre -lmagic" \
|| exit 1
}
@@ -121,7 +121,7 @@ build_ruby_bindings()
mkdir -p ruby/$arch
sh -c "$cc $RB_SHARED ruby/ohcount_wrap.c $files -o ruby/$arch/$RB_SHARED_NAME \
-I`ruby -rmkmf -e 'print Config::expand(CONFIG["archdir"])'` \
- -lpcre" || exit 1
+ -lpcre -lmagic" || exit 1
sh -c "cd test/unit/ruby && ruby ruby_test.rb" || exit 1
}
View
@@ -2,6 +2,7 @@
// See COPYING for license information.
#include <ctype.h>
+#include <magic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -44,6 +45,94 @@ void escape_path(char *safe, const char *unsafe) {
} while (*unsafe++);
}
+/* Copy the `length` bytes starting at `start` into a NUL-terminated scratch
+ * buffer and look the result up with ohcount_hash_language_from_name().
+ * Returns the `name` field of the matching entry, or NULL when nothing
+ * matches or the candidate does not fit the scratch buffer.
+ */
+static const char *magic_lookup_name(const char *start, size_t length) {
+  char buf[80];
+
+  // A candidate this long cannot be copied safely; treat it as "no match".
+  if (length >= sizeof(buf)) return NULL;
+  memcpy(buf, start, length);
+  buf[length] = '\0';
+
+  struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
+  return rl ? rl->name : NULL;
+}
+
+/* Parse the output of libmagic and return a language, if any.
+ * The contents of string `line` will be destroyed (it is lowercased in
+ * place before matching).
+ *
+ * Three shapes of description are recognised:
+ *   - "script text ... for <name>"      -> <name> is looked up
+ *   - "<name> [-switch ...] script text" -> <name> is looked up
+ *   - anything containing "xml"          -> LANG_XML
+ * Returns NULL when `line` is NULL or no language can be derived.
+ */
+const char *magic_parse(char *line) {
+  char *p, *pe;
+
+  if (!line) return NULL;
+
+  // <ctype.h> functions require a value representable as unsigned char.
+  for (p = line; *p; p++) *p = (char)tolower((unsigned char)*p);
+
+  p = strstr(line, "script text");
+  if (p && p == line) { // /^script text(?: executable)? for \w/
+    p = strstr(line, "for ");
+    if (p) {
+      p += 4;
+      pe = p;
+      while (isalnum((unsigned char)*pe)) pe++;
+      return magic_lookup_name(p, (size_t)(pe - p));
+    }
+  } else if (p) { // /(\w+)(?: -\w+)* script text/
+    // Walk backwards from "script text" one word at a time. Every step is
+    // bounded by `line` so that descriptions starting with blanks or with a
+    // switch never read before the start of the buffer.
+    do {
+      p--; // Safe: p > line on entry to every iteration.
+      pe = p;
+      while (p != line && *p == ' ') p--;
+      while (p != line && isalnum((unsigned char)*(p - 1))) p--;
+      if (p != line && *(p - 1) == '-') p--;
+    } while (p != line && *p == '-'); // Skip over any switches.
+    return magic_lookup_name(p, (size_t)(pe - p));
+  } else if (strstr(line, "xml")) return(LANG_XML);
+
+  return NULL;
+}
+
+/* Use libmagic to detect file language.
+ *
+ * When `sourcefile->diskpath` is set, that path is handed to magic_file();
+ * otherwise the buffer returned by ohcount_sourcefile_get_contents() is
+ * handed to magic_buffer(). The resulting description is passed to
+ * magic_parse().
+ *
+ * Returns whatever magic_parse() returns, or NULL when there is no diskpath
+ * and no contents are available. Any libmagic failure is reported on stderr
+ * and terminates the process with exit(1).
+ */
+const char *detect_language_magic(SourceFile *sourcefile) {
+  char line[80];
+  const char *description;
+
+  magic_t cookie = magic_open(MAGIC_NONE);
+  if (cookie == NULL) {
+    // There is no cookie to query, so do not call magic_error() here.
+    fprintf(stderr, "libmagic: unable to initialize magic library\n");
+    exit(1);
+  }
+  if (magic_load(cookie, NULL) != 0) {
+    fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
+    magic_close(cookie);
+    exit(1);
+  }
+
+  if (sourcefile->diskpath) {
+    description = magic_file(cookie, sourcefile->diskpath);
+  } else {
+    char *contents = ohcount_sourcefile_get_contents(sourcefile);
+    if (!contents) {
+      // Nothing to inspect; release the cookie before bailing out.
+      magic_close(cookie);
+      return NULL;
+    }
+    description = magic_buffer(cookie, contents, strlen(contents));
+  }
+
+  if (description == NULL) {
+    fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
+    magic_close(cookie);
+    exit(1);
+  }
+
+  // Copy (truncating if necessary) before closing the cookie: magic_parse()
+  // modifies its argument, and `description` is only used while the cookie
+  // is still open.
+  snprintf(line, sizeof(line), "%s", description);
+
+  magic_close(cookie);
+
+  return magic_parse(line);
+}
+
+/* Use all available means to detect file language
+ */
const char *ohcount_detect_language(SourceFile *sourcefile) {
const char *language = NULL;
char *p, *pe;
@@ -135,74 +224,9 @@ const char *ohcount_detect_language(SourceFile *sourcefile) {
// Attempt to detect using libmagic, the library equivalent of the Unix 'file' command.
if(!language) {
- int tmpfile = 0;
- char *path = sourcefile->filepath;
- if (sourcefile->diskpath)
- path = sourcefile->diskpath;
- if (access(path, F_OK) != 0) { // create temporary file
- path = malloc(21);
- strncpy(path, "/tmp/ohcount_XXXXXXX\0", 21);
- int fd = mkstemp(path);
- char *contents = ohcount_sourcefile_get_contents(sourcefile);
- log_it("contents:");
- log_it(contents);
- length = contents ? strlen(contents) : 0;
- if (write(fd, contents, length) != length) {
- fprintf(stderr, "src/detector.c: Could not write temporary file %s.\n", path);
- exit(1);
- }
- close(fd);
- tmpfile = 1;
- }
-
- /* Filenames may include single quotes, which must be escaped */
- char escaped_path[strlen(path) * 4 + 1];
- escape_path(escaped_path, path);
-
- char command[strlen(escaped_path) + 11];
- sprintf(command, "file -b '%s'", escaped_path);
- FILE *f = popen(command, "r");
- if (f) {
- if (fgets(line, sizeof(line), f) == NULL) {
- fprintf(stderr, "src/detector.c: fgets() failed\n");
- exit(1);
- }
- char *eol = line + strlen(line);
- for (p = line; p < eol; p++) *p = tolower(*p);
- p = strstr(line, "script text");
- if (p && p == line) { // /^script text(?: executable)? for \w/
- p = strstr(line, "for ");
- if (p) {
- p += 4;
- pe = p;
- while (isalnum(*pe)) pe++;
- length = pe - p;
- strncpy(buf, p, length);
- buf[length] = '\0';
- struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
- if (rl) language = rl->name;
- }
- } else if (p) { // /(\w+)(?: -\w+)* script text/
- do {
- p--;
- pe = p;
- while (*p == ' ') p--;
- while (p != line && isalnum(*(p - 1))) p--;
- if (p != line && *(p - 1) == '-') p--;
- } while (*p == '-'); // Skip over any switches.
- length = pe - p;
- strncpy(buf, p, length);
- buf[length] = '\0';
- struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
- if (rl) language = rl->name;
- } else if (strstr(line, "xml")) language = LANG_XML;
- pclose(f);
- if (tmpfile) {
- remove(path);
- free(path);
- }
- }
+ language = detect_language_magic(sourcefile);
}
+
if (language) {
if (ISAMBIGUOUS(language)) {
// Call the appropriate function for disambiguation.

0 comments on commit 6bed45d

Please sign in to comment.