From 6aadcbc9306bb1ac08e22f657fea258ab8f4db75 Mon Sep 17 00:00:00 2001 From: agusgroh Date: Fri, 12 Apr 2024 10:19:20 -0300 Subject: [PATCH 1/3] SP-537 Updates filters --- src/main/java/com/scanoss/ScanossConstants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/scanoss/ScanossConstants.java b/src/main/java/com/scanoss/ScanossConstants.java index c7f190b..1099dae 100644 --- a/src/main/java/com/scanoss/ScanossConstants.java +++ b/src/main/java/com/scanoss/ScanossConstants.java @@ -36,7 +36,7 @@ public class ScanossConstants { ".o", ".a", ".so", ".obj", ".dll", ".lib", ".out", ".app", ".bin", ".lst", ".dat", ".json", ".htm", ".html", ".xml", ".md", ".txt", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".pages", ".key", ".numbers", - ".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2" + ".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2", ".xsd", ".pom" ); // Folders to skip From ffbe590b4bc4556d13993a1d88a702164ed2ee15 Mon Sep 17 00:00:00 2001 From: agusgroh Date: Fri, 12 Apr 2024 10:20:21 -0300 Subject: [PATCH 2/3] SP-536 Removes filter for '.whl' extensions --- CHANGELOG.md | 6 ++++++ pom.xml | 2 +- src/main/java/com/scanoss/ScanossConstants.java | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d3a599..6ef3e1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Upcoming changes... +## [0.7.1] - 2024-04-12 +### Changed +- Update file and winnowing filters +- Remove filter for '.whl' file extensions + ## [0.7.0] - 2024-04-04 ### Added - Add HPSM support @@ -79,3 +84,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [0.6.0]: https://github.com/scanoss/scanoss.java/compare/v0.5.5...v0.6.0 [0.6.1]: https://github.com/scanoss/scanoss.java/compare/v0.6.0...v0.6.1 [0.7.0]: https://github.com/scanoss/scanoss.java/compare/v0.6.1...v0.7.0 +[0.7.1]: https://github.com/scanoss/scanoss.java/compare/v0.7.0...v0.7.1 diff --git a/pom.xml b/pom.xml index cd825ca..9b057fd 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.scanoss scanoss - 0.7.0 + 0.7.1 jar scanoss.java https://github.com/scanoss/scanoss.java diff --git a/src/main/java/com/scanoss/ScanossConstants.java b/src/main/java/com/scanoss/ScanossConstants.java index 1099dae..bd581e4 100644 --- a/src/main/java/com/scanoss/ScanossConstants.java +++ b/src/main/java/com/scanoss/ScanossConstants.java @@ -59,7 +59,7 @@ public class ScanossConstants { ".po", ".ppt", ".prefs", ".properties", ".pyc", ".qdoc", ".result", ".rgb", ".rst", ".scss", ".sha", ".sha1", ".sha2", ".sha256", ".sln", ".spec", ".sql", ".sub", ".svg", ".svn-base", ".tab", ".template", ".test", ".tex", ".tiff", - ".toml", ".ttf", ".txt", ".utf-8", ".vim", ".wav", ".whl", ".woff", ".woff2", ".xht", + ".toml", ".ttf", ".txt", ".utf-8", ".vim", ".wav", ".woff", ".woff2", ".xht", ".xhtml", ".xls", ".xlsx", ".xml", ".xpm", ".xsd", ".xul", ".yaml", ".yml", ".wfp", ".editorconfig", ".dotcover", ".pid", ".lcov", ".egg", ".manifest", ".cache", ".coverage", ".cover", ".gem", ".lst", ".pickle", ".pdb", ".gml", ".pot", ".plt", From e85aa3fc137dc7f42765ecd5e8deb379855bd22f Mon Sep 17 00:00:00 2001 From: agusgroh Date: Fri, 12 Apr 2024 15:12:09 -0300 Subject: [PATCH 3/3] CLIS-111 Adds dir extension filter --- CHANGELOG.md | 1 + src/main/java/com/scanoss/Scanner.java | 16 +- .../java/com/scanoss/ScanossConstants.java | 6 +- src/test/java/com/scanoss/TestScanner.java | 20 +- .../test-no-ignore/scanner.c | 958 ++++++++++++++++++ .../test.egg-info/scanner.c | 958 ++++++++++++++++++ 6 files changed, 1954 insertions(+), 5 deletions(-) create mode 100644 testing/data/test-folder-ignore/test-no-ignore/scanner.c create mode 100644 testing/data/test-folder-ignore/test.egg-info/scanner.c diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ef3e1a..fb4eece 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Update file and winnowing filters - Remove filter for '.whl' file extensions +- Added dir extension filter ## [0.7.0] - 2024-04-04 ### Added diff --git a/src/main/java/com/scanoss/Scanner.java b/src/main/java/com/scanoss/Scanner.java index 97afd1a..c859228 100644 --- a/src/main/java/com/scanoss/Scanner.java +++ b/src/main/java/com/scanoss/Scanner.java @@ -156,19 +156,29 @@ public String wfpFile(@NonNull String filename) throws ScannerException, Winnowi * @return true if the folder should be skipped, false otherwise */ private Boolean filterFolder(String name) { + String nameLower = name.toLowerCase(); if (!hiddenFilesFolders && name.startsWith(".") && !name.equals(".")) { log.trace("Skipping hidden folder: {}", name); return true; } + boolean ignore = false; if (!allFolders) { // skip this check if all folders is selected for (String ending : ScanossConstants.FILTERED_DIRS) { - if (name.endsWith(ending)) { + if (nameLower.endsWith(ending)) { log.trace("Skipping folder due to ending: {} - {}", name, ending); - return true; + ignore = true; + } + } + if(!ignore){ + for (String ending : ScanossConstants.FILTERED_DIR_EXT) { + if (nameLower.endsWith(ending)) { + log.trace("Skipping folder due to ending: {} - {}", name, ending); + ignore = true; + } } } } - return false; + return ignore; } /** diff --git a/src/main/java/com/scanoss/ScanossConstants.java b/src/main/java/com/scanoss/ScanossConstants.java index bd581e4..c1ee615 100644 --- a/src/main/java/com/scanoss/ScanossConstants.java +++ b/src/main/java/com/scanoss/ScanossConstants.java @@ -36,7 +36,7 @@ public class ScanossConstants { ".o", ".a", ".so", ".obj", ".dll", ".lib", ".out", ".app", ".bin", ".lst", ".dat", ".json", ".htm", ".html", ".xml", ".md", ".txt", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".pages", ".key", ".numbers", - ".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2", ".xsd", ".pom" + ".pdf", ".min.js", ".mf", ".sum", ".woff", ".woff2", ".xsd", ".pom", ".whl" ); // Folders to skip @@ -45,6 +45,10 @@ public class ScanossConstants { "__pypackages__", "target" ); + // Folder endings to skip + static final List FILTERED_DIR_EXT = List.of(".egg-info"); + + // File extensions to skip static final List FILTERED_EXTENSIONS = Arrays.asList( ".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".ac", ".adoc", ".am", diff --git a/src/test/java/com/scanoss/TestScanner.java b/src/test/java/com/scanoss/TestScanner.java index e883a1c..efc4269 100644 --- a/src/test/java/com/scanoss/TestScanner.java +++ b/src/test/java/com/scanoss/TestScanner.java @@ -242,7 +242,7 @@ public void TestScannerScanFileListNegative() { log.info("Finished {} -->", methodName); } - @Test + @Test public void TestScannerTemplate() { String methodName = new Object() { }.getClass().getEnclosingMethod().getName(); @@ -250,4 +250,22 @@ public void TestScannerTemplate() { log.info("Finished {} -->", methodName); } + + @Test + public void TestIgnoreFolderExtension() { + String methodName = new Object() { + }.getClass().getEnclosingMethod().getName(); + log.info("<-- Starting {}", methodName); + + Scanner scanner = Scanner.builder().build(); + String folder = "testing/data/test-folder-ignore"; + List results = scanner.scanFolder(folder); + log.info("Received {} results", results.size()); + assertFalse("Scan results should be empty", results.isEmpty()); + assertEquals("Results should be one", results.size() , 1); + + log.info("Finished {} -->", methodName); + } + + } diff --git a/testing/data/test-folder-ignore/test-no-ignore/scanner.c b/testing/data/test-folder-ignore/test-no-ignore/scanner.c new file mode 100644 index 0000000..0ed7256 --- /dev/null +++ b/testing/data/test-folder-ignore/test-no-ignore/scanner.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * src/scanner.c + * + * A simple SCANOSS client in C for direct file scanning + * + * Copyright (C) 2022, SCANOSS + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "scanner.h" +#include "blacklist_ext.h" +#include "winnowing.h" +#include "log.h" +#include "format_utils.h" +/*SCANNER PRIVATE PROPERTIES*/ + + +#define MAX_FILES_CHUNK (1<<31) + +#define DEFAULT_WFP_SCAN_FILE_NAME "scan.wfp" +#define DEFAULT_RESULT_NAME "scanner_output.txt" + +const char EXCLUDED_DIR[] = " .git, .svn, .eggs, __pycache__, node_modules,"; +const char EXCLUDED_EXTENSIONS[] = " .1, .2, .3, .4, .5, .6, .7, .8, .9, .ac, .adoc, .am," + " .asciidoc, .bmp, .build, .cfg, .chm, .class, .cmake, .cnf," + " .conf, .config, .contributors, .copying, .crt, .csproj, .css," + " .csv, .cvsignore, .dat, .data, .doc, .ds_store, .dtd, .dts," + " .dtsi, .dump, .eot, .eps, .geojson, .gdoc, .gif, .gitignore," + " .glif, .gmo, .gradle, .guess, .hex, .htm, .html, .ico, .in," + " .inc, .info, .ini, .ipynb, .jpeg, .jpg, .json, .jsonld," + " .log, .m4, .map, .markdown, .md, .md5, .meta, .mk, .mxml," + " .o, .otf, .out, .pbtxt, .pdf, .pem, .phtml, .plist, .png," + " .po, .ppt, .prefs, .properties, .pyc, .qdoc, .result, .rgb," + " .rst, .scss, .sha, .sha1, .sha2, .sha256, .sln, .spec, .sql," + " .sub, .svg, .svn-base, .tab, .template, .test, .tex, .tiff," + " .toml, .ttf, .txt, .utf-8, .vim, .wav, .whl, .woff, .xht," + " .xhtml, .xls, .xml, .xpm, .xsd, .xul, .yaml, .yml,"; + + +static int curl_request(int api_req, char* endpoint, char* data,scanner_object_t *s); + +/* Returns a hexadecimal representation of the first "len" bytes in "bin" */ +static char *bin_to_hex(uint8_t *bin, uint32_t len) +{ + char digits[] = "0123456789abcdef"; + char *out = malloc(2 * len + 1); + uint32_t ptr = 0; + + for (uint32_t i = 0; i < len; i++) + { + out[ptr++] = digits[(bin[i] & 0xF0) >> 4]; + out[ptr++] = digits[bin[i] & 0x0F]; + } + + out[ptr] = 0; + return out; +} + +static char *read_file(char *path, long *length) +{ + /* Read file into memory */ + FILE *fp = fopen(path, "rb"); + fseek(fp, 0, SEEK_END); + *length = ftell(fp); + char *src = calloc(*length + 2, 1); + fseek(fp, 0, SEEK_SET); + fread(src, 1, *length, fp); + fclose(fp); + return src; +} + +static long millis() +{ + struct timespec _t; + clock_gettime(CLOCK_REALTIME, &_t); + return _t.tv_sec*1000 + lround(_t.tv_nsec/1.0e6); +} + +static bool scanner_is_dir(char *path) +{ + struct stat pstat; + if (!stat(path, &pstat)) + if (S_ISDIR(pstat.st_mode)) + return true; + return false; +} + +static bool scanner_is_file(char *path) +{ + struct stat pstat; + if (!stat(path, &pstat)) + if (S_ISREG(pstat.st_mode)) + return true; + return false; +} + + +static void scanner_write_none_result(scanner_object_t *s, char * path) +{ + fprintf(s->output, "\"%s\":[{\n\"id\":\"none\"\n}]\n,\n", path); +} + +static uint key_count(char * buffer, const char * key) +{ + char *found = strstr(buffer, key);; + uint count = 0; + while(found) + { + found += strlen(key); + found = strstr(found,key); + count++; + } + return count; +} + +static uint wfp_files_count(scanner_object_t *s) +{ + const char file_key[] = "file="; + long buffer_size = 0; //size of wfp file + char *wfp_buffer = read_file(s->wfp_path, &buffer_size); + uint count =key_count(wfp_buffer,file_key); + free(wfp_buffer); + s->status.wfp_files = count; + return count; +} + +/* Scan a file */ +static bool scanner_file_proc(scanner_object_t *s, char *path) +{ + bool state = true; + char *wfp_buffer; + char *ext = strrchr(path, '.'); + if (!ext) + return state; + + char f_extension[strlen(ext) + 3]; + + /*File extension filter*/ + sprintf(f_extension, " %s,", ext); + + if (strstr(EXCLUDED_EXTENSIONS, f_extension)) + { + log_trace("Excluded extension: %s", ext); + scanner_write_none_result(s, path); //add none id to ignored files + return true; //avoid filtered extensions + } + + s->status.state = SCANNER_STATE_WFP_CALC; //update scanner state + + //If we have a wfp file, add the content to the main wfp file. + if (!strcmp(ext, ".wfp")) + { + log_debug("is a wfp file: %s", path); + long len = 0; + wfp_buffer = read_file(path, &len); + + //ensure line end character + wfp_buffer[len] = '\n'; + s->status.wfp_files += key_count(wfp_buffer,"file=") - 1; //correct the total files number + } + else + { + wfp_buffer = calloc(MAX_FILE_SIZE, 1); + *wfp_buffer = 0; + scanner_wfp_capture(path,NULL, wfp_buffer); + } + + if (*wfp_buffer) + { + FILE *wfp_f = fopen(s->wfp_path, "a+"); + fprintf(wfp_f, "%s", wfp_buffer); + fclose(wfp_f); + state = false; + s->status.wfp_files++; //update scanner proc. files + } + else + { + scanner_write_none_result(s, path); //add none id to ignored files + log_trace("No wfp: %s", path); + } + + free(wfp_buffer); + + if (s->callback && s->status.wfp_files % 100 == 0) + s->callback(&s->status,SCANNER_EVT_WFP_CALC_IT); + + return state; +} + +static bool get_last_component(char * buffer, char * component) +{ + bool state = true; + + char * last = buffer; + const char key[] = "\"component\":"; + + while (last < buffer + strlen(buffer) && last != NULL) + { + last = strstr(last, key); + + if (last) + { + char * comp_first_letter = strchr(last,':') + 2; + + if (*comp_first_letter != ' ') + { + char * comp_last_letter = strchr(last,','); + memset(component,0,MAX_COMPONENT_SIZE); + strncpy(component,comp_first_letter+1,comp_last_letter-comp_first_letter-2); + state = false; + } + + last += strlen(key); + } + + } + return state; +} + +void json_correct(char * target) +{ + size_t file_length = strlen(target); + + char buffer[file_length]; + char *insert_point = &buffer[0]; + const char *tmp = target; + + char * needle; + char * replacement; + + asprintf(&needle,"}\n\r\n{"); + asprintf(&replacement,"\n\r,\r\n"); + + size_t needle_len = strlen(needle); + size_t repl_len = strlen(replacement); + + while (1) { + const char *p = strstr(tmp, needle); + + // walked past last occurrence of needle; copy remaining part + if (p == NULL) + { + strcpy(insert_point, tmp); + break; + } + + // copy part before needle + memcpy(insert_point, tmp, p - tmp); + insert_point += p - tmp; + + // copy replacement string + memcpy(insert_point, replacement, repl_len); + insert_point += repl_len; + + // adjust pointers, move on + tmp = p + needle_len; + } + memset(target,0,file_length); + strcpy(target,buffer); + free(needle); + free(replacement); +} + +static bool scan_request_by_chunks(scanner_object_t *s) +{ +#define START_FIND_COMP_FROM_END 36864 + + const char file_key[] = "file="; + bool state = true; + + int files_count = 0; + + long buffer_size = 0; //size of wfp file + char *wfp_buffer = read_file(s->wfp_path, &buffer_size); + wfp_buffer[buffer_size] = 0; + + char * last_file = wfp_buffer; + char * prev_file = wfp_buffer; + char * last_chunk = wfp_buffer; + + char post_response_buffer[START_FIND_COMP_FROM_END+1]; + int post_response_pos = 0; + long chunk_start_time = 0; + fpos_t file_pos; + + asprintf(&s->curl_temp_path,"%s.tmp",s->output_path); + + s->status.state = SCANNER_STATE_ANALIZING; + log_debug("ID: %s - Scanning, it could take some time, please be patient",s->status.id); + //walk over wfp buffer search for file key + s->status.total_response_time = millis(); + while(last_file - wfp_buffer < buffer_size) + { + chunk_start_time = millis(); + last_file = strstr(last_file,file_key); + + if (last_file - last_chunk > s->files_chunk_size || (last_file == NULL)) + { + if (last_file == NULL) + prev_file = &wfp_buffer[buffer_size]; + + //exact a new chunk from wfp file + uint size = prev_file - last_chunk; + if (size == 0) + { + size = s->files_chunk_size -1; + prev_file = last_file; + } + + char *chunk_buffer = calloc(size + 1, 1); + strncpy(chunk_buffer,last_chunk,size); + + s->status.scanned_files = files_count; //update proc. files + last_chunk = prev_file; + last_file = prev_file; + //define the component context, find the last component in the output file. + post_response_pos = ftell(s->output); + + memset(post_response_buffer,0,sizeof(post_response_buffer)); + + if (post_response_pos < START_FIND_COMP_FROM_END) + { + fseek(s->output,0L,SEEK_SET); + } + else + { + fseek(s->output,-1*START_FIND_COMP_FROM_END,SEEK_END); + } + + //go back in the output file and find the last component + fread(post_response_buffer,1,START_FIND_COMP_FROM_END,s->output); + get_last_component(post_response_buffer,s->status.component_last); + + log_debug("Last found component: %s", s->status.component_last); + + fseek(s->output,0L,SEEK_END); + + //get the result from the last chunk - It will be append to the output file + fgetpos(s->output, &file_pos); + curl_request(API_REQ_POST,"scan/direct",chunk_buffer,s); + + /*read curl response and correct the json */ + long chunk_resp_size; + char * chunk_resp = read_file(s->curl_temp_path, &chunk_resp_size); + char * last_bracket = strrchr(chunk_resp,'}'); + *last_bracket = ','; //replace } by , + fwrite(chunk_resp+1, 1, chunk_resp_size - 1, s->output); // avoid first { + free(chunk_resp); + + free(chunk_buffer); + state = false; + s->status.last_chunk_response_time = millis() - chunk_start_time; + log_debug("ID: %s - Chunk proc. end, %u processed files in %ld ms", s->status.id, s->status.scanned_files,millis() - s->status.total_response_time); + sprintf(s->status.message, "CHUNK_PROC_%lu_ms", s->status.last_chunk_response_time); + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_CHUNK_PROC); + } + + } + else + { + files_count++; + } + prev_file = last_file; + last_file += strlen(file_key); + } + s->status.total_response_time = millis() - s->status.total_response_time; + + fseek(s->output,-4L,SEEK_END); + fprintf(s->output,"\n}"); + + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_CHUNK_PROC_END); + } + free(wfp_buffer); + remove(s->curl_temp_path); //delete tmp file + free(s->curl_temp_path); + + s->status.state = SCANNER_STATE_OK; + return state; + +} + +/* Scan all files from a Directory*/ +static bool scanner_dir_proc(scanner_object_t *s, char *path) +{ + + bool state = true; //true if were a error + + DIR *d = opendir(path); + if (d == NULL) + return false; + struct dirent *entry; // for the directory entries + + //remove "./" from path + if (path[0] == '.' && path[1] == '/') + { + path+=2; + } + + while ((entry = readdir(d)) != NULL) + { + char temp[strlen(path) + strlen(entry->d_name) + 2]; + + sprintf(temp, "%s/%s", path, entry->d_name); + + if (entry->d_type == DT_DIR) + { + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + /*Directory filter */ + char f_dir[strlen(entry->d_name) + 3]; + sprintf(f_dir, " %s,", entry->d_name); + if (strstr(EXCLUDED_DIR, f_dir)) + { + log_trace("Excluded Directory: %s", entry->d_name); + continue; + } + scanner_dir_proc(s, temp); //If its a valid directory, then process it + } + else if (scanner_is_file(temp)) + { + if (!scanner_file_proc(s ,temp)) + { + log_trace("Scan: %s", temp); + } + state = false; + } + } + + closedir(d); + return state; +} + + +static int curl_request(int api_req,char * endpoint, char* data, scanner_object_t *s) +{ + char *m_host; + char *user_version; + char *user_session; + char *flags; + curl_mime *mime = NULL; + + long m_port = strtol(s->API_port, NULL, 10); + + asprintf(&user_session, "X-session: %s", s->API_session); + asprintf(&user_version, "User-Agent: "SCANOSS_CLIENT_NAME"/"VERSION); + asprintf(&flags,"%u",s->flags); + + s->curl_temp = fopen(s->curl_temp_path, "w+"); + + if (api_req == API_REQ_POST) + asprintf(&m_host, "%s/%s", s->API_host,endpoint); + + else + asprintf(&m_host,"%s/%s/%s",s->API_host,endpoint,data); + + CURL *curl; + CURLcode res; + /* In windows, this will init the winsock stuff */ + res = curl_global_init(CURL_GLOBAL_DEFAULT); + /* Check for errors */ + if (res != CURLE_OK) + { + log_fatal("curl_global_init() failed: %s\n", + curl_easy_strerror(res)); + return 1; + } + + /* get a curl handle */ + curl = curl_easy_init(); + if (curl) + { + /* First set the URL that is about to receive our POST. */ + curl_easy_setopt(curl, CURLOPT_URL, m_host); + curl_easy_setopt(curl, CURLOPT_PORT, m_port); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); //curl ignore certificates + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L); //curl ignore certificates + curl_easy_setopt(curl, CURLOPT_WRITEDATA, s->curl_temp); + + if (log_level_is_enabled(LOG_TRACE)) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); + + curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); + + struct curl_slist *chunk = NULL; + chunk = curl_slist_append(chunk, "Connection: close"); + chunk = curl_slist_append(chunk, user_version); + chunk = curl_slist_append(chunk, user_session); + chunk = curl_slist_append(chunk, "Expect:"); + chunk = curl_slist_append(chunk, "Accept: */*"); + + res = curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); + + if (api_req == API_REQ_POST) + { + curl_mimepart *part; + mime = curl_mime_init(curl); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "format"); + //curl_mime_data(part, s->format, CURL_ZERO_TERMINATED); + //we are forcing to plain format because spdx and cyclonedx are processing local. + curl_mime_data(part, "plain", CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "flags"); + curl_mime_data(part, flags, CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "context"); + curl_mime_data(part, s->status.component_last, CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "file"); + curl_mime_filename(part, "scan.wfp"); + curl_mime_type(part,"application/octet-stream"); + curl_mime_data(part, data, CURL_ZERO_TERMINATED); + + curl_easy_setopt(curl, CURLOPT_MIMEPOST, mime); + } + + /* Perform the request, res will get the return code */ + res = curl_easy_perform(curl); + + /* Check for errors */ + if (res != CURLE_OK) + { + log_error("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_ERROR_CURL); + } + } + + fclose(s->curl_temp); + /* always cleanup */ + curl_easy_cleanup(curl); + curl_slist_free_all(chunk); + } + + curl_global_cleanup(); + free(flags); + free(m_host); + free(user_session); + free(user_version); + if (API_REQ_POST) + curl_mime_free(mime); + + return 0; + +} +bool print_format(scanner_object_t * s) +{ + if (!strcmp(s->format, SCANNER_FORMAT_PLAIN)) + return false; + + fprintf(stderr, "\nPrinting the selected format: %s", s->format); + if(scan_parse_v2(s->output_path)) + { + log_error("There was a error parsing the json file, please check the output: %s", s->output_path); + return true; + } + + //rewrite output path with the selected format + s->output = fopen(s->output_path, "w+"); + print_matches(s->output, s->format); + fclose(s->output); + return false; +} + +/********* PUBLIC FUNTIONS DEFINITION ************/ + +void scanner_set_format(scanner_object_t *s, char *form) +{ + if (!form) + return; + + if (strstr(form, "plain") || strstr(form, "spdx") || strstr(form, "cyclonedx")) + { + strncpy(s->format, form, sizeof(s->format)); + } + else + log_debug("%s is not a valid output format, using plain\n", form); + +} + +void scanner_set_host(scanner_object_t *s, char *host) +{ + if (!host || strcmp(host," ") == 0) + return; + + memset(s->API_host, '\0', sizeof(s->API_host)); + strncpy(s->API_host, host, sizeof(s->API_host)); + log_debug("Host set: %s", s->API_host); +} + +void scanner_set_port(scanner_object_t *s, char *port) +{ + if (!port || strcmp(port," ") == 0) + return; + + memset(s->API_port, '\0', sizeof(s->API_port)); + strncpy(s->API_port, port, sizeof(s->API_port)); + log_debug("Port set: %s", s->API_port); +} + +void scanner_set_session(scanner_object_t *s, char *session) +{ + if (!session || strcmp(session," ") == 0) + return; + + memset(s->API_session, '\0', sizeof(s->API_session)); + strncpy(s->API_session, session, sizeof(s->API_session)); + log_debug("Session set: %s", s->API_session); +} + +void scanner_set_log_level(int level) +{ + log_set_level(level); +} + +void scanner_set_log_file(char *log) +{ + log_set_file(log); +} + +void scanner_set_output(scanner_object_t * e, char * f) +{ + if (!f) + { + asprintf(&e->output_path,"%s", DEFAULT_RESULT_NAME); + } + else + e->output_path = f; + + e->output = fopen(e->output_path, "w+"); + if (!e->output) + log_fatal("Failed to open the output file. Check the if the permmisions are right and if the directory exist"); + + //open json file + fprintf(e->output,"{\n"); + log_debug("ID: %s -File open: %s", e->status.id, e->output_path); +} + +void scanner_wfp_capture(char *path, char **md5, char *wfp_buffer) +{ + char *hex_md5 = NULL; + long length = 0; + char *src = read_file(path, &length); + //no external memory parameter, normal execution + if (md5 == NULL) + { + if (length > MIN_FILE_SIZE && !unwanted_header(src)) + { + /* Calculate MD5 */ + uint8_t bin_md5[16] = "\0"; + MD5((uint8_t *)src, length, bin_md5); + hex_md5 = bin_to_hex(bin_md5, 16); + } + } + //external reference, but null. Reserve memory and calc md5. + else if (*md5 == NULL) + { + /* Calculate MD5 */ + uint8_t bin_md5[16] = "\0"; + MD5((uint8_t *)src, length, bin_md5); + hex_md5 = bin_to_hex(bin_md5, 16); + *md5 = strdup(hex_md5); + } + //external md5, use it + else + { + hex_md5 = *md5; + } + + /* Skip if file is under threshold or if content is not wanted*/ + if (length < MIN_FILE_SIZE || unwanted_header(src)) + { + free(src); + return; + } + + /* Save file information to buffer */ + sprintf(wfp_buffer + strlen(wfp_buffer), "file=%s,%lu,%s\n", hex_md5, length, path); + free(hex_md5); + + /* If it is not binary (chr(0) found), calculate snippet wfps */ + if (strlen(src) == length && length < MAX_FILE_SIZE) + { + /* Capture hashes (Winnowing) */ + uint32_t *hashes = malloc(MAX_FILE_SIZE); + uint32_t *lines = malloc(MAX_FILE_SIZE); + uint32_t last_line = 0; + + /* Calculate hashes */ + uint32_t size = winnowing(src, hashes, lines, MAX_FILE_SIZE); + + /* Write hashes to buffer */ + for (int i = 0; i < size; i++) + { + if (last_line != lines[i]) + { + if (last_line != 0) + strcat(wfp_buffer, "\n"); + sprintf(wfp_buffer + strlen(wfp_buffer), "%d=%08x", lines[i], hashes[i]); + last_line = lines[i]; + } + else + sprintf(wfp_buffer + strlen(wfp_buffer), ",%08x", hashes[i]); + } + strcat(wfp_buffer, "\n"); + free(hashes); + free(lines); + } + free(src); +} + +int scanner_recursive_scan(scanner_object_t * scanner, bool wfp_only) +{ + if (!scanner) + { + log_fatal("Scanner object need to proceed"); + } + scanner->status.state = SCANNER_STATE_INIT; + scanner->status.wfp_files = 0; + scanner->status.scanned_files = 0; + scanner->status.wfp_total_time = millis(); + scanner->status.last_chunk_response_time = 0; + scanner->status.total_response_time = 0; + asprintf(&scanner->wfp_path,"%s.wfp",scanner->output_path); + strcpy(scanner->status.message, "WFP_CALC_START\0"); + log_debug("ID: %s - Scan start - WFP Calculation", scanner->status.id); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_START); + } + //check if exist the output file + if (!scanner->output) + scanner_set_output(scanner, NULL); + + /*create blank wfp file*/ + FILE *wfp_f = fopen(scanner->wfp_path, "w+"); + fclose(wfp_f); + + if (scanner_is_file(scanner->scan_path)) + { + scanner_file_proc(scanner, scanner->scan_path); + } + else if (scanner_is_dir(scanner->scan_path)) + { + int path_len = strlen(scanner->scan_path); + if (path_len > 1 && scanner->scan_path[path_len - 1] == '/') //remove extra '/' + scanner->scan_path[path_len - 1] = '\0'; + + scanner_dir_proc(scanner, scanner->scan_path); + } + else + { + scanner->status.state = SCANNER_STATE_ERROR; + log_error("\"%s\" is not a file\n", scanner->scan_path); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_ERROR); + } + } + scanner->status.wfp_total_time = millis() - scanner->status.wfp_total_time; + log_debug("ID: %s - WFP calculation end, %u processed files in %ld ms", scanner->status.id, scanner->status.wfp_files, scanner->status.wfp_total_time); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_WFP_CALC_END); + } + + strcpy(scanner->status.message, "WFP_CALC_END\0"); + + if (wfp_only) + return scanner->status.state; + + scan_request_by_chunks(scanner); + free(scanner->wfp_path); + + if (scanner->output) + { + fclose(scanner->output); + } + //print the selected format or do nothing if it is plain. + print_format(scanner); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_END); + } + strcpy(scanner->status.message, "FINISHED\0"); + + return scanner->status.state; +} + +int scanner_wfp_scan(scanner_object_t * scanner) +{ + if (!scanner) + { + log_fatal("Scanner object need to proceed"); + } + scanner->status.state = SCANNER_STATE_INIT; + scanner->status.wfp_files = 0; + scanner->status.scanned_files = 0; + scanner->status.wfp_total_time = millis(); + scanner->status.last_chunk_response_time = 0; + scanner->status.total_response_time = 0; + + if(!scanner_is_file(scanner->scan_path)) + { + log_debug("wfp_scan only works with wfp files"); + return SCANNER_STATE_ERROR; + } + + scanner->wfp_path = scanner->scan_path; + + if (wfp_files_count(scanner) == 0) + return SCANNER_STATE_ERROR; + + strcpy(scanner->status.message, "WFP_PROC_START\0"); + log_debug("ID: %s - Scan start - Scanning WFP file by chunks", scanner->status.id); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_CHUNK_PROC); + } + + scan_request_by_chunks(scanner); + + if (scanner->output) + fclose(scanner->output); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_END); + } + strcpy(scanner->status.message, "FINISHED\0"); + + return scanner->status.state; + +} + + +int scanner_get_file_contents(scanner_object_t *scanner, char * hash) +{ + scanner->curl_temp_path = scanner->output_path; + int err_code = curl_request(API_REQ_GET,"file_contents",hash,scanner); + + return err_code; +} + +int scanner_get_license_obligations(scanner_object_t *scanner, char * license_name) +{ + scanner->curl_temp_path = scanner->output_path; + int err_code = curl_request(API_REQ_GET,"license/obligations",license_name,scanner); + + return err_code; +} + +bool scanner_get_attribution(scanner_object_t *scanner, char * path) +{ + long len; + char * data = read_file(path,&len); + + scanner->curl_temp_path = scanner->output_path; + int state = curl_request(API_REQ_POST,"sbom/attribution", data, scanner); + + free(data); + return state; +} + + +int scanner_print_output(scanner_object_t *scanner) +{ + bool state = true; + + if (!scanner->output_path) + return 1; + + FILE * output = fopen(scanner->output_path, "r"); + char c; + + if (output) + { + while ((c = getc(output)) != EOF) + putchar(c); + + fclose(output); + state = false; + } + + free(scanner->output_path); + return state; +} +scanner_object_t * scanner_create(char * id, char * host, char * port, char * session, char * format, char * path, char * file, scanner_flags_t flags, scanner_evt_handler callback) +{ + scanner_object_t *scanner = calloc(1, sizeof(scanner_object_t)); + scanner_object_t init = __SCANNER_OBJECT_INIT(path,file); + init.callback = callback; + strncpy(init.status.id, id, MAX_ID_LEN); + + //copy default config + memcpy(scanner,&init,sizeof(scanner_object_t)); + + scanner_set_host(scanner, host); + scanner_set_port(scanner, port); + scanner_set_session(scanner, session); + + if (flags > 0) + scanner->flags = flags; + + scanner_set_format(scanner, format); + scanner_set_output(scanner, file); + + + strcpy(scanner->status.message, "SCANNER_CREATED\0"); + return scanner; +} + + +void scanner_object_free(scanner_object_t * scanner) +{ + free(scanner); +} \ No newline at end of file diff --git a/testing/data/test-folder-ignore/test.egg-info/scanner.c b/testing/data/test-folder-ignore/test.egg-info/scanner.c new file mode 100644 index 0000000..0ed7256 --- /dev/null +++ b/testing/data/test-folder-ignore/test.egg-info/scanner.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * src/scanner.c + * + * A simple SCANOSS client in C for direct file scanning + * + * Copyright (C) 2022, SCANOSS + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "scanner.h" +#include "blacklist_ext.h" +#include "winnowing.h" +#include "log.h" +#include "format_utils.h" +/*SCANNER PRIVATE PROPERTIES*/ + + +#define MAX_FILES_CHUNK (1<<31) + +#define DEFAULT_WFP_SCAN_FILE_NAME "scan.wfp" +#define DEFAULT_RESULT_NAME "scanner_output.txt" + +const char EXCLUDED_DIR[] = " .git, .svn, .eggs, __pycache__, node_modules,"; +const char EXCLUDED_EXTENSIONS[] = " .1, .2, .3, .4, .5, .6, .7, .8, .9, .ac, .adoc, .am," + " .asciidoc, .bmp, .build, .cfg, .chm, .class, .cmake, .cnf," + " .conf, .config, .contributors, .copying, .crt, .csproj, .css," + " .csv, .cvsignore, .dat, .data, .doc, .ds_store, .dtd, .dts," + " .dtsi, .dump, .eot, .eps, .geojson, .gdoc, .gif, .gitignore," + " .glif, .gmo, .gradle, .guess, .hex, .htm, .html, .ico, .in," + " .inc, .info, .ini, .ipynb, .jpeg, .jpg, .json, .jsonld," + " .log, .m4, .map, .markdown, .md, .md5, .meta, .mk, .mxml," + " .o, .otf, .out, .pbtxt, .pdf, .pem, .phtml, .plist, .png," + " .po, .ppt, .prefs, .properties, .pyc, .qdoc, .result, .rgb," + " .rst, .scss, .sha, .sha1, .sha2, .sha256, .sln, .spec, .sql," + " .sub, .svg, .svn-base, .tab, .template, .test, .tex, .tiff," + " .toml, .ttf, .txt, .utf-8, .vim, .wav, .whl, .woff, .xht," + " .xhtml, .xls, .xml, .xpm, .xsd, .xul, .yaml, .yml,"; + + +static int curl_request(int api_req, char* endpoint, char* data,scanner_object_t *s); + +/* Returns a hexadecimal representation of the first "len" bytes in "bin" */ +static char *bin_to_hex(uint8_t *bin, uint32_t len) +{ + char digits[] = "0123456789abcdef"; + char *out = malloc(2 * len + 1); + uint32_t ptr = 0; + + for (uint32_t i = 0; i < len; i++) + { + out[ptr++] = digits[(bin[i] & 0xF0) >> 4]; + out[ptr++] = digits[bin[i] & 0x0F]; + } + + out[ptr] = 0; + return out; +} + +static char *read_file(char *path, long *length) +{ + /* Read file into memory */ + FILE *fp = fopen(path, "rb"); + fseek(fp, 0, SEEK_END); + *length = ftell(fp); + char *src = calloc(*length + 2, 1); + fseek(fp, 0, SEEK_SET); + fread(src, 1, *length, fp); + fclose(fp); + return src; +} + +static long millis() +{ + struct timespec _t; + clock_gettime(CLOCK_REALTIME, &_t); + return _t.tv_sec*1000 + lround(_t.tv_nsec/1.0e6); +} + +static bool scanner_is_dir(char *path) +{ + struct stat pstat; + if (!stat(path, &pstat)) + if (S_ISDIR(pstat.st_mode)) + return true; + return false; +} + +static bool scanner_is_file(char *path) +{ + struct stat pstat; + if (!stat(path, &pstat)) + if (S_ISREG(pstat.st_mode)) + return true; + return false; +} + + +static void scanner_write_none_result(scanner_object_t *s, char * path) +{ + fprintf(s->output, "\"%s\":[{\n\"id\":\"none\"\n}]\n,\n", path); +} + +static uint key_count(char * buffer, const char * key) +{ + char *found = strstr(buffer, key);; + uint count = 0; + while(found) + { + found += strlen(key); + found = strstr(found,key); + count++; + } + return count; +} + +static uint wfp_files_count(scanner_object_t *s) +{ + const char file_key[] = "file="; + long buffer_size = 0; //size of wfp file + char *wfp_buffer = read_file(s->wfp_path, &buffer_size); + uint count =key_count(wfp_buffer,file_key); + free(wfp_buffer); + s->status.wfp_files = count; + return count; +} + +/* Scan a file */ +static bool scanner_file_proc(scanner_object_t *s, char *path) +{ + bool state = true; + char *wfp_buffer; + char *ext = strrchr(path, '.'); + if (!ext) + return state; + + char f_extension[strlen(ext) + 3]; + + /*File extension filter*/ + sprintf(f_extension, " %s,", ext); + + if (strstr(EXCLUDED_EXTENSIONS, f_extension)) + { + log_trace("Excluded extension: %s", ext); + scanner_write_none_result(s, path); //add none id to ignored files + return true; //avoid filtered extensions + } + + s->status.state = SCANNER_STATE_WFP_CALC; //update scanner state + + //If we have a wfp file, add the content to the main wfp file. + if (!strcmp(ext, ".wfp")) + { + log_debug("is a wfp file: %s", path); + long len = 0; + wfp_buffer = read_file(path, &len); + + //ensure line end character + wfp_buffer[len] = '\n'; + s->status.wfp_files += key_count(wfp_buffer,"file=") - 1; //correct the total files number + } + else + { + wfp_buffer = calloc(MAX_FILE_SIZE, 1); + *wfp_buffer = 0; + scanner_wfp_capture(path,NULL, wfp_buffer); + } + + if (*wfp_buffer) + { + FILE *wfp_f = fopen(s->wfp_path, "a+"); + fprintf(wfp_f, "%s", wfp_buffer); + fclose(wfp_f); + state = false; + s->status.wfp_files++; //update scanner proc. files + } + else + { + scanner_write_none_result(s, path); //add none id to ignored files + log_trace("No wfp: %s", path); + } + + free(wfp_buffer); + + if (s->callback && s->status.wfp_files % 100 == 0) + s->callback(&s->status,SCANNER_EVT_WFP_CALC_IT); + + return state; +} + +static bool get_last_component(char * buffer, char * component) +{ + bool state = true; + + char * last = buffer; + const char key[] = "\"component\":"; + + while (last < buffer + strlen(buffer) && last != NULL) + { + last = strstr(last, key); + + if (last) + { + char * comp_first_letter = strchr(last,':') + 2; + + if (*comp_first_letter != ' ') + { + char * comp_last_letter = strchr(last,','); + memset(component,0,MAX_COMPONENT_SIZE); + strncpy(component,comp_first_letter+1,comp_last_letter-comp_first_letter-2); + state = false; + } + + last += strlen(key); + } + + } + return state; +} + +void json_correct(char * target) +{ + size_t file_length = strlen(target); + + char buffer[file_length]; + char *insert_point = &buffer[0]; + const char *tmp = target; + + char * needle; + char * replacement; + + asprintf(&needle,"}\n\r\n{"); + asprintf(&replacement,"\n\r,\r\n"); + + size_t needle_len = strlen(needle); + size_t repl_len = strlen(replacement); + + while (1) { + const char *p = strstr(tmp, needle); + + // walked past last occurrence of needle; copy remaining part + if (p == NULL) + { + strcpy(insert_point, tmp); + break; + } + + // copy part before needle + memcpy(insert_point, tmp, p - tmp); + insert_point += p - tmp; + + // copy replacement string + memcpy(insert_point, replacement, repl_len); + insert_point += repl_len; + + // adjust pointers, move on + tmp = p + needle_len; + } + memset(target,0,file_length); + strcpy(target,buffer); + free(needle); + free(replacement); +} + +static bool scan_request_by_chunks(scanner_object_t *s) +{ +#define START_FIND_COMP_FROM_END 36864 + + const char file_key[] = "file="; + bool state = true; + + int files_count = 0; + + long buffer_size = 0; //size of wfp file + char *wfp_buffer = read_file(s->wfp_path, &buffer_size); + wfp_buffer[buffer_size] = 0; + + char * last_file = wfp_buffer; + char * prev_file = wfp_buffer; + char * last_chunk = wfp_buffer; + + char post_response_buffer[START_FIND_COMP_FROM_END+1]; + int post_response_pos = 0; + long chunk_start_time = 0; + fpos_t file_pos; + + asprintf(&s->curl_temp_path,"%s.tmp",s->output_path); + + s->status.state = SCANNER_STATE_ANALIZING; + log_debug("ID: %s - Scanning, it could take some time, please be patient",s->status.id); + //walk over wfp buffer search for file key + s->status.total_response_time = millis(); + while(last_file - wfp_buffer < buffer_size) + { + chunk_start_time = millis(); + last_file = strstr(last_file,file_key); + + if (last_file - last_chunk > s->files_chunk_size || (last_file == NULL)) + { + if (last_file == NULL) + prev_file = &wfp_buffer[buffer_size]; + + //exact a new chunk from wfp file + uint size = prev_file - last_chunk; + if (size == 0) + { + size = s->files_chunk_size -1; + prev_file = last_file; + } + + char *chunk_buffer = calloc(size + 1, 1); + strncpy(chunk_buffer,last_chunk,size); + + s->status.scanned_files = files_count; //update proc. files + last_chunk = prev_file; + last_file = prev_file; + //define the component context, find the last component in the output file. + post_response_pos = ftell(s->output); + + memset(post_response_buffer,0,sizeof(post_response_buffer)); + + if (post_response_pos < START_FIND_COMP_FROM_END) + { + fseek(s->output,0L,SEEK_SET); + } + else + { + fseek(s->output,-1*START_FIND_COMP_FROM_END,SEEK_END); + } + + //go back in the output file and find the last component + fread(post_response_buffer,1,START_FIND_COMP_FROM_END,s->output); + get_last_component(post_response_buffer,s->status.component_last); + + log_debug("Last found component: %s", s->status.component_last); + + fseek(s->output,0L,SEEK_END); + + //get the result from the last chunk - It will be append to the output file + fgetpos(s->output, &file_pos); + curl_request(API_REQ_POST,"scan/direct",chunk_buffer,s); + + /*read curl response and correct the json */ + long chunk_resp_size; + char * chunk_resp = read_file(s->curl_temp_path, &chunk_resp_size); + char * last_bracket = strrchr(chunk_resp,'}'); + *last_bracket = ','; //replace } by , + fwrite(chunk_resp+1, 1, chunk_resp_size - 1, s->output); // avoid first { + free(chunk_resp); + + free(chunk_buffer); + state = false; + s->status.last_chunk_response_time = millis() - chunk_start_time; + log_debug("ID: %s - Chunk proc. end, %u processed files in %ld ms", s->status.id, s->status.scanned_files,millis() - s->status.total_response_time); + sprintf(s->status.message, "CHUNK_PROC_%lu_ms", s->status.last_chunk_response_time); + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_CHUNK_PROC); + } + + } + else + { + files_count++; + } + prev_file = last_file; + last_file += strlen(file_key); + } + s->status.total_response_time = millis() - s->status.total_response_time; + + fseek(s->output,-4L,SEEK_END); + fprintf(s->output,"\n}"); + + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_CHUNK_PROC_END); + } + free(wfp_buffer); + remove(s->curl_temp_path); //delete tmp file + free(s->curl_temp_path); + + s->status.state = SCANNER_STATE_OK; + return state; + +} + +/* Scan all files from a Directory*/ +static bool scanner_dir_proc(scanner_object_t *s, char *path) +{ + + bool state = true; //true if were a error + + DIR *d = opendir(path); + if (d == NULL) + return false; + struct dirent *entry; // for the directory entries + + //remove "./" from path + if (path[0] == '.' && path[1] == '/') + { + path+=2; + } + + while ((entry = readdir(d)) != NULL) + { + char temp[strlen(path) + strlen(entry->d_name) + 2]; + + sprintf(temp, "%s/%s", path, entry->d_name); + + if (entry->d_type == DT_DIR) + { + + if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) + continue; + + /*Directory filter */ + char f_dir[strlen(entry->d_name) + 3]; + sprintf(f_dir, " %s,", entry->d_name); + if (strstr(EXCLUDED_DIR, f_dir)) + { + log_trace("Excluded Directory: %s", entry->d_name); + continue; + } + scanner_dir_proc(s, temp); //If its a valid directory, then process it + } + else if (scanner_is_file(temp)) + { + if (!scanner_file_proc(s ,temp)) + { + log_trace("Scan: %s", temp); + } + state = false; + } + } + + closedir(d); + return state; +} + + +static int curl_request(int api_req,char * endpoint, char* data, scanner_object_t *s) +{ + char *m_host; + char *user_version; + char *user_session; + char *flags; + curl_mime *mime = NULL; + + long m_port = strtol(s->API_port, NULL, 10); + + asprintf(&user_session, "X-session: %s", s->API_session); + asprintf(&user_version, "User-Agent: "SCANOSS_CLIENT_NAME"/"VERSION); + asprintf(&flags,"%u",s->flags); + + s->curl_temp = fopen(s->curl_temp_path, "w+"); + + if (api_req == API_REQ_POST) + asprintf(&m_host, "%s/%s", s->API_host,endpoint); + + else + asprintf(&m_host,"%s/%s/%s",s->API_host,endpoint,data); + + CURL *curl; + CURLcode res; + /* In windows, this will init the winsock stuff */ + res = curl_global_init(CURL_GLOBAL_DEFAULT); + /* Check for errors */ + if (res != CURLE_OK) + { + log_fatal("curl_global_init() failed: %s\n", + curl_easy_strerror(res)); + return 1; + } + + /* get a curl handle */ + curl = curl_easy_init(); + if (curl) + { + /* First set the URL that is about to receive our POST. */ + curl_easy_setopt(curl, CURLOPT_URL, m_host); + curl_easy_setopt(curl, CURLOPT_PORT, m_port); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); //curl ignore certificates + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L); //curl ignore certificates + curl_easy_setopt(curl, CURLOPT_WRITEDATA, s->curl_temp); + + if (log_level_is_enabled(LOG_TRACE)) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); + + curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https"); + + struct curl_slist *chunk = NULL; + chunk = curl_slist_append(chunk, "Connection: close"); + chunk = curl_slist_append(chunk, user_version); + chunk = curl_slist_append(chunk, user_session); + chunk = curl_slist_append(chunk, "Expect:"); + chunk = curl_slist_append(chunk, "Accept: */*"); + + res = curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk); + + if (api_req == API_REQ_POST) + { + curl_mimepart *part; + mime = curl_mime_init(curl); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "format"); + //curl_mime_data(part, s->format, CURL_ZERO_TERMINATED); + //we are forcing to plain format because spdx and cyclonedx are processing local. + curl_mime_data(part, "plain", CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "flags"); + curl_mime_data(part, flags, CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "context"); + curl_mime_data(part, s->status.component_last, CURL_ZERO_TERMINATED); + + part = curl_mime_addpart(mime); + curl_mime_name(part, "file"); + curl_mime_filename(part, "scan.wfp"); + curl_mime_type(part,"application/octet-stream"); + curl_mime_data(part, data, CURL_ZERO_TERMINATED); + + curl_easy_setopt(curl, CURLOPT_MIMEPOST, mime); + } + + /* Perform the request, res will get the return code */ + res = curl_easy_perform(curl); + + /* Check for errors */ + if (res != CURLE_OK) + { + log_error("curl_easy_perform() failed: %s\n", curl_easy_strerror(res)); + if (s->callback) + { + s->callback(&s->status,SCANNER_EVT_ERROR_CURL); + } + } + + fclose(s->curl_temp); + /* always cleanup */ + curl_easy_cleanup(curl); + curl_slist_free_all(chunk); + } + + curl_global_cleanup(); + free(flags); + free(m_host); + free(user_session); + free(user_version); + if (API_REQ_POST) + curl_mime_free(mime); + + return 0; + +} +bool print_format(scanner_object_t * s) +{ + if (!strcmp(s->format, SCANNER_FORMAT_PLAIN)) + return false; + + fprintf(stderr, "\nPrinting the selected format: %s", s->format); + if(scan_parse_v2(s->output_path)) + { + log_error("There was a error parsing the json file, please check the output: %s", s->output_path); + return true; + } + + //rewrite output path with the selected format + s->output = fopen(s->output_path, "w+"); + print_matches(s->output, s->format); + fclose(s->output); + return false; +} + +/********* PUBLIC FUNTIONS DEFINITION ************/ + +void scanner_set_format(scanner_object_t *s, char *form) +{ + if (!form) + return; + + if (strstr(form, "plain") || strstr(form, "spdx") || strstr(form, "cyclonedx")) + { + strncpy(s->format, form, sizeof(s->format)); + } + else + log_debug("%s is not a valid output format, using plain\n", form); + +} + +void scanner_set_host(scanner_object_t *s, char *host) +{ + if (!host || strcmp(host," ") == 0) + return; + + memset(s->API_host, '\0', sizeof(s->API_host)); + strncpy(s->API_host, host, sizeof(s->API_host)); + log_debug("Host set: %s", s->API_host); +} + +void scanner_set_port(scanner_object_t *s, char *port) +{ + if (!port || strcmp(port," ") == 0) + return; + + memset(s->API_port, '\0', sizeof(s->API_port)); + strncpy(s->API_port, port, sizeof(s->API_port)); + log_debug("Port set: %s", s->API_port); +} + +void scanner_set_session(scanner_object_t *s, char *session) +{ + if (!session || strcmp(session," ") == 0) + return; + + memset(s->API_session, '\0', sizeof(s->API_session)); + strncpy(s->API_session, session, sizeof(s->API_session)); + log_debug("Session set: %s", s->API_session); +} + +void scanner_set_log_level(int level) +{ + log_set_level(level); +} + +void scanner_set_log_file(char *log) +{ + log_set_file(log); +} + +void scanner_set_output(scanner_object_t * e, char * f) +{ + if (!f) + { + asprintf(&e->output_path,"%s", DEFAULT_RESULT_NAME); + } + else + e->output_path = f; + + e->output = fopen(e->output_path, "w+"); + if (!e->output) + log_fatal("Failed to open the output file. Check the if the permmisions are right and if the directory exist"); + + //open json file + fprintf(e->output,"{\n"); + log_debug("ID: %s -File open: %s", e->status.id, e->output_path); +} + +void scanner_wfp_capture(char *path, char **md5, char *wfp_buffer) +{ + char *hex_md5 = NULL; + long length = 0; + char *src = read_file(path, &length); + //no external memory parameter, normal execution + if (md5 == NULL) + { + if (length > MIN_FILE_SIZE && !unwanted_header(src)) + { + /* Calculate MD5 */ + uint8_t bin_md5[16] = "\0"; + MD5((uint8_t *)src, length, bin_md5); + hex_md5 = bin_to_hex(bin_md5, 16); + } + } + //external reference, but null. Reserve memory and calc md5. + else if (*md5 == NULL) + { + /* Calculate MD5 */ + uint8_t bin_md5[16] = "\0"; + MD5((uint8_t *)src, length, bin_md5); + hex_md5 = bin_to_hex(bin_md5, 16); + *md5 = strdup(hex_md5); + } + //external md5, use it + else + { + hex_md5 = *md5; + } + + /* Skip if file is under threshold or if content is not wanted*/ + if (length < MIN_FILE_SIZE || unwanted_header(src)) + { + free(src); + return; + } + + /* Save file information to buffer */ + sprintf(wfp_buffer + strlen(wfp_buffer), "file=%s,%lu,%s\n", hex_md5, length, path); + free(hex_md5); + + /* If it is not binary (chr(0) found), calculate snippet wfps */ + if (strlen(src) == length && length < MAX_FILE_SIZE) + { + /* Capture hashes (Winnowing) */ + uint32_t *hashes = malloc(MAX_FILE_SIZE); + uint32_t *lines = malloc(MAX_FILE_SIZE); + uint32_t last_line = 0; + + /* Calculate hashes */ + uint32_t size = winnowing(src, hashes, lines, MAX_FILE_SIZE); + + /* Write hashes to buffer */ + for (int i = 0; i < size; i++) + { + if (last_line != lines[i]) + { + if (last_line != 0) + strcat(wfp_buffer, "\n"); + sprintf(wfp_buffer + strlen(wfp_buffer), "%d=%08x", lines[i], hashes[i]); + last_line = lines[i]; + } + else + sprintf(wfp_buffer + strlen(wfp_buffer), ",%08x", hashes[i]); + } + strcat(wfp_buffer, "\n"); + free(hashes); + free(lines); + } + free(src); +} + +int scanner_recursive_scan(scanner_object_t * scanner, bool wfp_only) +{ + if (!scanner) + { + log_fatal("Scanner object need to proceed"); + } + scanner->status.state = SCANNER_STATE_INIT; + scanner->status.wfp_files = 0; + scanner->status.scanned_files = 0; + scanner->status.wfp_total_time = millis(); + scanner->status.last_chunk_response_time = 0; + scanner->status.total_response_time = 0; + asprintf(&scanner->wfp_path,"%s.wfp",scanner->output_path); + strcpy(scanner->status.message, "WFP_CALC_START\0"); + log_debug("ID: %s - Scan start - WFP Calculation", scanner->status.id); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_START); + } + //check if exist the output file + if (!scanner->output) + scanner_set_output(scanner, NULL); + + /*create blank wfp file*/ + FILE *wfp_f = fopen(scanner->wfp_path, "w+"); + fclose(wfp_f); + + if (scanner_is_file(scanner->scan_path)) + { + scanner_file_proc(scanner, scanner->scan_path); + } + else if (scanner_is_dir(scanner->scan_path)) + { + int path_len = strlen(scanner->scan_path); + if (path_len > 1 && scanner->scan_path[path_len - 1] == '/') //remove extra '/' + scanner->scan_path[path_len - 1] = '\0'; + + scanner_dir_proc(scanner, scanner->scan_path); + } + else + { + scanner->status.state = SCANNER_STATE_ERROR; + log_error("\"%s\" is not a file\n", scanner->scan_path); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_ERROR); + } + } + scanner->status.wfp_total_time = millis() - scanner->status.wfp_total_time; + log_debug("ID: %s - WFP calculation end, %u processed files in %ld ms", scanner->status.id, scanner->status.wfp_files, scanner->status.wfp_total_time); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_WFP_CALC_END); + } + + strcpy(scanner->status.message, "WFP_CALC_END\0"); + + if (wfp_only) + return scanner->status.state; + + scan_request_by_chunks(scanner); + free(scanner->wfp_path); + + if (scanner->output) + { + fclose(scanner->output); + } + //print the selected format or do nothing if it is plain. + print_format(scanner); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_END); + } + strcpy(scanner->status.message, "FINISHED\0"); + + return scanner->status.state; +} + +int scanner_wfp_scan(scanner_object_t * scanner) +{ + if (!scanner) + { + log_fatal("Scanner object need to proceed"); + } + scanner->status.state = SCANNER_STATE_INIT; + scanner->status.wfp_files = 0; + scanner->status.scanned_files = 0; + scanner->status.wfp_total_time = millis(); + scanner->status.last_chunk_response_time = 0; + scanner->status.total_response_time = 0; + + if(!scanner_is_file(scanner->scan_path)) + { + log_debug("wfp_scan only works with wfp files"); + return SCANNER_STATE_ERROR; + } + + scanner->wfp_path = scanner->scan_path; + + if (wfp_files_count(scanner) == 0) + return SCANNER_STATE_ERROR; + + strcpy(scanner->status.message, "WFP_PROC_START\0"); + log_debug("ID: %s - Scan start - Scanning WFP file by chunks", scanner->status.id); + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_CHUNK_PROC); + } + + scan_request_by_chunks(scanner); + + if (scanner->output) + fclose(scanner->output); + + if (scanner->callback) + { + scanner->callback(&scanner->status,SCANNER_EVT_END); + } + strcpy(scanner->status.message, "FINISHED\0"); + + return scanner->status.state; + +} + + +int scanner_get_file_contents(scanner_object_t *scanner, char * hash) +{ + scanner->curl_temp_path = scanner->output_path; + int err_code = curl_request(API_REQ_GET,"file_contents",hash,scanner); + + return err_code; +} + +int scanner_get_license_obligations(scanner_object_t *scanner, char * license_name) +{ + scanner->curl_temp_path = scanner->output_path; + int err_code = curl_request(API_REQ_GET,"license/obligations",license_name,scanner); + + return err_code; +} + +bool scanner_get_attribution(scanner_object_t *scanner, char * path) +{ + long len; + char * data = read_file(path,&len); + + scanner->curl_temp_path = scanner->output_path; + int state = curl_request(API_REQ_POST,"sbom/attribution", data, scanner); + + free(data); + return state; +} + + +int scanner_print_output(scanner_object_t *scanner) +{ + bool state = true; + + if (!scanner->output_path) + return 1; + + FILE * output = fopen(scanner->output_path, "r"); + char c; + + if (output) + { + while ((c = getc(output)) != EOF) + putchar(c); + + fclose(output); + state = false; + } + + free(scanner->output_path); + return state; +} +scanner_object_t * scanner_create(char * id, char * host, char * port, char * session, char * format, char * path, char * file, scanner_flags_t flags, scanner_evt_handler callback) +{ + scanner_object_t *scanner = calloc(1, sizeof(scanner_object_t)); + scanner_object_t init = __SCANNER_OBJECT_INIT(path,file); + init.callback = callback; + strncpy(init.status.id, id, MAX_ID_LEN); + + //copy default config + memcpy(scanner,&init,sizeof(scanner_object_t)); + + scanner_set_host(scanner, host); + scanner_set_port(scanner, port); + scanner_set_session(scanner, session); + + if (flags > 0) + scanner->flags = flags; + + scanner_set_format(scanner, format); + scanner_set_output(scanner, file); + + + strcpy(scanner->status.message, "SCANNER_CREATED\0"); + return scanner; +} + + +void scanner_object_free(scanner_object_t * scanner) +{ + free(scanner); +} \ No newline at end of file