Skip to content

Commit

Permalink
SP-431 Adds HPSM support
Browse files Browse the repository at this point in the history
Co-authored-by: Alejandro Perez <alejandro.perez@scanoss.com>
  • Loading branch information
agustingroh committed Apr 4, 2024
1 parent db510fb commit 1dc6855
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 28 deletions.
2 changes: 2 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Upcoming changes...

## [0.7.0] - 2024-04-04
### Added
- Add HPSM support

## [0.6.1] - 2024-04-01
### Changed
- Fixed issue with SBOM ingestion
Expand Down Expand Up @@ -74,3 +78,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[0.5.5]: https://github.com/scanoss/scanoss.java/compare/v0.5.4...v0.5.5
[0.6.0]: https://github.com/scanoss/scanoss.java/compare/v0.5.5...v0.6.0
[0.6.1]: https://github.com/scanoss/scanoss.java/compare/v0.6.0...v0.6.1
[0.7.0]: https://github.com/scanoss/scanoss.java/compare/v0.6.1...v0.7.0
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Include in a maven project using:
<dependency>
<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.6.0</version>
<version>0.7.0</version>
</dependency>
```

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.scanoss</groupId>
<artifactId>scanoss</artifactId>
<version>0.6.1</version>
<version>0.7.0</version>
<packaging>jar</packaging>
<name>scanoss.java</name>
<url>https://github.com/scanoss/scanoss.java</url>
Expand Down
38 changes: 14 additions & 24 deletions src/main/java/com/scanoss/Winnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@
package com.scanoss;

import com.scanoss.exceptions.WinnowingException;
import lombok.Builder;
import lombok.Getter;
import lombok.NonNull;
import lombok.Setter;
import com.scanoss.utils.Hpsm;
import com.scanoss.utils.WinnowingUtils;
import lombok.*;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.Tika;
Expand Down Expand Up @@ -66,10 +65,11 @@ public class Winnowing {
@Builder.Default
private Boolean obfuscate = Boolean.FALSE; // Obfuscate file path
@Builder.Default
private Boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
private boolean hpsm = Boolean.FALSE; // Enable High Precision Snippet Matching data collection
@Builder.Default
private int snippetLimit = MAX_LONG_LINE_CHARS; // Enable limiting of size of a single line of snippet generation


/**
* Calculate the WFP (fingerprint) for the given file
*
Expand Down Expand Up @@ -119,7 +119,12 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
if (binFile || this.skipSnippets || this.skipSnippets(filename, fileContents)) {
return wfpBuilder.toString();
}
// TODO add HPSM support here


if(this.isHpsm()){
wfpBuilder.append(String.format("hpsm=%s\n", Hpsm.calcHpsm(contents)));
}

String gram = "";
List<Long> window = new ArrayList<>();
char normalized;
Expand All @@ -133,7 +138,7 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
line++;
normalized = 0;
} else {
normalized = normalize(c);
normalized = WinnowingUtils.normalize(c);
}
if (normalized > 0) {
gram += normalized;
Expand Down Expand Up @@ -178,6 +183,8 @@ public String wfpForContents(@NonNull String filename, Boolean binFile, byte[] c
return wfpBuilder.toString();
}



/**
* Determine if a file/contents should be skipped for snippet generation or not
*
Expand Down Expand Up @@ -316,23 +323,6 @@ private byte[] toLittleEndian(long number) {
return b;
}

/**
 * Reduce a character to the alphabet used for fingerprinting.
 * <p>
 * Digits and lower-case ASCII letters are returned as-is, upper-case ASCII
 * letters are folded to lower case, and every other character yields 0.
 *
 * @param c character to normalise
 * @return the normalised character, or 0 when the character is discarded
 */
private char normalize(char c) {
    if (c >= 'A' && c <= 'Z') {
        // 'a' - 'A' == 32: shift upper case onto the lower-case range
        return (char) (c + 32);
    }
    boolean keptAsIs = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z');
    return keptAsIs ? c : (char) 0;
}

/**
* Calculate the CRC32 for the given string
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/com/scanoss/cli/ScanCommandLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,13 @@ class ScanCommandLine implements Runnable {
@picocli.CommandLine.Option(names = {"--proxy"}, description = "HTTP Proxy URL (optional)")
private String proxyString;

@picocli.CommandLine.Option(names = {"-H", "--hpsm"}, description = "Use High Precision Snippet Matching algorithm")
private boolean enableHpsm = false;

@picocli.CommandLine.Parameters(arity = "1", description = "file/folder to scan")
private String fileFolder;


private Scanner scanner;

/**
Expand Down Expand Up @@ -160,7 +164,7 @@ public void run() {
scanner = Scanner.builder().skipSnippets(skipSnippets).allFolders(allFolders).allExtensions(allExtensions)
.hiddenFilesFolders(allHidden).numThreads(numThreads).url(apiUrl).apiKey(apiKey)
.retryLimit(retryLimit).timeout(Duration.ofSeconds(timeoutLimit)).scanFlags(scanFlags)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy)
.sbomType(sbomType).sbom(sbom).snippetLimit(snippetLimit).customCert(caCertPem).proxy(proxy).hpsm(enableHpsm)
.build();
File f = new File(fileFolder);
if (!f.exists()) {
Expand Down
91 changes: 91 additions & 0 deletions src/main/java/com/scanoss/utils/Hpsm.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package com.scanoss.utils;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Helper that produces the {@code hpsm} (High Precision Snippet Matching)
 * value written into a WFP.
 * <p>
 * The value is a lower-case hex string holding one byte per line of the
 * input, where a "line" is terminated by {@code '\n'}. For a line that has at
 * least one character surviving {@link WinnowingUtils#normalize(char)} the
 * byte is the CRC-8 (polynomial 0x31 reflected, initial 0x00, final XOR 0x00)
 * of those normalised characters; lines without such characters are encoded
 * with fixed marker bytes (see {@link #calcHpsm(byte[])}).
 * <p>
 * The CRC lookup table is built once when the class is initialised and is
 * never written afterwards, so {@link #calcHpsm(byte[])} touches no shared
 * mutable state.
 */
public class Hpsm {
    private static final int CRC8_MAXIM_DOW_TABLE_SIZE = 0x100;
    private static final int CRC8_MAXIM_DOW_POLYNOMIAL = 0x8C; // 0x31 reflected
    private static final int CRC8_MAXIM_DOW_INITIAL = 0x00; // 0x00 reflected
    private static final int CRC8_MAXIM_DOW_FINAL = 0x00; // 0x00 reflected

    /** Byte emitted for a newline that directly follows the previously recorded line end. */
    private static final int EMPTY_LINE_MARKER = 0xFF;
    /** Byte emitted for a line that has characters, none of which survive normalisation. */
    private static final int NO_NORMALIZED_CONTENT_MARKER = 0x00;

    /** Read-only after class initialisation; entry {@code i} is the CRC of the single byte {@code i}. */
    private static final int[] CRC8_MAXIM_DOW_TABLE = buildCrc8MaximDowTable();

    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Calculate the HPSM string for the given file contents.
     * <p>
     * Each byte is widened to a {@code char} (negative bytes therefore land
     * above {@code 'z'} and are dropped by the normaliser). On every
     * {@code '\n'} exactly one of the following happens:
     * <ul>
     *   <li>the line had normalised characters: their CRC-8 is emitted;</li>
     *   <li>the newline is at {@code lastLineEnd + 1}: {@code 0xFF} is emitted;</li>
     *   <li>the newline is more than one position after {@code lastLineEnd}: {@code 0x00} is emitted;</li>
     *   <li>otherwise (only possible for a newline at index 0) nothing is emitted.</li>
     * </ul>
     * Text after the final newline is not emitted.
     * <p>
     * NOTE(review): {@code lastLineEnd} starts at 0 rather than -1, which is
     * what produces the index-0 special case above. This is kept exactly as
     * before so the generated value does not change; presumably it has to
     * agree with what the matching service expects - confirm before altering.
     *
     * @param content raw file contents; must not be {@code null}
     * @return lower-case hex string with two digits per emitted line byte
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String calcHpsm(byte[] content) {
        StringBuilder hex = new StringBuilder();
        int lineCrc = CRC8_MAXIM_DOW_INITIAL;
        boolean lineHasNormalized = false;
        int lastLineEnd = 0;

        for (int i = 0; i < content.length; i++) {
            char c = (char) content[i];
            if (c == '\n') { // When there is a new line
                if (lineHasNormalized) {
                    appendHexByte(hex, lineCrc ^ CRC8_MAXIM_DOW_FINAL);
                    lineCrc = CRC8_MAXIM_DOW_INITIAL;
                    lineHasNormalized = false;
                } else if (lastLineEnd + 1 == i) {
                    appendHexByte(hex, EMPTY_LINE_MARKER);
                } else if (i - lastLineEnd > 1) {
                    appendHexByte(hex, NO_NORMALIZED_CONTENT_MARKER);
                }
                lastLineEnd = i;
            } else {
                int normalized = WinnowingUtils.normalize(c);
                if (normalized != 0) {
                    // Fold the character into the running CRC instead of buffering the line.
                    lineCrc = crc8MaximDowByte(lineCrc, normalized & 0xFF);
                    lineHasNormalized = true;
                }
            }
        }
        return hex.toString();
    }

    /**
     * Bitwise (table-free) CRC-8 update for a single byte, used to seed the lookup table.
     *
     * @param crc current CRC value (0..255)
     * @param b   next input byte as an unsigned value (0..255)
     * @return updated CRC value (0..255)
     */
    private static int crc8MaximDowByteNoTable(int crc, int b) {
        crc ^= b;
        for (int count = 0; count < 8; count++) {
            boolean isSet = (crc & 0x01) != 0;
            crc >>= 1;
            if (isSet) {
                crc ^= CRC8_MAXIM_DOW_POLYNOMIAL;
            }
        }
        return crc;
    }

    /**
     * Build the 256-entry CRC lookup table.
     *
     * @return a new table where entry {@code i} is the CRC of the single byte {@code i}
     */
    private static int[] buildCrc8MaximDowTable() {
        int[] table = new int[CRC8_MAXIM_DOW_TABLE_SIZE];
        for (int i = 0; i < table.length; i++) {
            table[i] = crc8MaximDowByteNoTable(0, i);
        }
        return table;
    }

    /**
     * Table-driven CRC-8 update for a single byte.
     *
     * @param crc current CRC value (0..255)
     * @param b   next input byte as an unsigned value (0..255)
     * @return updated CRC value (0..255)
     */
    private static int crc8MaximDowByte(int crc, int b) {
        // The CRC is 8 bits wide, so the table entry is the complete new value.
        return CRC8_MAXIM_DOW_TABLE[(b ^ crc) & 0xFF];
    }

    /**
     * Append the low 8 bits of {@code value} as two lower-case hex digits.
     *
     * @param out   destination buffer
     * @param value value whose low byte is written
     */
    private static void appendHexByte(StringBuilder out, int value) {
        int v = value & 0xFF;
        out.append(HEX_DIGITS[v >>> 4]).append(HEX_DIGITS[v & 0x0F]);
    }
}
22 changes: 22 additions & 0 deletions src/main/java/com/scanoss/utils/WinnowingUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package com.scanoss.utils;

public class WinnowingUtils {

    /**
     * Normalise a character for fingerprinting.
     * <p>
     * ASCII digits and lower-case ASCII letters pass through unchanged,
     * upper-case ASCII letters are converted to lower case, and anything else
     * (punctuation, whitespace, characters beyond {@code 'z'}) maps to 0.
     *
     * @param c character to normalise
     * @return normalised character, or 0 if the character is to be ignored
     */
    public static char normalize(char c) {
        boolean digit = c >= '0' && c <= '9';
        boolean lowerCase = c >= 'a' && c <= 'z';
        if (digit || lowerCase) {
            return c;
        }
        boolean upperCase = c >= 'A' && c <= 'Z';
        if (upperCase) {
            // Distance between the upper- and lower-case ASCII ranges is 32
            return (char) (c + 32);
        }
        return 0;
    }
}
23 changes: 22 additions & 1 deletion src/test/java/com/scanoss/TestWinnowing.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import static org.junit.Assert.*;

@Slf4j
Expand Down Expand Up @@ -92,6 +91,28 @@ public void TestWinnowingPositive() {
log.info("Finished {} -->", methodName);
}

/**
 * Verifies that enabling HPSM on the builder adds an {@code hpsm=} line to the
 * WFP produced by {@code wfpForContents}, directly after the {@code file=} line
 * and before the snippet fingerprint lines.
 */
@Test
public void TestWinnowingContentsHPSM() {
    String methodName = new Object() {
    }.getClass().getEnclosingMethod().getName();
    log.info("<-- Starting {}", methodName);
    Winnowing winnowing = Winnowing.builder().hpsm(true).build();

    // Encode with an explicit charset: the expected MD5 and fingerprints below
    // depend on the exact bytes, so the platform default must not be used.
    byte[] contents = "sample c code with lots of code that we should analyse\nAnd even more code to get connected.\nAnd we need to get this as long as possible, in order to trigger snippet matching.\nHere comes more code to help get this working.\nPlease help get this across the line. We need all the help we can get.\n".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    String wfp = winnowing.wfpForContents("local-file.c", false, contents);
    assertNotNull(wfp);
    assertFalse(wfp.isEmpty());
    // Log before the exact comparison so the actual WFP is visible when it fails.
    log.info("{} - WFP contents: {}", methodName, wfp);
    assertEquals("file=609a24b6cd27ef8108792ca459db1b28,293,local-file.c\n" +
            "hpsm=df13c104d4\n" +
            "3=0ed5027a,a9442399,d019b836\n" +
            "4=613d56c0\n" +
            "5=828b5fe0\n", wfp);

    log.info("Finished {} -->", methodName);
}

@Test
public void TestWinnowingContents() {
String methodName = new Object() {
Expand Down

0 comments on commit 1dc6855

Please sign in to comment.