From f0a93b71f307dc734ee996c04aa92aca39d48f3c Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 16:17:10 +0200 Subject: [PATCH 01/29] Add files for suffix tree algorithm --- .../ApproximateCloneDetectingSuffixTree.php | 560 ++++++++++++++++++ .../Strategy/SuffixTree/CloneInfo.php | 76 +++ .../SuffixTree/JavaObjectInterface.php | 9 + src/Detector/Strategy/SuffixTree/PairList.php | 260 ++++++++ src/Detector/Strategy/SuffixTree/PhpToken.php | 84 +++ src/Detector/Strategy/SuffixTree/Sentinel.php | 53 ++ .../Strategy/SuffixTree/SuffixTree.php | 348 +++++++++++ .../SuffixTree/SuffixTreeHashTable.php | 241 ++++++++ src/Detector/Strategy/SuffixTree/main.php | 55 ++ src/Detector/Strategy/SuffixTreeStrategy.php | 46 ++ 10 files changed, 1732 insertions(+) create mode 100644 src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php create mode 100644 src/Detector/Strategy/SuffixTree/CloneInfo.php create mode 100644 src/Detector/Strategy/SuffixTree/JavaObjectInterface.php create mode 100644 src/Detector/Strategy/SuffixTree/PairList.php create mode 100644 src/Detector/Strategy/SuffixTree/PhpToken.php create mode 100644 src/Detector/Strategy/SuffixTree/Sentinel.php create mode 100644 src/Detector/Strategy/SuffixTree/SuffixTree.php create mode 100644 src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php create mode 100644 src/Detector/Strategy/SuffixTree/main.php create mode 100644 src/Detector/Strategy/SuffixTreeStrategy.php diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php new file mode 100644 index 00000000..a23b48e1 --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -0,0 +1,560 @@ + + */ + //private final ListMap cloneInfos = new ListMap(); + private $cloneInfos = []; + + /** + * The maximal length of a clone. This influences the size of the + * (quadratic) {@link #edBuffer}. 
+ * @var int + */ + private $MAX_LENGTH = 1024; + + /** + * Buffer used for calculating edit distance. + * @var array + */ + private $edBuffer = []; + + /** + * The minimal length of clones to return. + * @var int + */ + protected $minLength; + + /** + * Number of units that must be equal at the start of a clone + * @var int + */ + private $headEquality; + + /** + * Create a new suffix tree from a given word. The word given as parameter + * is used internally and should not be modified anymore, so copy it before + * if required. + *

+     * This only works correctly if the given word is closed using a sentinel
+     * character.
+     *
+     * @param array $word List of tokens to analyze
+     */
+    public function __construct(array $word)
+    {
+        // Pre-allocate the square edit distance buffer (MAX_LENGTH x MAX_LENGTH, zero-filled).
+        $arr = array_fill(0, $this->MAX_LENGTH, 0);
+        $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $arr);
+
+        parent::__construct($word);
+        $this->ensureChildLists();
+        $this->leafCount = array_fill(0, $this->numNodes, 0);
+        $this->initLeafCount(0);
+    }
+
+    /**
+     * Initializes the {@link #leafCount} array which gives for each node the
+     * number of leaves reachable from it (where leaves obtain a value of 1).
+     *
+     * @param int $node
+     * @return void
+     */
+    private function initLeafCount(int $node)
+    {
+        $this->leafCount[$node] = 0;
+        for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
+            $this->initLeafCount($this->nodeChildNode[$e]);
+            $this->leafCount[$node] += $this->leafCount[$this->nodeChildNode[$e]];
+        }
+        // A node without children accumulated nothing above, i.e. it is a leaf.
+        if ($this->leafCount[$node] == 0) {
+            $this->leafCount[$node] = 1;
+        }
+    }
+
+    /**
+     * @todo Add options:
+     *   --min-tokens
+     *   --min-lines
+     *   --edit-distance
+     * @todo Possibly add consumer from original code.
+     */
+
+    /**
+     * Finds all clones in the string (List) used in the constructor.
+     *
+     * First every position of the word is matched against the tree and the
+     * clones found are recorded in {@link #cloneInfos}. Afterwards the recorded
+     * clones longer than 25 tokens are collected, keeping only the longest
+     * clone per start line of its first token.
+     *
+     * @param int $minLength the minimal length of a clone in tokens (not lines)
+     * @param int $maxErrors the maximal number of errors/gaps allowed
+     * @param int $headEquality the number of elements which have to be the same at the beginning of a clone
+     * @return CloneInfo[] the collected clones (empty if none was found)
+     * @throws ConQATException
+     */
+    public function findClones(int $minLength, int $maxErrors, int $headEquality)
+    {
+        $this->minLength = $minLength;
+        $this->headEquality = $headEquality;
+        $this->cloneInfos = [];
+        $wordLength = count($this->word);
+
+        for ($i = 0; $i < $wordLength; ++$i) {
+            // Do quick start, as first character has to match anyway.
+            $node = $this->nextNode->get(0, $this->word[$i]);
+            if ($node < 0 || $this->leafCount[$node] <= 1) {
+                continue;
+            }
+
+            // we know that we have an exact match of at least 'length'
+            // characters, as the word itself is part of the suffix tree.
+            $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node];
+            $numReported = 0;
+            for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
+                if ($this->matchWord($i, $i + $length, $this->nodeChildNode[$e], $length, $maxErrors)) {
+                    ++$numReported;
+                }
+            }
+            if ($length >= $this->minLength && $numReported != 1) {
+                $this->reportClone($i, $i + $length, $node, $length, $length);
+            }
+        }
+
+        // Has to start out as an array: when no clone passes the length filter
+        // below, array_values() would otherwise be handed an undefined variable.
+        // NOTE(review): the key is the line number only, so clones starting on the
+        // same line number in different files replace each other - confirm intended.
+        /** @var CloneInfo[] $map */
+        $map = [];
+        for ($index = 0; $index <= $wordLength; ++$index) {
+            foreach ($this->cloneInfos[$index] ?? [] as $ci) {
+                // length = number of tokens
+                // TODO: min token length
+                if ($ci->length <= 25) {
+                    continue;
+                }
+                $previousCi = $map[$ci->token->line] ?? null;
+                if ($previousCi === null || $ci->length > $previousCi->length) {
+                    $map[$ci->token->line] = $ci;
+                }
+            }
+        }
+
+        // Printing of the result was moved to the SuffixTreeStrategy class.
+        return array_values($map);
+    }
+
+    /**
+     * Performs the approximative matching between the input word and the tree.
+     *
+     * @param int $wordStart the start position of the currently matched word (position in
+     *            the input word).
+     * @param int $wordPosition the current position along the input word.
+     * @param int $node the node we are currently at (i.e. the edge leading to this
+     *            node is relevant to us).
+     * @param int $nodeWordLength the length of the word found along the nodes (this may be
+     *            different from the length along the input word due to gaps).
+     * @param int $maxErrors the number of errors still allowed.
+     * @return boolean whether some clone was reported
+     * @throws ConQATException
+     */
+    private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors)
+    {
+        // We are aware that this method is longer than desirable for code
+        // reading. However, we currently do not see a refactoring that has a
+        // sensible cost-benefit ratio. Suggestions are welcome!
+
+        // self match?
+        if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) {
+            return false;
+        }
+
+        // The edge length is capped so that it always fits into the edit distance buffer.
+        $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1);
+
+        // do min edit distance
+        /** @var int */
+        $currentLength = $this->calculateMaxLength($wordStart, $wordPosition, $node,
+            $maxErrors, $currentNodeWordLength);
+
+        if ($currentLength == 0) {
+            return false;
+        }
+
+        if ($currentLength >= $this->MAX_LENGTH - 1) {
+            $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength);
+        }
+
+        // calculate cheapest match: scan the last row and the last column of the
+        // filled part of the buffer for the smallest edit distance
+        $best = $maxErrors + 42;
+        $iBest = 0;
+        $jBest = 0;
+        for ($k = 0; $k <= $currentLength; ++$k) {
+            $i = $currentLength - $k;
+            $j = $currentLength;
+            if ($this->edBuffer[$i][$j] < $best) {
+                $best = $this->edBuffer[$i][$j];
+                $iBest = $i;
+                $jBest = $j;
+            }
+
+            $i = $currentLength;
+            $j = $currentLength - $k;
+            if ($this->edBuffer[$i][$j] < $best) {
+                $best = $this->edBuffer[$i][$j];
+                $iBest = $i;
+                $jBest = $j;
+            }
+        }
+
+        // Extend the match while the next tokens are equal. The identity check
+        // (!==) only stops when both positions hold the very same token object;
+        // a loose != would compare the objects property by property and also
+        // stop on two distinct tokens that merely carry identical properties.
+        while ($wordPosition + $iBest < count($this->word)
+            && $jBest < $currentNodeWordLength
+            && $this->word[$wordPosition + $iBest] !== $this->word[$this->nodeWordBegin[$node] + $jBest]
+            && $this->word[$wordPosition + $iBest]->equals(
+                $this->word[$this->nodeWordBegin[$node] + $jBest])) {
+            ++$iBest;
+            ++$jBest;
+        }
+
+        $numReported = 0;
+        if ($currentLength == $currentNodeWordLength) {
+            // we may proceed
+            for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) {
+                if ($this->matchWord($wordStart, $wordPosition + $iBest,
+                    $this->nodeChildNode[$e], $nodeWordLength + $jBest, $maxErrors
+                    - $best)) {
+                    ++$numReported;
+                }
+            }
+        }
+
+        // do not report locally if had reports in exactly one subtree (would be
+        // pure subclone)
+        if ($numReported == 1) {
+            return true;
+        }
+
+        // disallow tail changes
+        while ($iBest > 0
+            && $jBest > 0
+            && !$this->word[$wordPosition + $iBest - 1]->equals(
$this->word[$this->nodeWordBegin[$node] + $jBest - 1])) { + + if ($iBest > 1 + && $this->word[$wordPosition + $iBest - 2]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 1])) { + --$iBest; + } else if ($jBest > 1 + && $this->word[$wordPosition + $iBest - 1]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 2])) { + --$jBest; + } else { + --$iBest; + --$jBest; + } + } + + // report if real clone + if ($iBest > 0 && $jBest > 0) { + $numReported += 1; + $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest); + } + + return $numReported > 0; + } + + /** + * Calculates the maximum length we may take along the word to the current + * $node (respecting the number of errors to make). * + * + * @param int $wordStart the start position of the currently matched word (position in + * the input word). + * @param int $wordPosition the current position along the input word. + * @param int $node the node we are currently at (i.e. the edge leading to this + * node is relevant to us). + * @param int $maxErrors the number of errors still allowed. + * @param int $currentNodeWordLength the length of the word found along the nodes (this may be + * different from the actual length due to buffer limits). + * @return int the maximal length that can be taken. + */ + private function calculateMaxLength( + int $wordStart, + int $wordPosition, + int $node, + int $maxErrors, + int $currentNodeWordLength) + { + $this->edBuffer[0][0] = 0; + $currentLength = 1; + for (; $currentLength <= $currentNodeWordLength; ++$currentLength) { + /** @var int */ + $best = $currentLength; + $this->edBuffer[0][$currentLength] = $currentLength; + $this->edBuffer[$currentLength][0] = $currentLength; + + if ($wordPosition + $currentLength >= count($this->word)) { + break; + } + + // deal with case that character may not be matched (sentinel!) 
+ $iChar = $this->word[$wordPosition + $currentLength - 1]; + $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1]; + if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) { + break; + } + + // usual matrix completion for edit distance + for ($k = 1; $k < $currentLength; ++$k) { + $best = min( + $best, + $this->fillEDBuffer($k, $currentLength, $wordPosition, + $this->nodeWordBegin[$node])); + } + for ($k = 1; $k < $currentLength; ++$k) { + $best = min( + $best, + $this->fillEDBuffer($currentLength, $k, $wordPosition, + $this->nodeWordBegin[$node])); + } + $best = min( + $best, + $this->fillEDBuffer($currentLength, $currentLength, $wordPosition, + $this->nodeWordBegin[$node])); + + if ($best > $maxErrors + || $wordPosition - $wordStart + $currentLength <= $this->headEquality + && $best > 0) { + break; + } + } + --$currentLength; + return $currentLength; + } + + /** + * @return void + * @throws ConQATException + */ + private function reportClone(int $wordBegin, int $wordEnd, int $currentNode, + int $nodeWordPos, int $nodeWordLength) + { + /** @var int */ + $length = $wordEnd - $wordBegin; + if ($length < $this->minLength || $nodeWordLength < $this->minLength) { + return; + } + + /** @var PairList */ + $otherClones = new PairList(); + $this->findRemainingClones( + $otherClones, + $nodeWordLength, + $currentNode, + $this->nodeWordEnd[$currentNode] - $this->nodeWordBegin[$currentNode] - $nodeWordPos, + $wordBegin + ); + + $occurrences = 1 + $otherClones->size(); + + // check whether we may start from here + /** @var PhpToken */ + $t = $this->word[$wordBegin]; + /** @var CloneInfo */ + $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); + for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; ++$index) { + /** @var CloneInfo */ + $existingClones = $this->cloneInfos[$index] ?? 
null; + if ($existingClones != null) { + //for (CloneInfo cloneInfo : $existingClones) { + foreach ($existingClones as $cloneInfo) { + if ($cloneInfo->dominates($newInfo, $wordBegin - $index)) { + // we already have a dominating clone, so ignore + return; + } + } + } + } + + // add clone to $otherClones to avoid getting more duplicates + for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { + $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); + } + /** @var PhpToken */ + $t = $this->word[$wordBegin]; + for ($clone = 0; $clone < $otherClones->size(); ++$clone) { + $start = $otherClones->getFirst($clone); + $otherLength = $otherClones->getSecond($clone); + for ($j = 0; $j < $otherLength; $j++) { + /** @var PhpToken */ + $r = $this->word[$j + $start]; + } + for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { + //$this->cloneInfos.add($start + $i, new CloneInfo($otherLength - $i, $wordBegin, occurrences, $t, $otherClones)); + $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); + } + } + } + + + /** + * Fills the edit distance buffer at position (i,j). + * + * @param int $i the first index of the buffer. + * @param int $j the second index of the buffer. + * @param int $iOffset the offset where the word described by $i starts. + * @param int $jOffset the offset where the word described by $j starts. + * @return int the value inserted into the buffer. + */ + private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) + { + /** @var JavaObjectInterface */ + $iChar = $this->word[$iOffset + $i - 1]; + /** @var JavaObjectInterface */ + $jChar = $this->word[$jOffset + $j - 1]; + + $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); + $change = $this->edBuffer[$i - 1][$j - 1] + ($iChar->equals($jChar) ? 
0 : 1); + return $this->edBuffer[$i][$j] = min($insertDelete, $change); + } + + /** + * Fills a list of pairs giving the start positions and lengths of the + * remaining clones. + * + * @param array $clonePositions the clone positions being filled (start position and length) + * @param int $nodeWordLength the length of the word along the nodes. + * @param int $currentNode the node we are currently at. + * @param int $distance the distance along the word leading to the current node. + * @param int $wordStart the start of the currently searched word. + * @return void + */ + private function findRemainingClones( + PairList $clonePositions, + int $nodeWordLength, + int $currentNode, + int $distance, + int $wordStart) + { + for ($nextNode = $this->nodeChildFirst[$currentNode]; $nextNode >= 0; $nextNode = $this->nodeChildNext[$nextNode]) { + $node = $this->nodeChildNode[$nextNode]; + $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance + + $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $wordStart); + } + + if ($this->nodeChildFirst[$currentNode] < 0) { + $start = count($this->word) - $distance - $nodeWordLength; + if ($start != $wordStart) { + $clonePositions->add($start, $nodeWordLength); + } + } + } + + /** + * This should return true, if the provided character is not allowed to + * match with anything else (e.g. is a sentinel). + */ + protected function mayNotMatch(JavaObjectInterface $character) + { + return $character instanceof Sentinel; + } + + /** + * This method is called whenever the {@link #MAX_LENGTH} is to small and + * hence the {@link #edBuffer} was not large enough. This may cause that a + * really large clone is reported in multiple chunks of size + * {@link #MAX_LENGTH} and potentially minor parts of such a clone might be + * lost. + */ + protected function reportBufferShortage(int $leafStart, int $leafLength) { + echo "Encountered buffer shortage: " . $leafStart . " " . $leafLength . 
"\n"; + } +} diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php new file mode 100644 index 00000000..82f10efb --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php @@ -0,0 +1,76 @@ +length = $length; + $this->position = $position; + $this->occurrences = $occurrences; + $this->token = $token; + $this->otherClones = $otherClones; + } + + /** + * Returns whether this clone info dominates the given one, i.e. whether + * both {@link #length} and {@link #occurrences} s not smaller. + * + * @param CloneInfo $ci + * @param later The amount the given clone starts later than the "this" clone. + * @return boolean + */ + public function dominates(CloneInfo $ci, int $later): bool + { + return $this->length - $later >= $ci->length && $this->occurrences >= $ci->occurrences; + } +} diff --git a/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php b/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php new file mode 100644 index 00000000..2227696a --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php @@ -0,0 +1,9 @@ +firstElements = array_fill(0, $initialCapacity, null); + $this->secondElements = array_fill(0, $initialCapacity, null); + } + + + /** Returns whether the list is empty. */ + public function isEmpty(): bool + { + return $this->size == 0; + } + + /** Returns the size of the list. */ + public function size(): int + { + return $this->size; + } + + /** + * Add the given pair to the list. + * @return void + */ + public function add($first, $second): void + { + $this->firstElements[$this->size] = $first; + $this->secondElements[$this->size] = $second; + ++$this->size; + } + + /** Adds all pairs from another list. 
*/
+    public function addAll(PairList $other): void
+    {
+        // we have to store this in a local var, as $other->size may change if
+        // $other === $this
+        $otherSize = $other->size;
+
+        for ($i = 0; $i < $otherSize; ++$i) {
+            $this->firstElements[$this->size] = $other->firstElements[$i];
+            $this->secondElements[$this->size] = $other->secondElements[$i];
+            ++$this->size;
+        }
+    }
+
+    /**
+     * Make sure there is space for at least the given amount of elements.
+     *
+     * Intentionally a no-op: PHP arrays grow on assignment, so nothing has to be
+     * reallocated. The previous body only computed a new capacity that was never
+     * used, and its doubling loop could not terminate for a capacity of 0.
+     */
+    protected function ensureSpace(int $space): void
+    {
+    }
+
+    /** Returns the first element at given index. */
+    public function getFirst(int $i)
+    {
+        $this->checkWithinBounds($i);
+        return $this->firstElements[$i];
+    }
+
+    /**
+     * Checks whether the given $i is within the bounds. Throws an
+     * exception otherwise.
+     */
+    private function checkWithinBounds(int $i): void
+    {
+        if ($i < 0 || $i >= $this->size) {
+            // "." concatenates in PHP; "+" would be an arithmetic addition.
+            throw new Exception("Out of bounds: " . $i);
+        }
+    }
+
+    /** Sets the first element at given index. */
+    public function setFirst(int $i, $value): void
+    {
+        $this->checkWithinBounds($i);
+        $this->firstElements[$i] = $value;
+    }
+
+    /** Returns the second element at given index. */
+    public function getSecond(int $i)
+    {
+        $this->checkWithinBounds($i);
+        return $this->secondElements[$i];
+    }
+
+    /** Sets the second element at given index. */
+    public function setSecond(int $i, $value): void
+    {
+        $this->checkWithinBounds($i);
+        $this->secondElements[$i] = $value;
+    }
+
+    /** Creates a new list containing all first elements.
*/ + public function extractFirstList(): array + { + //array $result = new ArrayList($this->size + 1); + $result = []; + for ($i = 0; $i < $this->size; ++$i) { + $result[] = $this->firstElements[$i]; + } + return $result; + } + + /** Creates a new list containing all second elements. */ + public function extractSecondList(): array + { + //$result = new ArrayList($this->size + 1); + $result = []; + for ($i = 0; $i < $this->size; ++$i) { + $result[] = $this->secondElements[$i]; + } + return $result; + } + + /** + * Swaps the pairs of this list. Is S and T are different types, this will + * be extremely dangerous. + */ + public function swapPairs(): void + { + $temp = $this->firstElements; + $this->firstElements = $this->secondElements; + $this->secondElements = $temp; + } + + /** Swaps the entries located at indexes $i and $j. */ + public function swapEntries(int $i, int $j): void + { + $tmp1 = $this->getFirst($i); + $tmp2 = $this->getSecond($i); + $this->setFirst($i, $this->getFirst($j)); + $this->setSecond($i, $this->getSecond($j)); + $this->setFirst($j, $tmp1); + $this->setSecond($j, $tmp2); + } + + /** Clears this list. */ + public function clear(): void + { + $this->size = 0; + } + + /** Removes the last element of the list. 
*/
+    public function removeLast(): void
+    {
+        $this->size -= 1;
+    }
+
+    /**
+     * Returns the pairs formatted as "[(first,second),(first,second),...]".
+     */
+    public function toString(): string
+    {
+        // ".=" appends to the string; "+=" would be an arithmetic addition.
+        $result = '[';
+        for ($i = 0; $i < $this->size; $i++) {
+            if ($i != 0) {
+                $result .= ',';
+            }
+            $result .= '(';
+            $result .= (string) $this->firstElements[$i];
+            $result .= ',';
+            $result .= (string) $this->secondElements[$i];
+            $result .= ')';
+        }
+        $result .= ']';
+        return $result;
+    }
+
+    /**
+     * Returns a hash code computed from the size and the stored pairs.
+     *
+     * Only the first $size entries of each array are hashed, so that slots
+     * beyond the logical size do not make equal lists hash differently.
+     */
+    public function hashCode(): int
+    {
+        $prime = 31;
+        $hash = $this->size;
+        $hash = $prime * $hash + crc32(serialize(array_slice($this->firstElements, 0, $this->size)));
+        return $prime * $hash + crc32(serialize(array_slice($this->secondElements, 0, $this->size)));
+    }
+
+    /**
+     * Returns whether the given list holds the same pairs in the same order.
+     * Elements are compared with the loose "==" operator.
+     */
+    public function equals(PairList $obj): bool
+    {
+        if ($this === $obj) {
+            return true;
+        }
+        if ($this->size !== $obj->size) {
+            return false;
+        }
+        for ($i = 0; $i < $this->size; $i++) {
+            // Both halves of the pair have to match the corresponding element
+            // of the OTHER list.
+            if ($this->firstElements[$i] != $obj->firstElements[$i]
+                || $this->secondElements[$i] != $obj->secondElements[$i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/PhpToken.php b/src/Detector/Strategy/SuffixTree/PhpToken.php
new file mode 100644
index 00000000..bb8a559d
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/PhpToken.php
@@ -0,0 +1,84 @@
+tokenCode = $tokenCode;
+        $this->tokenName = $tokenName;
+        $this->line = $line;
+        $this->content = $content;
+        $this->file = $file;
+    }
+
+    /**
+     * @return int
+     */
+    public function hashCode(): int
+    {
+        return (int) crc32($this->content);
+
+        //static $cashedHashCode = null;
+        //if ($cashedHashCode !== null) {
+            //return $cashedHashCode;
+        //}
+
+        // Code below mimics 32-bit integer. Probably not needed.
+        /*
+        $value = $this->content;
+        $hashCode = 0;
+        $offset= 0;
+        $limit = strlen($value) + $offset;
+        for ($i = $offset; $i < $limit; $i++) {
+            $hashCode = $hashCode * 31 + ord($value[$i]);
+            //if (is_float($hashCode)) {
+                //die('nooo');
+            //}
+            // NB: Simulate 32-bit int.
+            // @see https://stackoverflow.com/questions/15557407/how-to-use-a-32bit-integer-on-a-64bit-installation-of-php
+            //$hashCode = $hashCode & 0xFFFFFFFF;
+            $hashCode = $hashCode & 0xFFFFFFFF;
+            if ($hashCode & 0x80000000) {
+                $hashCode = $hashCode & ~0x80000000;
+                $hashCode = -2147483648 + $hashCode;
+            }
+
+        }
+        //$cashedHashCode = $hashCode;
+        return $hashCode;
+        */
+        //return $this->content->hashCode();
+        //return $tokenCode;
+    }
+
+    /**
+     * Two tokens are equal if their content is the same string.
+     *
+     * The content is compared directly instead of its CRC32 checksum, so that
+     * two different contents with colliding checksums are not reported as
+     * equal. Anything that is not a PhpToken (e.g. a sentinel) is never equal.
+     *
+     * @return boolean
+     */
+    public function equals(JavaObjectInterface $token): bool {
+        return $token instanceof self && $token->content === $this->content;
+    }
+
+    /**
+     * @return string
+     */
+    public function toString() {
+        return $this->tokenName;
+    }
+
+    public function __tostring() {
+        return $this->tokenName;
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php
new file mode 100644
index 00000000..378dff0f
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/Sentinel.php
@@ -0,0 +1,53 @@
+hash = (int) rand(0, PHP_INT_MAX);
+    }
+
+    public function hashCode(): int
+    {
+        return $this->hash;
+    }
+
+    /**
+     * A sentinel is only equal to itself.
+     */
+    public function equals(object $obj): bool
+    {
+        // The original code uses physical object equality; in PHP this is the
+        // identity operator ===. An instanceof check would make all sentinels
+        // equal to each other.
+        return $obj === $this;
+    }
+
+    public function toString(): string
+    {
+        return "$";
+    }
+}
diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php
new file mode 100644
index 00000000..d96571d9
--- /dev/null
+++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php
@@ -0,0 +1,348 @@
+
+ * We use some conventions which are slightly different from the paper however:
+ *

<ul>
+ * <li>The names of the variables are different, but we give a translation into
+ * Ukkonen's names.</li>
+ * <li>Many variables are made "global" by realizing them as fields. This way we
+ * can easily deal with those tuple return values without constructing extra
+ * classes.</li>
+ * <li>String indices start at 0 (not at 1).</li>
+ * <li>Substrings are marked by the first index and the index after the last one
+ * (just as in C++ STL) instead of the first and the last index (i.e. intervals
+ * are right-open instead of closed). This makes it more intuitive to express
+ * the empty string (i.e. (i,i) instead of (i,i-1)).</li>
+ * </ul>
+ * <p>
+ * Everything but the construction itself is protected to simplify increasing + * its functionality by subclassing but without introducing new method calls. + * + * @author Benjamin Hummel + * @author $Author: kinnen $ + * + * @version $Revision: 41751 $ + * @ConQAT.Rating GREEN Hash: 4B2EF0606B3085A6831764ED042FF20D + */ +class SuffixTree +{ + /** + * Infinity in this context. + * @var int + */ + protected $INFTY; + + /** The word we are working on. + * @var array */ + protected $word; + + /** The number of nodes created so far. + * @var int */ + protected $numNodes = 0; + + /** + * For each node this holds the index of the first character of + * {@link #word} labeling the transition to this node. This + * corresponds to the k for a transition used in Ukkonen's paper. + * + * @var int[] + */ + protected $nodeWordBegin; + + /** + * For each node this holds the index of the one after the last character of + * {@link #word} labeling the transition to this node. This + * corresponds to the p for a transition used in Ukkonen's paper. + * + * @var int[] + */ + protected $nodeWordEnd; + + /** For each node its suffix link (called function f by Ukkonen). + * @var int[] */ + protected $suffixLink; + + /** + * The next node function realized as a hash table. This corresponds to the + * g function used in Ukkonen's paper. + * + * @var SuffixTreeHashTable + */ + protected $nextNode; + + /** + * An array giving for each node the index where the first child will be + * stored (or -1 if it has no children). It is initially empty and will be + * filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . + * + * @var int[] + */ + protected $nodeChildFirst; + + /** + * This array gives the next index of the child list or -1 if this is the + * last one. 
It is initially empty and will be filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . + * + * @var int[] + */ + protected $nodeChildNext; + + /** + * This array stores the actual name (=number) of the mode in the child + * list. It is initially empty and will be filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . + * + * @var int[] + */ + protected $nodeChildNode; + + /** + * The node we are currently at as a "global" variable (as it is always + * passed unchanged). This is called s in Ukkonen's paper. + * + * @var int + */ + private $currentNode = 0; + + /** + * Beginning of the word part of the reference pair. This is kept "global" + * (in constrast to the end) as this is passed unchanged to all functions. + * Ukkonen calls this k. + * + * @var int + */ + private $refWordBegin = 0; + + /** + * This is the new (or old) explicit state as returned by + * {@link #testAndSplit(int, Object)}. Ukkonen calls this r. + * + * @var int + */ + private $explicitNode; + + /** + * Create a new suffix tree from a given word. The word given as parameter + * is used internally and should not be modified anymore, so copy it before + * if required. + * + * @param array $word + */ + public function __construct(array $word) + { + $this->word = $word; + $size = count($word); + $this->INFTY = $size; + + $expectedNodes = 2 * $size; + $this->nodeWordBegin = array_fill(0, $expectedNodes, 0); + $this->nodeWordEnd = array_fill(0, $expectedNodes, 0); + $this->suffixLink = array_fill(0, $expectedNodes, 0); + $this->nextNode = new SuffixTreeHashTable($expectedNodes); + + $this->createRootNode(); + + for ($i = 0; $i < $size; ++$i) { + $this->update($i); + $this->canonize($i + 1); + } + } + + /** + * Creates the root node. 
+ * + * @return void + */ + private function createRootNode() + { + $this->numNodes = 1; + $this->nodeWordBegin[0] = 0; + $this->nodeWordEnd[0] = 0; + $this->suffixLink[0] = -1; + } + + /** + * The update function as defined in Ukkonen's paper. This inserts + * the character at charPos into the tree. It works on the canonical + * reference pair ({@link #currentNode}, ({@link #refWordBegin}, charPos)). + * + * @param int $charPos + * @return void + */ + private function update(int $charPos) { + $lastNode = 0; + while (!$this->testAndSplit($charPos, $this->word[$charPos])) { + $newNode = $this->numNodes++; + $this->nodeWordBegin[$newNode] = $charPos; + $this->nodeWordEnd[$newNode] = $this->INFTY; + $this->nextNode->put($this->explicitNode, $this->word[$charPos], $newNode); + + if ($lastNode != 0) { + $this->suffixLink[$lastNode] = $this->explicitNode; + } + $lastNode = $this->explicitNode; + $this->currentNode = $this->suffixLink[$this->currentNode]; + $this->canonize($charPos); + } + if ($lastNode != 0) { + $this->suffixLink[$lastNode] = $this->currentNode; + } + } + + /** + * The test-and-split function as defined in Ukkonen's paper. This + * checks whether the state given by the canonical reference pair ( + * {@link #currentNode}, ({@link #refWordBegin}, refWordEnd)) is the end + * point (by checking whether a transition for the + * nextCharacter exists). Additionally the state is made + * explicit if it not already is and this is not the end-point. It returns + * true if the end-point was reached. The newly created (or reached) + * explicit node is returned in the "global" variable. 
+ * + * @param int $refWordEnd + * @param object $nextCharacter + * @return boolean + */ + private function testAndSplit(int $refWordEnd, JavaObjectInterface $nextCharacter) + { + if ($this->currentNode < 0) { + // trap state is always end state + return true; + } + + if ($refWordEnd <= $this->refWordBegin) { + if ($this->nextNode->get($this->currentNode, $nextCharacter) < 0) { + $this->explicitNode = $this->currentNode; + return false; + } + return true; + } + + /** @var int */ + $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); + if ($nextCharacter->equals($this->word[$this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin])) { + return true; + } + + // not an end-point and not explicit, so make it explicit. + $this->explicitNode = $this->numNodes++; + $this->nodeWordBegin[$this->explicitNode] = $this->nodeWordBegin[$next]; + $this->nodeWordEnd[$this->explicitNode] = $this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin; + $this->nextNode->put($this->currentNode, $this->word[$this->refWordBegin], $this->explicitNode); + + $this->nodeWordBegin[$next] += $refWordEnd - $this->refWordBegin; + $this->nextNode->put($this->explicitNode, $this->word[$this->nodeWordBegin[$next]], $next); + return false; + } + + /** + * The canonize function as defined in Ukkonen's paper. Changes the + * reference pair (currentNode, (refWordBegin, refWordEnd)) into a canonical + * reference pair. It works on the "global" variables {@link #currentNode} + * and {@link #refWordBegin} and the parameter, writing the result back to + * the globals. + * + * @param int $refWordEnd one after the end index for the word of the reference pair. 
+ * @return void + */ + private function canonize(int $refWordEnd): void + { + if ($this->currentNode === -1) { + // explicitly handle trap state + $this->currentNode = 0; + $this->refWordBegin++; + } + + if ($refWordEnd <= $this->refWordBegin) { + // empty word, so already canonical + return; + } + + /** @var int */ + $next = $this->nextNode->get( + $this->currentNode, + $this->word[$this->refWordBegin] + ); + while ($this->nodeWordEnd[$next] - $this->nodeWordBegin[$next] <= $refWordEnd + - $this->refWordBegin) { + $this->refWordBegin += $this->nodeWordEnd[$next] - $this->nodeWordBegin[$next]; + $this->currentNode = $next; + if ($refWordEnd > $this->refWordBegin) { + $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); + } else { + break; + } + } + } + + /** + * This method makes sure the child lists are filled (required for + * traversing the tree). + * + * @return void + */ + protected function ensureChildLists() + { + if ($this->nodeChildFirst == null || count($this->nodeChildFirst) < $this->numNodes) { + $this->nodeChildFirst = array_fill(0, $this->numNodes, 0); + $this->nodeChildNext = array_fill(0, $this->numNodes, 0); + $this->nodeChildNode = array_fill(0, $this->numNodes, 0); + $this->nextNode->extractChildLists($this->nodeChildFirst, $this->nodeChildNext, $this->nodeChildNode); + } + } + + /** + * Returns whether the given word is contained in the string given at + * construction time. 
+ * + * @param array $find + * @return boolean + */ + public function containsWord(array $find) { + $node = 0; + $findSize = count($find); + for ($i = 0; $i < $findSize;) { + /** @var int */ + $next = $this->nextNode->get($node, $find[$i]); + if ($next < 0) { + return false; + } + for ($j = $this->nodeWordBegin[$next]; $j < $this->nodeWordEnd[$next] && $i < $findSize; ++$i, ++$j) { + if (!$this->word[$j]->equals($find[$i])) { + return false; + } + } + $node = $next; + } + return true; + } +} diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php new file mode 100644 index 00000000..44827705 --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php @@ -0,0 +1,241 @@ + + * It hashes from (node, character) pairs to the next node, where nodes are + * represented by integers and the type of characters is determined by the + * generic parameter. + * + * @author Benjamin Hummel + * @author $Author: juergens $ + * + * @version $Revision: 34670 $ + * @ConQAT.Rating GREEN Hash: 6A7A830078AF0CA9C2D84C148F336DF4 + */ +class SuffixTreeHashTable +{ + /** + * These numbers were taken from + * http://planetmath.org/encyclopedia/GoodHashTablePrimes.html + * @var int[] + */ + private $allowedSizes = [ 53, 97, 193, 389, 769, 1543, + 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, + 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, + 201326611, 402653189, 805306457, 1610612741 ]; + + /** + * The size of the hash table. + * @var int + */ + private $tableSize; + + /** + * Storage space for the node part of the key + * @var int[] + */ + private $keyNodes; + + /** + * Storage space for the character part of the key. + * @var object[] + */ + private $keyChars; + + /** + * Storage space for the result node. + * @var int[] + */ + private $resultNodes; + + /** + * Debug info: number of stored nodes. 
+ * @var int + */ + private $_numStoredNodes = 0; + + /** + * Debug info: number of calls to find so far. + * @var int + */ + private $_numFind = 0; + + /** + * Debug info: number of collisions (i.e. wrong finds) during find so far. + * @var int + */ + private $_numColl = 0; + + /** + * Creates a new hash table for the given number of nodes. Trying to add + * more nodes will result in worse performance down to entering an infinite + * loop on some operations. + * + * @param int $numNodes + */ + public function __construct(int $numNodes) + { + $minSize = (int) ceil(1.5 * $numNodes); + $sizeIndex = 0; + while ($this->allowedSizes[$sizeIndex] < $minSize) { + $sizeIndex++; + } + $this->tableSize = $this->allowedSizes[$sizeIndex]; + + $this->keyNodes = array_fill(0, $this->tableSize, 0); + $this->keyChars = array_fill(0, $this->tableSize, null); + $this->resultNodes = array_fill(0, $this->tableSize, 0); + } + + /** + * Returns the position of the (node,char) key in the hash map or the + * position to insert it into if it is not yet in. + * + * @param int $keyNode + * @param JavaObjectInterface $keyChar + * @return int + */ + private function hashFind(int $keyNode, JavaObjectInterface $keyChar) + { + ++$this->_numFind; + /** @var int */ + $hash = $keyChar->hashCode(); + /** @var int */ + $pos = $this->posMod($this->primaryHash($keyNode, $hash)); + /** @var int */ + $secondary = $this->secondaryHash($keyNode, $hash); + while ($this->keyChars[$pos] !== null) { + if ($this->keyNodes[$pos] === $keyNode && $keyChar->equals($this->keyChars[$pos])) { + break; + } + ++$this->_numColl; + $pos = ($pos + $secondary) % $this->tableSize; + } + return $pos; + } + + /** + * Returns the next node for the given (node, character) key pair or a + * negative value if no next node is stored for this key. 
+ * + * @return int + */ + public function get(int $keyNode, JavaObjectInterface $keyChar): int + { + $pos = $this->hashFind($keyNode, $keyChar); + if ($this->keyChars[$pos] === null) { + return -1; + } + return $this->resultNodes[$pos]; + } + + /** + * Inserts the given result node for the (node, character) key pair. + * @return void + */ + public function put(int $keyNode, JavaObjectInterface $keyChar, int $resultNode) + { + $pos = $this->hashFind($keyNode, $keyChar); + if ($this->keyChars[$pos] == null) { + ++$this->_numStoredNodes; + $this->keyChars[$pos] = $keyChar; + $this->keyNodes[$pos] = $keyNode; + } + $this->resultNodes[$pos] = $resultNode; + } + + /** + * Returns the primary hash value for a (node, character) key pair. + * @return int + */ + private function primaryHash(int $keyNode, int $keyCharHash) + { + $res = $keyCharHash ^ (13 * $keyNode); + return $res; + } + + /** + * Returns the secondary hash value for a (node, character) key pair. + * @return int + */ + private function secondaryHash(int $keyNode, int $keyCharHash) + { + $result = $this->posMod(($keyCharHash ^ (1025 * $keyNode))); + if ($result == 0) { + return 2; + } + return $result; + } + + /** + * Returns the smallest non-negative number congruent to x modulo + * {@link #tableSize}. + * @return int + */ + private function posMod(int $x) + { + $x %= $this->tableSize; + if ($x < 0) { + $x += $this->tableSize; + } + return $x; + } + + /** + * Extracts the list of child nodes for each node from the hash table + * entries as a linked list. All arrays are expected to be initially empty + * and of suitable size (i.e. for n nodes it should have size + * n given that nodes are numbered 0 to n-1). Those arrays will be + * filled from this method. + *

+ * The method is package visible, as it is tighly coupled to the + * {@link SuffixTree} class. + * + * @param int[] nodeFirstIndex an array giving for each node the index where the first child + * will be stored (or -1 if it has no children). + * @param int[] nodeNextIndex this array gives the next index of the child list or -1 if + * this is the last one. + * @param int[] nodeChild this array stores the actual name (=number) of the mode in the + * child list. + * @return void + * @throws ArrayIndexOutOfBoundsException if any of the given arrays was too small. + */ + public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild) + { + // Instead of Arrays.fill($nodeFirstIndex, -1); + foreach ($nodeFirstIndex as $k => $v) { + $nodeFirstIndex[$k] = -1; + } + $free = 0; + for ($i = 0; $i < $this->tableSize; ++$i) { + if ($this->keyChars[$i] !== null) { + // insert $this->keyNodes[$i] -> $this->resultNodes[$i] + $nodeChild[$free] = $this->resultNodes[$i]; + $nodeNextIndex[$free] = $nodeFirstIndex[$this->keyNodes[$i]]; + $nodeFirstIndex[$this->keyNodes[$i]] = $free++; + } + } + } +} diff --git a/src/Detector/Strategy/SuffixTree/main.php b/src/Detector/Strategy/SuffixTree/main.php new file mode 100644 index 00000000..9cf76ea6 --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/main.php @@ -0,0 +1,55 @@ + true, + T_COMMENT => true, + T_DOC_COMMENT => true, + T_OPEN_TAG => true, + T_OPEN_TAG_WITH_ECHO => true, + T_CLOSE_TAG => true, + T_WHITESPACE => true, + T_USE => true, + T_NS_SEPARATOR => true, + ]; + foreach($tokens as $token) { + if (is_array($token)) { + if (isset($tokensIgnoreList[$token[0]])) { + continue; + } + $word[] = new PhpToken( + $token[0], + token_name($token[0]), + $token[2], + $file, + $token[1] + ); + } + } +} else { + die('Only supports one file'); +} +$word[] = new Sentinel(); +$tree = new ApproximateCloneDetectingSuffixTree($word); +$tree->findClones(10, 5, 10); diff --git 
a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php new file mode 100644 index 00000000..66e6bc19 --- /dev/null +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -0,0 +1,46 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy; + +use function is_array; +use function array_keys; +use function file_get_contents; +use function token_get_all; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; + +final class SuffixTreeStrategy +{ + public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void + { + $content = file_get_contents($file); + $tokens = token_get_all($content); + $word = []; + + foreach (array_keys($tokens) as $key) { + $token = $tokens[$key]; + + if (is_array($token)) { + if (!isset($this->tokensIgnoreList[$token[0]])) { + $word[] = new PhpToken( + $token[0], + token_name($token[0]), + $token[2], + $file, + $token[1] + ); + } + } + } + $tree = new ApproximateCloneDetectingSuffixTree($word); + /** @var CloneInfo[] */ + $cloneInfos = $tree->findClones(10, 5, 10); + } +} From 0179eff162768dc43f774f8b4c55223e8fdca384 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 16:41:13 +0200 Subject: [PATCH 02/29] Add new command argument: algorithm --- src/CLI/Application.php | 23 ++++++++- src/CLI/Arguments.php | 13 ++++- src/CLI/ArgumentsBuilder.php | 8 ++++ .../ApproximateCloneDetectingSuffixTree.php | 48 +------------------ src/Detector/Strategy/SuffixTreeStrategy.php | 47 +++++++++++++++++- 5 files changed, 89 insertions(+), 50 deletions(-) diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 15b977cc..7e8764a5 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -12,9 +12,12 @@ use 
const PHP_EOL; use function count; use function printf; +use Exception; use SebastianBergmann\FileIterator\Facade; use SebastianBergmann\PHPCPD\Detector\Detector; +use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; use SebastianBergmann\PHPCPD\Log\PMD; use SebastianBergmann\PHPCPD\Log\Text; use SebastianBergmann\Timer\ResourceUsageFormatter; @@ -62,7 +65,7 @@ public function run(array $argv): int return 1; } - $strategy = new DefaultStrategy; + $strategy = $this->pickStrategy($arguments->algorithm()); $timer = new Timer; $timer->start(); @@ -93,6 +96,24 @@ private function printVersion(): void ); } + private function pickStrategy(?string $algorithm): AbstractStrategy + { + switch ($algorithm) { + case null: + case 'rabin-karp': + return new DefaultStrategy(); + + break; + + case 'suffixtree': + return new SuffixTreeStrategy(); + + break; + default: + throw new Exception('Unsupported algorithm: ' . 
$algorithm); + } + } + private function help(): void { print <<<'EOT' diff --git a/src/CLI/Arguments.php b/src/CLI/Arguments.php index cbe4f296..6f0ab7a9 100644 --- a/src/CLI/Arguments.php +++ b/src/CLI/Arguments.php @@ -61,7 +61,12 @@ final class Arguments */ private $version; - public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version) + /** + * @var ?string + */ + private $algorithm; + + public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version, ?string $algorithm) { $this->directories = $directories; $this->suffixes = $suffixes; @@ -73,6 +78,7 @@ public function __construct(array $directories, array $suffixes, array $exclude, $this->verbose = $verbose; $this->help = $help; $this->version = $version; + $this->algorithm = $algorithm; } /** @@ -133,4 +139,9 @@ public function version(): bool { return $this->version; } + + public function algorithm(): ?string + { + return $this->algorithm; + } } diff --git a/src/CLI/ArgumentsBuilder.php b/src/CLI/ArgumentsBuilder.php index c7dfa0b5..3f269b85 100644 --- a/src/CLI/ArgumentsBuilder.php +++ b/src/CLI/ArgumentsBuilder.php @@ -33,6 +33,7 @@ public function build(array $argv): Arguments 'verbose', 'help', 'version', + 'algorithm=', ] ); } catch (CliParserException $e) { @@ -53,6 +54,7 @@ public function build(array $argv): Arguments $verbose = false; $help = false; $version = false; + $algorithm = 'rabin-karp'; foreach ($options[0] as $option) { switch ($option[0]) { @@ -101,6 +103,11 @@ public function build(array $argv): Arguments case '--version': $version = true; + break; + + case '--algorithm': + $algorithm = $option[1]; + break; } } @@ -122,6 +129,7 @@ public function build(array $argv): Arguments $verbose, $help, $version, + 
$algorithm ); } } diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index a23b48e1..f48ed728 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -164,7 +164,7 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality) foreach ($existingClones as $ci) { // length = number of tokens // TODO: min token length - if ($ci->length > 25) { + if ($ci->length > $minLength) { /** @var CloneInfo */ $previousCi = $map[$ci->token->line] ?? null; if ($previousCi == null) { @@ -186,52 +186,8 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality) /** @var CloneInfo[] */ $values = array_values($map); - return $values; - - // TODO: Below moved to SuffixTreeStrategy class. usort($values, function ($a, $b) { return $b->length - $a->length;}); - printf( - "\nFound %d clones with %d duplicated lines in %d files:\n\n", - count($values), - 0, // TODO: Fix - 0 - ); - // TODO: Filter overlapping clones. 
- for ($i = 0; $i < count($values); $i++) { - /** @var CloneInfo */ - $ci = $values[$i]; - try { - /** @var PhpToken */ - $lastToken = $this->word[$ci->position + $ci->length]; - $lines = $lastToken->line - $ci->token->line; - printf( - " - %s:%d-%d (%d lines)\n", - $ci->token->file, - $ci->token->line, - $ci->token->line + $lines - 1, - $lines - ); - } catch(IndexOutOfBoundsException $e) { - printf("index out of bounds, ci.position = %d, ci.length = %d", $ci->position, $ci->length); - } - /** @var int[] */ - $others = $ci->otherClones->extractFirstList(); - for ($j = 0; $j < count($others); $j++) { - $otherStart = $others[$j]; - /** @var PhpToken */ - $t = $this->word[$otherStart]; - /** @var PhpToken */ - $lastToken = $this->word[$ci->position + $ci->length]; - $lines = $lastToken->line - $ci->token->line; - printf( - " %s:%d-%d\n", - $t->file, - $t->line, - $t->line + $lines - 1 - ); - } - print("\n"); - } + return $values; } /** diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 66e6bc19..929cd844 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -16,7 +16,7 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; -final class SuffixTreeStrategy +final class SuffixTreeStrategy extends AbstractStrategy { public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void { @@ -40,7 +40,50 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo } } $tree = new ApproximateCloneDetectingSuffixTree($word); + $editDistance = 5; + $headEquality = 10; /** @var CloneInfo[] */ - $cloneInfos = $tree->findClones(10, 5, 10); + $cloneInfos = $tree->findClones($minTokens, $editDistance, $headEquality); + + foreach ($cloneInfos as $cloneInfo) { + /** @var PhpToken */ + $lastToken = 
$this->word[$cloneInfo->position + $cloneInfo->length]; + $lines = $lastToken->line - $cloneInfo->token->line; + /* + printf( + " - %s:%d-%d (%d lines)\n", + $cloneInfo->token->file, + $cloneInfo->token->line, + $cloneInfo->token->line + $lines - 1, + $lines + ); + */ + $result->add( + new CodeClone( + new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line), + new CodeCloneFile($t->file, $t->line), + $lines, + 0 + ) + ); + /** @var int[] */ + $others = $cloneInfo->otherClones->extractFirstList(); + for ($j = 0; $j < count($others); $j++) { + $otherStart = $others[$j]; + /** @var PhpToken */ + $t = $this->word[$otherStart]; + /** @var PhpToken */ + $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; + $lines = $lastToken->line - $cloneInfo->token->line; + /* + printf( + " %s:%d-%d\n", + $t->file, + $t->line, + $t->line + $lines - 1 + ); + */ + } + } } } From 8d3c0cb81fcf4b496a50682dab9cd53f9a18a35a Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 16:47:54 +0200 Subject: [PATCH 03/29] Report clone properly --- .../ApproximateCloneDetectingSuffixTree.php | 2 ++ src/Detector/Strategy/SuffixTreeStrategy.php | 28 +++++++++++-------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index f48ed728..705a87c6 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -158,6 +158,8 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality) } } + $map = []; + for ($index = 0; $index <= count($this->word); ++$index) { $existingClones = $this->cloneInfos[$index] ?? 
null; if ($existingClones != null) { diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 929cd844..dbd1bbc1 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -13,8 +13,12 @@ use function array_keys; use function file_get_contents; use function token_get_all; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; +use SebastianBergmann\PHPCPD\CodeClone; +use SebastianBergmann\PHPCPD\CodeCloneFile; +use SebastianBergmann\PHPCPD\CodeCloneMap; final class SuffixTreeStrategy extends AbstractStrategy { @@ -40,14 +44,14 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo } } $tree = new ApproximateCloneDetectingSuffixTree($word); - $editDistance = 5; + $editDistance = 10; $headEquality = 10; /** @var CloneInfo[] */ $cloneInfos = $tree->findClones($minTokens, $editDistance, $headEquality); foreach ($cloneInfos as $cloneInfo) { /** @var PhpToken */ - $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; + $lastToken = $word[$cloneInfo->position + $cloneInfo->length]; $lines = $lastToken->line - $cloneInfo->token->line; /* printf( @@ -58,23 +62,23 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $lines ); */ - $result->add( - new CodeClone( - new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line), - new CodeCloneFile($t->file, $t->line), - $lines, - 0 - ) - ); /** @var int[] */ $others = $cloneInfo->otherClones->extractFirstList(); for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; /** @var PhpToken */ - $t = $this->word[$otherStart]; + $t = $word[$otherStart]; /** @var PhpToken */ - $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; + $lastToken = 
$word[$cloneInfo->position + $cloneInfo->length]; $lines = $lastToken->line - $cloneInfo->token->line; + $result->add( + new CodeClone( + new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line), + new CodeCloneFile($t->file, $t->line), + $lines, + 0 + ) + ); /* printf( " %s:%d-%d\n", From 4224ba371aac21c4b4c9c2adcbca2525f9c15faf Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 16:53:30 +0200 Subject: [PATCH 04/29] Remove unused code --- src/Detector/Strategy/SuffixTreeStrategy.php | 23 ++------------------ 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index dbd1bbc1..3562129d 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -50,18 +50,6 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $cloneInfos = $tree->findClones($minTokens, $editDistance, $headEquality); foreach ($cloneInfos as $cloneInfo) { - /** @var PhpToken */ - $lastToken = $word[$cloneInfo->position + $cloneInfo->length]; - $lines = $lastToken->line - $cloneInfo->token->line; - /* - printf( - " - %s:%d-%d (%d lines)\n", - $cloneInfo->token->file, - $cloneInfo->token->line, - $cloneInfo->token->line + $lines - 1, - $lines - ); - */ /** @var int[] */ $others = $cloneInfo->otherClones->extractFirstList(); for ($j = 0; $j < count($others); $j++) { @@ -76,17 +64,10 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line), new CodeCloneFile($t->file, $t->line), $lines, - 0 + // TODO: Double check this + $otherStart + 1 - $cloneInfo->position ) ); - /* - printf( - " %s:%d-%d\n", - $t->file, - $t->line, - $t->line + $lines - 1 - ); - */ } } } From f2dd32a409c653f43720533809b327a2b234f9ce Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 16:55:25 +0200 Subject: [PATCH 
05/29] Free tokens memory --- src/Detector/Strategy/SuffixTreeStrategy.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 3562129d..17eb9c0f 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -43,6 +43,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo } } } + unset($tokens); $tree = new ApproximateCloneDetectingSuffixTree($word); $editDistance = 10; $headEquality = 10; From 093a8d037a88cd060be05280065a57c6e7ad7cf6 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 17:47:39 +0200 Subject: [PATCH 06/29] Add post-process step, so suffixtree algo can collect all tokens first --- src/Detector/Detector.php | 2 ++ src/Detector/Strategy/AbstractStrategy.php | 2 ++ src/Detector/Strategy/SuffixTreeStrategy.php | 35 +++++++++++++++----- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index 9ccffeee..34848434 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -42,6 +42,8 @@ public function copyPasteDetection(iterable $files, int $minLines = 5, int $minT ); } + $this->strategy->postProcess(); + return $result; } } diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index 2efd672b..cd660f9b 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -43,4 +43,6 @@ abstract class AbstractStrategy protected $hashes = []; abstract public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void; + + public function postProcess(): void { } } diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 17eb9c0f..31494be9 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ 
b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -16,24 +16,30 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel; use SebastianBergmann\PHPCPD\CodeClone; use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; final class SuffixTreeStrategy extends AbstractStrategy { + /** + * @var PhpToken[] + */ + private $word = []; + public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void { + echo 'Process file ' . $file . PHP_EOL; $content = file_get_contents($file); $tokens = token_get_all($content); - $word = []; foreach (array_keys($tokens) as $key) { $token = $tokens[$key]; if (is_array($token)) { if (!isset($this->tokensIgnoreList[$token[0]])) { - $word[] = new PhpToken( + $this->word[] = new PhpToken( $token[0], token_name($token[0]), $token[2], @@ -43,12 +49,23 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo } } } - unset($tokens); - $tree = new ApproximateCloneDetectingSuffixTree($word); - $editDistance = 10; + + $this->minLines = $minLines; + $this->minTokens = $minTokens; + $this->result = $result; + } + + public function postProcess(): void + { + // Sentinel = End of word + $this->word[] = new Sentinel(); + echo 'Total word length: ' . count($this->word) . 
PHP_EOL; + + $tree = new ApproximateCloneDetectingSuffixTree($this->word); + $editDistance = 5; $headEquality = 10; /** @var CloneInfo[] */ - $cloneInfos = $tree->findClones($minTokens, $editDistance, $headEquality); + $cloneInfos = $tree->findClones($this->minTokens, $editDistance, $headEquality); foreach ($cloneInfos as $cloneInfo) { /** @var int[] */ @@ -56,11 +73,11 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; /** @var PhpToken */ - $t = $word[$otherStart]; + $t = $this->word[$otherStart]; /** @var PhpToken */ - $lastToken = $word[$cloneInfo->position + $cloneInfo->length]; + $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; $lines = $lastToken->line - $cloneInfo->token->line; - $result->add( + $this->result->add( new CodeClone( new CodeCloneFile($cloneInfo->token->file, $cloneInfo->token->line), new CodeCloneFile($t->file, $t->line), From 793d486dabba8bc12336007caa938c346385fc43 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 17 Jun 2021 18:34:48 +0200 Subject: [PATCH 07/29] Add algorithm setting to help text --- src/CLI/Application.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 7e8764a5..65595136 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -129,9 +129,10 @@ private function help(): void Options for analysing files: - --fuzzy Fuzz variable names - --min-lines Minimum number of identical lines (default: 5) - --min-tokens Minimum number of identical tokens (default: 70) + --fuzzy Fuzz variable names + --min-lines Minimum number of identical lines (default: 5) + --min-tokens Minimum number of identical tokens (default: 70) + --algorithm Select which algorithm to use ('rabin-karp' (default) or 'suffixtree') Options for report generation: From 12279b4f4a6f99c4f4b805bc0a5104ad76d8357a Mon Sep 17 00:00:00 2001 From: Olle 
Haerstedt Date: Thu, 17 Jun 2021 18:42:13 +0200 Subject: [PATCH 08/29] Add description of the rabin karp algorithm --- src/Detector/Strategy/DefaultStrategy.php | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index 014a33dd..155620a1 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -25,6 +25,18 @@ use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; +/** + * This is a Rabin-Karp with an additional normalization steps before + * the hashing happens. + * + * 1. Tokenization + * 2. Deletion of logic neutral tokens like T_CLOSE_TAG;T_COMMENT; + * T_DOC_COMMENT; T_INLINE_HTML; T_NS_SEPARATOR; T_OPEN_TAG; + * T_OPEN_TAG_WITH_ECHO; T_USE; T_WHITESPACE; + * 3. If needed deletion of variable names + * 4. Normalization of token + value using crc32 + * 5. Now the classic Rabin-Karp hashing takes place + */ final class DefaultStrategy extends AbstractStrategy { public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void From c9e22de3ffa87b29edf5b1a6b86822c30edde0fc Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 19:38:18 +0200 Subject: [PATCH 09/29] Move hashes from abstract strategy --- src/Detector/Strategy/AbstractStrategy.php | 5 ----- src/Detector/Strategy/DefaultStrategy.php | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index cd660f9b..f6c9fadb 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -37,11 +37,6 @@ abstract class AbstractStrategy T_NS_SEPARATOR => true, ]; - /** - * @psalm-var array - */ - protected $hashes = []; - abstract public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, 
bool $fuzzy = false): void; public function postProcess(): void { } diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index 155620a1..114cb454 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -39,6 +39,11 @@ */ final class DefaultStrategy extends AbstractStrategy { + /** + * @psalm-var array + */ + protected $hashes = []; + public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void { $buffer = file_get_contents($file); From af01d9e5ce0a9bceb5fbdb9a04a7d9b81939a659 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:10:52 +0200 Subject: [PATCH 10/29] Factor out strategy configuration DTO --- src/CLI/Application.php | 17 ++++++++------ src/CLI/Arguments.php | 24 +++++++++++++++++++- src/CLI/ArgumentsBuilder.php | 18 ++++++++++++++- src/Detector/Detector.php | 15 ++++++++---- src/Detector/Strategy/AbstractStrategy.php | 2 +- src/Detector/Strategy/DefaultStrategy.php | 21 ++++++++++------- src/Detector/Strategy/SuffixTreeStrategy.php | 21 ++++++++++------- 7 files changed, 88 insertions(+), 30 deletions(-) diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 65595136..d3d8096f 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -18,6 +18,7 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; use SebastianBergmann\PHPCPD\Log\PMD; use SebastianBergmann\PHPCPD\Log\Text; use SebastianBergmann\Timer\ResourceUsageFormatter; @@ -65,6 +66,8 @@ public function run(array $argv): int return 1; } + $config = new StrategyConfiguration($arguments); + $strategy = $this->pickStrategy($arguments->algorithm()); $timer = new Timer; @@ -72,9 +75,7 @@ public 
function run(array $argv): int $clones = (new Detector($strategy))->copyPasteDetection( $files, - $arguments->linesThreshold(), - $arguments->tokensThreshold(), - $arguments->fuzzy() + $config ); (new Text)->printResult($clones, $arguments->verbose()); @@ -129,10 +130,12 @@ private function help(): void Options for analysing files: - --fuzzy Fuzz variable names - --min-lines Minimum number of identical lines (default: 5) - --min-tokens Minimum number of identical tokens (default: 70) - --algorithm Select which algorithm to use ('rabin-karp' (default) or 'suffixtree') + --fuzzy Fuzz variable names + --min-lines Minimum number of identical lines (default: 5) + --min-tokens Minimum number of identical tokens (default: 70) + --algorithm Select which algorithm to use ('rabin-karp' (default) or 'suffixtree') + --edit-distance Distance in number of edits between two clones (only for suffixtree; default: 5) + --head-equality Minimum equality at start of clone (only for suffixtree; default 10) Options for report generation: diff --git a/src/CLI/Arguments.php b/src/CLI/Arguments.php index 6f0ab7a9..112aca2a 100644 --- a/src/CLI/Arguments.php +++ b/src/CLI/Arguments.php @@ -66,7 +66,17 @@ final class Arguments */ private $algorithm; - public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version, ?string $algorithm) + /** + * @var int + */ + private $editDistance; + + /** + * @var int + */ + private $headEquality; + + public function __construct(array $directories, array $suffixes, array $exclude, ?string $pmdCpdXmlLogfile, int $linesThreshold, int $tokensThreshold, bool $fuzzy, bool $verbose, bool $help, bool $version, ?string $algorithm, int $editDistance, int $headEquality) { $this->directories = $directories; $this->suffixes = $suffixes; @@ -79,6 +89,8 @@ public function __construct(array $directories, array $suffixes, array 
$exclude, $this->help = $help; $this->version = $version; $this->algorithm = $algorithm; + $this->editDistance = $editDistance; + $this->headEquality = $headEquality; } /** @@ -144,4 +156,14 @@ public function algorithm(): ?string { return $this->algorithm; } + + public function editDistance(): int + { + return $this->editDistance; + } + + public function headEquality(): int + { + return $this->headEquality; + } } diff --git a/src/CLI/ArgumentsBuilder.php b/src/CLI/ArgumentsBuilder.php index 3f269b85..babf87c3 100644 --- a/src/CLI/ArgumentsBuilder.php +++ b/src/CLI/ArgumentsBuilder.php @@ -30,6 +30,8 @@ public function build(array $argv): Arguments 'fuzzy', 'min-lines=', 'min-tokens=', + 'head-equality=', + 'edit-distance=', 'verbose', 'help', 'version', @@ -50,6 +52,8 @@ public function build(array $argv): Arguments $pmdCpdXmlLogfile = null; $linesThreshold = 5; $tokensThreshold = 70; + $editDistance = 5; + $headEquality = 10; $fuzzy = false; $verbose = false; $help = false; @@ -88,6 +92,16 @@ public function build(array $argv): Arguments break; + case '--head-equality': + $headEquality = (int) $option[1]; + + break; + + case '--edit-distance': + $editDistance = (int) $option[1]; + + break; + case '--verbose': $verbose = true; @@ -129,7 +143,9 @@ public function build(array $argv): Arguments $verbose, $help, $version, - $algorithm + $algorithm, + $editDistance, + $headEquality ); } } diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index 34848434..8c3bf6ae 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -11,6 +11,7 @@ use SebastianBergmann\PHPCPD\CodeCloneMap; use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; final class Detector { @@ -19,12 +20,20 @@ final class Detector */ private $strategy; + /** + * @param AbstractStrategy $strategy + */ public function __construct(AbstractStrategy $strategy) { $this->strategy = $strategy; } - 
public function copyPasteDetection(iterable $files, int $minLines = 5, int $minTokens = 70, bool $fuzzy = false): CodeCloneMap + /** + * @param iterable $files + * @param StrategyConfiguration $config + * @return CodeCloneMap + */ + public function copyPasteDetection(iterable $files, StrategyConfiguration $config): CodeCloneMap { $result = new CodeCloneMap; @@ -35,10 +44,8 @@ public function copyPasteDetection(iterable $files, int $minLines = 5, int $minT $this->strategy->processFile( $file, - $minLines, - $minTokens, $result, - $fuzzy + $config ); } diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index f6c9fadb..22b9ed04 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -37,7 +37,7 @@ abstract class AbstractStrategy T_NS_SEPARATOR => true, ]; - abstract public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void; + abstract public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void; public function postProcess(): void { } } diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index 114cb454..6548e1f9 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -44,7 +44,12 @@ final class DefaultStrategy extends AbstractStrategy */ protected $hashes = []; - public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void + /** + * @param string $file + * @param CodeCloneMap $result + * @return void + */ + public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void { $buffer = file_get_contents($file); $currentTokenPositions = []; @@ -72,7 +77,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $currentTokenRealPositions[$tokenNr++] = 
$token[2]; - if ($fuzzy && $token[0] === T_VARIABLE) { + if ($config->getFuzzy() && $token[0] === T_VARIABLE) { $token[1] = 'variable'; } @@ -90,7 +95,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo $found = false; $tokenNr = 0; - while ($tokenNr <= $count - $minTokens) { + while ($tokenNr <= $count - $config->getMinTokens()) { $line = $currentTokenPositions[$tokenNr]; $realLine = $currentTokenRealPositions[$tokenNr]; @@ -99,7 +104,7 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo substr( $currentSignature, $tokenNr * 5, - $minTokens * 5 + $config->getMinTokens() * 5 ), true ), @@ -120,13 +125,13 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $minTokens - 1; + $lastToken = ($tokenNr - 1) + $config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $minLines && + if ($numLines >= $config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( @@ -152,13 +157,13 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $minTokens - 1; + $lastToken = ($tokenNr - 1) + $config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $minLines && + if ($numLines >= $config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( new CodeClone( diff --git 
a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 31494be9..8109b47b 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -28,9 +28,19 @@ final class SuffixTreeStrategy extends AbstractStrategy */ private $word = []; - public function processFile(string $file, int $minLines, int $minTokens, CodeCloneMap $result, bool $fuzzy = false): void + /** + * @var StrategyConfiguration + */ + private $config; + + /** + * @param string $file + * @param CodeCloneMap $result + * @return void + */ + public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void { - echo 'Process file ' . $file . PHP_EOL; + $this->config = $config; $content = file_get_contents($file); $tokens = token_get_all($content); @@ -50,8 +60,6 @@ public function processFile(string $file, int $minLines, int $minTokens, CodeClo } } - $this->minLines = $minLines; - $this->minTokens = $minTokens; $this->result = $result; } @@ -59,13 +67,10 @@ public function postProcess(): void { // Sentinel = End of word $this->word[] = new Sentinel(); - echo 'Total word length: ' . count($this->word) . 
PHP_EOL; $tree = new ApproximateCloneDetectingSuffixTree($this->word); - $editDistance = 5; - $headEquality = 10; /** @var CloneInfo[] */ - $cloneInfos = $tree->findClones($this->minTokens, $editDistance, $headEquality); + $cloneInfos = $tree->findClones($this->config->minTokens(), $this->config->getEditDistance(), $this->config->getHeadEquality()); foreach ($cloneInfos as $cloneInfo) { /** @var int[] */ From d2e14294638c17cded22d4ee8d54c6ad36860a37 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:11:57 +0200 Subject: [PATCH 11/29] Remove test file --- src/Detector/Strategy/SuffixTree/main.php | 55 ----------------------- 1 file changed, 55 deletions(-) delete mode 100644 src/Detector/Strategy/SuffixTree/main.php diff --git a/src/Detector/Strategy/SuffixTree/main.php b/src/Detector/Strategy/SuffixTree/main.php deleted file mode 100644 index 9cf76ea6..00000000 --- a/src/Detector/Strategy/SuffixTree/main.php +++ /dev/null @@ -1,55 +0,0 @@ - true, - T_COMMENT => true, - T_DOC_COMMENT => true, - T_OPEN_TAG => true, - T_OPEN_TAG_WITH_ECHO => true, - T_CLOSE_TAG => true, - T_WHITESPACE => true, - T_USE => true, - T_NS_SEPARATOR => true, - ]; - foreach($tokens as $token) { - if (is_array($token)) { - if (isset($tokensIgnoreList[$token[0]])) { - continue; - } - $word[] = new PhpToken( - $token[0], - token_name($token[0]), - $token[2], - $file, - $token[1] - ); - } - } -} else { - die('Only supports one file'); -} -$word[] = new Sentinel(); -$tree = new ApproximateCloneDetectingSuffixTree($word); -$tree->findClones(10, 5, 10); From c2acc9ec97a90f789b518f2902bad69178b76244 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:12:42 +0200 Subject: [PATCH 12/29] Add missing file --- .../Strategy/StrategyConfiguration.php | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/Detector/Strategy/StrategyConfiguration.php diff --git a/src/Detector/Strategy/StrategyConfiguration.php 
b/src/Detector/Strategy/StrategyConfiguration.php new file mode 100644 index 00000000..f7707b6c --- /dev/null +++ b/src/Detector/Strategy/StrategyConfiguration.php @@ -0,0 +1,95 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy; + +use SebastianBergmann\PHPCPD\Arguments; + +/** + * Small DTO to carry configuration for a strategy. + * Different algorithms have different configs available. + */ +final class StrategyConfiguration +{ + /** + * Minimum lines to consider + * @var int + */ + private $minLines = 5; + + /** + * Minimum tokens to consider in a clone. + * @var int + */ + private $minTokens = 70; + + /** + * Edit distance to consider when comparing two clones + * Only available for the suffix-tree algorithm + * @var int + */ + private $editDistance = 5; + + /** + * Tokens that must be equal to consider a clone + * Only available for the suffix-tree algorithm + * @var int + */ + private $headEquality = 10; + + /** + * Fuzz variable names + * suffixtree always makes variables and functions fuzzy + * @var bool + */ + private $fuzzy = false; + + /** + * @param Arguments $arguments + */ + public function __construct(Arguments $arguments) + { + $this->minLines = $arguments->linesThreshold; + $this->minTokens = $arguments->tokensThreshold; + $this->fuzzy = $arguments->fuzzy; + $this->editDistance = $arguments->editDistance; + $this->headEquality = $arguments->headEquality; + } + + public function getMinLines(): int + { + return $this->minLines; + } + + public function getMinTokens(): int + { + return $this->minTokens; + } + + public function getHeadEquality(): int + { + return $this->headEquality; + } + + public function getEditDistance(): int + { + return $this->editDistance; + } + + public function getFuzzy(): bool + { + return $this->fuzzy; + } + + public function getMinLines() + { + return $this->minLines; + } + +} 
From 65f22c72392026a60b980c5da3c7821dfc1e99bb Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:14:26 +0200 Subject: [PATCH 13/29] Remove duplicated method; fix argument access --- .../Strategy/StrategyConfiguration.php | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/Detector/Strategy/StrategyConfiguration.php b/src/Detector/Strategy/StrategyConfiguration.php index f7707b6c..613a9b88 100644 --- a/src/Detector/Strategy/StrategyConfiguration.php +++ b/src/Detector/Strategy/StrategyConfiguration.php @@ -55,11 +55,11 @@ final class StrategyConfiguration */ public function __construct(Arguments $arguments) { - $this->minLines = $arguments->linesThreshold; - $this->minTokens = $arguments->tokensThreshold; - $this->fuzzy = $arguments->fuzzy; - $this->editDistance = $arguments->editDistance; - $this->headEquality = $arguments->headEquality; + $this->minLines = $arguments->linesThreshold(); + $this->minTokens = $arguments->tokensThreshold(); + $this->fuzzy = $arguments->fuzzy(); + $this->editDistance = $arguments->editDistance(); + $this->headEquality = $arguments->headEquality(); } public function getMinLines(): int @@ -72,6 +72,11 @@ public function getMinTokens(): int return $this->minTokens; } + public function getFuzzy(): bool + { + return $this->fuzzy; + } + public function getHeadEquality(): int { return $this->headEquality; @@ -81,15 +86,4 @@ public function getEditDistance(): int { return $this->editDistance; } - - public function getFuzzy(): bool - { - return $this->fuzzy; - } - - public function getMinLines() - { - return $this->minLines; - } - } From f060ac3ed92c75f6aa5555c646c59099c97668a8 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:26:53 +0200 Subject: [PATCH 14/29] Indentation --- src/Detector/Strategy/SuffixTreeStrategy.php | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php 
b/src/Detector/Strategy/SuffixTreeStrategy.php index 8109b47b..2188f940 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -70,7 +70,11 @@ public function postProcess(): void $tree = new ApproximateCloneDetectingSuffixTree($this->word); /** @var CloneInfo[] */ - $cloneInfos = $tree->findClones($this->config->minTokens(), $this->config->getEditDistance(), $this->config->getHeadEquality()); + $cloneInfos = $tree->findClones( + $this->config->getMinTokens(), + $this->config->getEditDistance(), + $this->config->getHeadEquality() + ); foreach ($cloneInfos as $cloneInfo) { /** @var int[] */ From eaae19606ea6e9a8e1eb421caa33527aa0cadd27 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 20:55:15 +0200 Subject: [PATCH 15/29] Testing --- src/Detector/Strategy/SuffixTreeStrategy.php | 2 +- tests/fixture/type3_clone.php | 40 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/fixture/type3_clone.php diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 2188f940..b162c6c7 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -84,7 +84,7 @@ public function postProcess(): void /** @var PhpToken */ $t = $this->word[$otherStart]; /** @var PhpToken */ - $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; + $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 1]; $lines = $lastToken->line - $cloneInfo->token->line; $this->result->add( new CodeClone( diff --git a/tests/fixture/type3_clone.php b/tests/fixture/type3_clone.php new file mode 100644 index 00000000..608294da --- /dev/null +++ b/tests/fixture/type3_clone.php @@ -0,0 +1,40 @@ + $b) { + return 'foo'; + } else { + return 'bar'; + } +} + +function bar() +{ + $a = 10; + if ($a > $b) { + return 'foo'; + } else { + return 'bar'; + } +} + +function bar() +{ + $a = 
10; + $b = '20'; + if ($a) { + return 'foo'; + } else { + return 'bar'; + } +} From c8625c8a2622b3304595b4572c8a0f1e61f24a28 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 22 Jun 2021 21:14:39 +0200 Subject: [PATCH 16/29] Check if we fetch sentinel by mistake --- src/CLI/Application.php | 3 --- src/Detector/Strategy/SuffixTreeStrategy.php | 4 ++++ tests/fixture/type3_clone.php | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/CLI/Application.php b/src/CLI/Application.php index d3d8096f..8ce99b02 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -104,12 +104,9 @@ private function pickStrategy(?string $algorithm): AbstractStrategy case 'rabin-karp': return new DefaultStrategy(); - break; - case 'suffixtree': return new SuffixTreeStrategy(); - break; default: throw new Exception('Unsupported algorithm: ' . $algorithm); } diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index b162c6c7..e7283d4e 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -85,6 +85,10 @@ public function postProcess(): void $t = $this->word[$otherStart]; /** @var PhpToken */ $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 1]; + // If we stumbled upon the Sentinel, rewind one step. 
+ if ($lastToken instanceof Sentinel) { + $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 2]; + } $lines = $lastToken->line - $cloneInfo->token->line; $this->result->add( new CodeClone( diff --git a/tests/fixture/type3_clone.php b/tests/fixture/type3_clone.php index 608294da..5557e0bd 100644 --- a/tests/fixture/type3_clone.php +++ b/tests/fixture/type3_clone.php @@ -21,8 +21,8 @@ function foo() function bar() { $a = 10; + $b = 20; if ($a > $b) { - return 'foo'; } else { return 'bar'; } From 7142b7b8fec59047e93f37533e0cca7897c92e54 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Wed, 23 Jun 2021 21:06:03 +0200 Subject: [PATCH 17/29] Apply cs-fixer --- src/CLI/Application.php | 2 +- src/Detector/Detector.php | 8 - src/Detector/Strategy/AbstractStrategy.php | 4 +- src/Detector/Strategy/DefaultStrategy.php | 5 - .../Strategy/StrategyConfiguration.php | 18 +- .../ApproximateCloneDetectingSuffixTree.php | 828 ++++++++++-------- .../Strategy/SuffixTree/CloneInfo.php | 62 +- .../SuffixTree/JavaObjectInterface.php | 14 +- src/Detector/Strategy/SuffixTree/PairList.php | 363 ++++---- src/Detector/Strategy/SuffixTree/PhpToken.php | 42 +- src/Detector/Strategy/SuffixTree/Sentinel.php | 34 +- .../Strategy/SuffixTree/SuffixTree.php | 500 ++++++----- .../SuffixTree/SuffixTreeHashTable.php | 324 +++---- src/Detector/Strategy/SuffixTreeStrategy.php | 22 +- 14 files changed, 1140 insertions(+), 1086 deletions(-) diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 8ce99b02..26974701 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -17,8 +17,8 @@ use SebastianBergmann\PHPCPD\Detector\Detector; use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; 
use SebastianBergmann\PHPCPD\Log\PMD; use SebastianBergmann\PHPCPD\Log\Text; use SebastianBergmann\Timer\ResourceUsageFormatter; diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index 8c3bf6ae..8c2e737d 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -20,19 +20,11 @@ final class Detector */ private $strategy; - /** - * @param AbstractStrategy $strategy - */ public function __construct(AbstractStrategy $strategy) { $this->strategy = $strategy; } - /** - * @param iterable $files - * @param StrategyConfiguration $config - * @return CodeCloneMap - */ public function copyPasteDetection(iterable $files, StrategyConfiguration $config): CodeCloneMap { $result = new CodeCloneMap; diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index 22b9ed04..ae33870f 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -39,5 +39,7 @@ abstract class AbstractStrategy abstract public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void; - public function postProcess(): void { } + public function postProcess(): void + { + } } diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index 6548e1f9..d2cf020b 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -44,11 +44,6 @@ final class DefaultStrategy extends AbstractStrategy */ protected $hashes = []; - /** - * @param string $file - * @param CodeCloneMap $result - * @return void - */ public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void { $buffer = file_get_contents($file); diff --git a/src/Detector/Strategy/StrategyConfiguration.php b/src/Detector/Strategy/StrategyConfiguration.php index 613a9b88..370b9b89 100644 --- a/src/Detector/Strategy/StrategyConfiguration.php +++ 
b/src/Detector/Strategy/StrategyConfiguration.php @@ -1,4 +1,4 @@ -minLines = $arguments->linesThreshold(); diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index 705a87c6..97ae12b1 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -1,117 +1,96 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** * An extension of the suffix tree adding an algorithm for finding approximate * clones, i.e. substrings which are similar. - * + * * @author $Author: hummelb $ + * * @version $Revision: 43151 $ * @ConQAT.Rating GREEN Hash: BB94CD690760BC239F04D32D5BCAC33E */ class ApproximateCloneDetectingSuffixTree extends SuffixTree { + /** + * The minimal length of clones to return. + * + * @var int + */ + protected $minLength; + /** * The number of leaves reachable from the given node (1 for leaves). + * * @var int[] * */ - private $leafCount; + private $leafCount; /** * This is the distance between two entries in the {@link #cloneInfos} map. + * * @var int */ - private $INDEX_SPREAD = 10; + private $INDEX_SPREAD = 10; /** * This map stores for each position the relevant clone infos. + * * @var array */ - //private final ListMap cloneInfos = new ListMap(); - private $cloneInfos = []; + //private final ListMap cloneInfos = new ListMap(); + private $cloneInfos = []; - /** - * The maximal length of a clone. This influences the size of the - * (quadratic) {@link #edBuffer}. + /** + * The maximal length of a clone. This influences the size of the + * (quadratic) {@link #edBuffer}. 
+ * * @var int - */ - private $MAX_LENGTH = 1024; + */ + private $MAX_LENGTH = 1024; /** * Buffer used for calculating edit distance. + * * @var array */ - private $edBuffer = []; + private $edBuffer = []; /** - * The minimal length of clones to return. + * Number of units that must be equal at the start of a clone. + * * @var int */ - protected $minLength; + private $headEquality; /** - * Number of units that must be equal at the start of a clone - * @var int - */ - private $headEquality; - - /** - * Create a new suffix tree from a given word. The word given as parameter - * is used internally and should not be modified anymore, so copy it before - * if required. - *

- * This only word correctly if the given word is closed using a sentinel - * character. + * Create a new suffix tree from a given word. The word given as parameter + * is used internally and should not be modified anymore, so copy it before + * if required. + *

+ * This only word correctly if the given word is closed using a sentinel + * character. * * @param array $word List of tokens to analyze - */ + */ public function __construct(array $word) { - $arr = array_fill(0, $this->MAX_LENGTH, 0); + $arr = array_fill(0, $this->MAX_LENGTH, 0); $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $arr); parent::__construct($word); - $this->ensureChildLists(); - $this->leafCount = array_fill(0, $this->numNodes, 0); - $this->initLeafCount(0); - } - - /** - * Initializes the {@link #leafCount} array which given for each node the - * number of leaves reachable from it (where leaves obtain a value of 1). - * - * @param int $node - * @return void - */ - private function initLeafCount(int $node) - { - $this->leafCount[$node] = 0; - for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { - $this->initLeafCount($this->nodeChildNode[$e]); - $this->leafCount[$node] += $this->leafCount[$this->nodeChildNode[$e]]; - } - if ($this->leafCount[$node] == 0) { - $this->leafCount[$node] = 1; - } - } + $this->ensureChildLists(); + $this->leafCount = array_fill(0, $this->numNodes, 0); + $this->initLeafCount(0); + } /** * @todo Add options: @@ -121,278 +100,376 @@ private function initLeafCount(int $node) * @todo Possibly add consumer from original code. */ - /** - * Finds all clones in the string (List) used in the constructor. - * - * @param int $minLength the minimal length of a clone in tokens (not lines) - * @param int $maxErrors the maximal number of errors/gaps allowed - * @param int $headEquality the number of elements which have to be the same at the beginning of a clone - * @return void + /** + * Finds all clones in the string (List) used in the constructor. 
+ * + * @param int $minLength the minimal length of a clone in tokens (not lines) + * @param int $maxErrors the maximal number of errors/gaps allowed + * @param int $headEquality the number of elements which have to be the same at the beginning of a clone + * * @throws ConQATException - */ + */ public function findClones(int $minLength, int $maxErrors, int $headEquality) { - $this->minLength = $minLength; - $this->headEquality = $headEquality; - $this->cloneInfos = []; - - for ($i = 0; $i < count($this->word); ++$i) { - // Do quick start, as first character has to match anyway. - $node = $this->nextNode->get(0, $this->word[$i]); - if ($node < 0 || $this->leafCount[$node] <= 1) { - continue; - } - - // we know that we have an exact match of at least 'length' - // characters, as the word itself is part of the suffix tree. - $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node]; - $numReported = 0; - for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { - if ($this->matchWord($i, $i + $length, $this->nodeChildNode[$e], $length, - $maxErrors)) { - ++$numReported; - } - } - if ($length >= $this->minLength && $numReported != 1) { - $this->reportClone($i, $i + $length, $node, $length, $length); - } - } + $this->minLength = $minLength; + $this->headEquality = $headEquality; + $this->cloneInfos = []; + + for ($i = 0; $i < count($this->word); $i++) { + // Do quick start, as first character has to match anyway. + $node = $this->nextNode->get(0, $this->word[$i]); + + if ($node < 0 || $this->leafCount[$node] <= 1) { + continue; + } + + // we know that we have an exact match of at least 'length' + // characters, as the word itself is part of the suffix tree. 
+ $length = $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node]; + $numReported = 0; + + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + if ($this->matchWord( + $i, + $i + $length, + $this->nodeChildNode[$e], + $length, + $maxErrors + )) { + $numReported++; + } + } + + if ($length >= $this->minLength && $numReported != 1) { + $this->reportClone($i, $i + $length, $node, $length, $length); + } + } $map = []; - for ($index = 0; $index <= count($this->word); ++$index) { - $existingClones = $this->cloneInfos[$index] ?? null; - if ($existingClones != null) { + for ($index = 0; $index <= count($this->word); $index++) { + $existingClones = $this->cloneInfos[$index] ?? null; + + if ($existingClones != null) { foreach ($existingClones as $ci) { // length = number of tokens // TODO: min token length if ($ci->length > $minLength) { /** @var CloneInfo */ $previousCi = $map[$ci->token->line] ?? null; + if ($previousCi == null) { - $map[$ci->token->line] = $ci; - } else if ($ci->length > $previousCi->length) { + $map[$ci->token->line] = $ci; + } elseif ($ci->length > $previousCi->length) { $map[$ci->token->line] = $ci; } /** @var int[] */ $others = $ci->otherClones->extractFirstList(); + for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; /** @var PhpToken */ $t = $this->word[$otherStart]; } } - } - } - } + } + } + } /** @var CloneInfo[] */ $values = array_values($map); - usort($values, function ($a, $b) { return $b->length - $a->length;}); + usort($values, static function ($a, $b) { + return $b->length - $a->length; + }); + return $values; - } - - /** - * Performs the approximative matching between the input word and the tree. - * - * @param int $wordStart the start position of the currently matched word (position in - * the input word). - * @param int $wordPosition the current position along the input word. - * @param int $node the node we are currently at (i.e. the edge leading to this - * node is relevant to us). 
- * @param int $nodeWordLength the length of the word found along the nodes (this may be - * different from the length along the input word due to gaps). - * @param int $maxErrors the number of errors still allowed. - * @return boolean whether some clone was reported + } + + /** + * This should return true, if the provided character is not allowed to + * match with anything else (e.g. is a sentinel). + */ + protected function mayNotMatch(JavaObjectInterface $character) + { + return $character instanceof Sentinel; + } + + /** + * This method is called whenever the {@link #MAX_LENGTH} is to small and + * hence the {@link #edBuffer} was not large enough. This may cause that a + * really large clone is reported in multiple chunks of size + * {@link #MAX_LENGTH} and potentially minor parts of such a clone might be + * lost. + */ + protected function reportBufferShortage(int $leafStart, int $leafLength): void + { + print 'Encountered buffer shortage: ' . $leafStart . ' ' . $leafLength . "\n"; + } + + /** + * Initializes the {@link #leafCount} array which given for each node the + * number of leaves reachable from it (where leaves obtain a value of 1). + */ + private function initLeafCount(int $node): void + { + $this->leafCount[$node] = 0; + + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + $this->initLeafCount($this->nodeChildNode[$e]); + $this->leafCount[$node] += $this->leafCount[$this->nodeChildNode[$e]]; + } + + if ($this->leafCount[$node] == 0) { + $this->leafCount[$node] = 1; + } + } + + /** + * Performs the approximative matching between the input word and the tree. + * + * @param int $wordStart the start position of the currently matched word (position in + * the input word) + * @param int $wordPosition the current position along the input word + * @param int $node the node we are currently at (i.e. the edge leading to this + * node is relevant to us). 
+ * @param int $nodeWordLength the length of the word found along the nodes (this may be + * different from the length along the input word due to gaps) + * @param int $maxErrors the number of errors still allowed + * * @throws ConQATException - */ + * + * @return bool whether some clone was reported + */ private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors) { - // We are aware that this method is longer than desirable for code - // reading. However, we currently do not see a refactoring that has a - // sensible cost-benefit ratio. Suggestions are welcome! + // We are aware that this method is longer than desirable for code + // reading. However, we currently do not see a refactoring that has a + // sensible cost-benefit ratio. Suggestions are welcome! - // self match? - if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) { - return false; - } + // self match? + if ($this->leafCount[$node] == 1 && $this->nodeWordBegin[$node] == $wordPosition) { + return false; + } - $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1); + $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1); - // do min edit distance + // do min edit distance /** @var int */ - $currentLength = $this->calculateMaxLength($wordStart, $wordPosition, $node, - $maxErrors, $currentNodeWordLength); - - if ($currentLength == 0) { - return false; - } - - if ($currentLength >= $this->MAX_LENGTH - 1) { - $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength); - } - - // calculate cheapest match - $best = $maxErrors + 42; - $iBest = 0; - $jBest = 0; - for ($k = 0; $k <= $currentLength; ++$k) { - $i = $currentLength - $k; - $j = $currentLength; - if ($this->edBuffer[$i][$j] < $best) { - $best = $this->edBuffer[$i][$j]; - $iBest = $i; - $jBest = $j; - } - - $i = $currentLength; - $j = 
$currentLength - $k; - if ($this->edBuffer[$i][$j] < $best) { - $best = $this->edBuffer[$i][$j]; - $iBest = $i; - $jBest = $j; - } - } - - while ($wordPosition + $iBest < count($this->word) - && $jBest < $currentNodeWordLength - && $this->word[$wordPosition + $iBest] != $this->word[$this->nodeWordBegin[$node] + $jBest] - && $this->word[$wordPosition + $iBest]->equals( - $this->word[$this->nodeWordBegin[$node] + $jBest])) { - ++$iBest; - ++$jBest; - } - - $numReported = 0; - if ($currentLength == $currentNodeWordLength) { - // we may proceed - for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { - if ($this->matchWord($wordStart, $wordPosition + $iBest, - $this->nodeChildNode[$e], $nodeWordLength + $jBest, $maxErrors - - $best)) { - ++$numReported; - } - } - } - - // do not report locally if had reports in exactly one subtree (would be - // pure subclone) - if ($numReported == 1) { - return true; - } - - // disallow tail changes - while ($iBest > 0 - && $jBest > 0 - && !$this->word[$wordPosition + $iBest - 1]->equals( - $this->word[$this->nodeWordBegin[$node] + $jBest - 1])) { - - if ($iBest > 1 - && $this->word[$wordPosition + $iBest - 2]->equals( - $this->word[$this->nodeWordBegin[$node] + $jBest - 1])) { - --$iBest; - } else if ($jBest > 1 - && $this->word[$wordPosition + $iBest - 1]->equals( - $this->word[$this->nodeWordBegin[$node] + $jBest - 2])) { - --$jBest; - } else { - --$iBest; - --$jBest; - } - } - - // report if real clone - if ($iBest > 0 && $jBest > 0) { - $numReported += 1; - $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest); - } - - return $numReported > 0; - } - - /** - * Calculates the maximum length we may take along the word to the current - * $node (respecting the number of errors to make). * - * - * @param int $wordStart the start position of the currently matched word (position in - * the input word). 
- * @param int $wordPosition the current position along the input word. - * @param int $node the node we are currently at (i.e. the edge leading to this - * node is relevant to us). - * @param int $maxErrors the number of errors still allowed. - * @param int $currentNodeWordLength the length of the word found along the nodes (this may be - * different from the actual length due to buffer limits). - * @return int the maximal length that can be taken. - */ + $currentLength = $this->calculateMaxLength( + $wordStart, + $wordPosition, + $node, + $maxErrors, + $currentNodeWordLength + ); + + if ($currentLength == 0) { + return false; + } + + if ($currentLength >= $this->MAX_LENGTH - 1) { + $this->reportBufferShortage($this->nodeWordBegin[$node], $currentNodeWordLength); + } + + // calculate cheapest match + $best = $maxErrors + 42; + $iBest = 0; + $jBest = 0; + + for ($k = 0; $k <= $currentLength; $k++) { + $i = $currentLength - $k; + $j = $currentLength; + + if ($this->edBuffer[$i][$j] < $best) { + $best = $this->edBuffer[$i][$j]; + $iBest = $i; + $jBest = $j; + } + + $i = $currentLength; + $j = $currentLength - $k; + + if ($this->edBuffer[$i][$j] < $best) { + $best = $this->edBuffer[$i][$j]; + $iBest = $i; + $jBest = $j; + } + } + + while ($wordPosition + $iBest < count($this->word) && + $jBest < $currentNodeWordLength && + $this->word[$wordPosition + $iBest] != $this->word[$this->nodeWordBegin[$node] + $jBest] && + $this->word[$wordPosition + $iBest]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest] + )) { + $iBest++; + $jBest++; + } + + $numReported = 0; + + if ($currentLength == $currentNodeWordLength) { + // we may proceed + for ($e = $this->nodeChildFirst[$node]; $e >= 0; $e = $this->nodeChildNext[$e]) { + if ($this->matchWord( + $wordStart, + $wordPosition + $iBest, + $this->nodeChildNode[$e], + $nodeWordLength + $jBest, + $maxErrors + - $best + )) { + $numReported++; + } + } + } + + // do not report locally if had reports in exactly one subtree (would 
be + // pure subclone) + if ($numReported == 1) { + return true; + } + + // disallow tail changes + while ($iBest > 0 && + $jBest > 0 && + !$this->word[$wordPosition + $iBest - 1]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 1] + )) { + if ($iBest > 1 && + $this->word[$wordPosition + $iBest - 2]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 1] + )) { + $iBest--; + } elseif ($jBest > 1 && + $this->word[$wordPosition + $iBest - 1]->equals( + $this->word[$this->nodeWordBegin[$node] + $jBest - 2] + )) { + $jBest--; + } else { + $iBest--; + $jBest--; + } + } + + // report if real clone + if ($iBest > 0 && $jBest > 0) { + $numReported++; + $this->reportClone($wordStart, $wordPosition + $iBest, $node, $jBest, $nodeWordLength + $jBest); + } + + return $numReported > 0; + } + + /** + * Calculates the maximum length we may take along the word to the current + * $node (respecting the number of errors to make). *. + * + * @param int $wordStart the start position of the currently matched word (position in + * the input word) + * @param int $wordPosition the current position along the input word + * @param int $node the node we are currently at (i.e. the edge leading to this + * node is relevant to us). 
+ * @param int $maxErrors the number of errors still allowed + * @param int $currentNodeWordLength the length of the word found along the nodes (this may be + * different from the actual length due to buffer limits) + * + * @return int the maximal length that can be taken + */ private function calculateMaxLength( int $wordStart, int $wordPosition, int $node, int $maxErrors, - int $currentNodeWordLength) + int $currentNodeWordLength + ) { - $this->edBuffer[0][0] = 0; - $currentLength = 1; - for (; $currentLength <= $currentNodeWordLength; ++$currentLength) { + $this->edBuffer[0][0] = 0; + $currentLength = 1; + + for (; $currentLength <= $currentNodeWordLength; $currentLength++) { /** @var int */ - $best = $currentLength; - $this->edBuffer[0][$currentLength] = $currentLength; - $this->edBuffer[$currentLength][0] = $currentLength; - - if ($wordPosition + $currentLength >= count($this->word)) { - break; - } - - // deal with case that character may not be matched (sentinel!) - $iChar = $this->word[$wordPosition + $currentLength - 1]; - $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1]; - if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) { - break; - } - - // usual matrix completion for edit distance - for ($k = 1; $k < $currentLength; ++$k) { - $best = min( - $best, - $this->fillEDBuffer($k, $currentLength, $wordPosition, - $this->nodeWordBegin[$node])); - } - for ($k = 1; $k < $currentLength; ++$k) { - $best = min( - $best, - $this->fillEDBuffer($currentLength, $k, $wordPosition, - $this->nodeWordBegin[$node])); - } - $best = min( - $best, - $this->fillEDBuffer($currentLength, $currentLength, $wordPosition, - $this->nodeWordBegin[$node])); - - if ($best > $maxErrors - || $wordPosition - $wordStart + $currentLength <= $this->headEquality - && $best > 0) { - break; - } - } - --$currentLength; - return $currentLength; - } + $best = $currentLength; + $this->edBuffer[0][$currentLength] = $currentLength; + $this->edBuffer[$currentLength][0] = 
$currentLength; + + if ($wordPosition + $currentLength >= count($this->word)) { + break; + } + + // deal with case that character may not be matched (sentinel!) + $iChar = $this->word[$wordPosition + $currentLength - 1]; + $jChar = $this->word[$this->nodeWordBegin[$node] + $currentLength - 1]; + + if ($this->mayNotMatch($iChar) || $this->mayNotMatch($jChar)) { + break; + } + + // usual matrix completion for edit distance + for ($k = 1; $k < $currentLength; $k++) { + $best = min( + $best, + $this->fillEDBuffer( + $k, + $currentLength, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + } + + for ($k = 1; $k < $currentLength; $k++) { + $best = min( + $best, + $this->fillEDBuffer( + $currentLength, + $k, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + } + $best = min( + $best, + $this->fillEDBuffer( + $currentLength, + $currentLength, + $wordPosition, + $this->nodeWordBegin[$node] + ) + ); + + if ($best > $maxErrors || + $wordPosition - $wordStart + $currentLength <= $this->headEquality && + $best > 0) { + break; + } + } + $currentLength--; + + return $currentLength; + } /** - * @return void * @throws ConQATException */ - private function reportClone(int $wordBegin, int $wordEnd, int $currentNode, - int $nodeWordPos, int $nodeWordLength) + private function reportClone( + int $wordBegin, + int $wordEnd, + int $currentNode, + int $nodeWordPos, + int $nodeWordLength + ): void { /** @var int */ - $length = $wordEnd - $wordBegin; - if ($length < $this->minLength || $nodeWordLength < $this->minLength) { - return; - } + $length = $wordEnd - $wordBegin; + + if ($length < $this->minLength || $nodeWordLength < $this->minLength) { + return; + } /** @var PairList */ - $otherClones = new PairList(); + $otherClones = new PairList(); $this->findRemainingClones( $otherClones, $nodeWordLength, @@ -401,118 +478,105 @@ private function reportClone(int $wordBegin, int $wordEnd, int $currentNode, $wordBegin ); - $occurrences = 1 + $otherClones->size(); + $occurrences = 1 + 
$otherClones->size(); - // check whether we may start from here + // check whether we may start from here /** @var PhpToken */ $t = $this->word[$wordBegin]; /** @var CloneInfo */ - $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); - for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; ++$index) { + $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); + + for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; $index++) { /** @var CloneInfo */ - $existingClones = $this->cloneInfos[$index] ?? null; - if ($existingClones != null) { - //for (CloneInfo cloneInfo : $existingClones) { + $existingClones = $this->cloneInfos[$index] ?? null; + + if ($existingClones != null) { + //for (CloneInfo cloneInfo : $existingClones) { foreach ($existingClones as $cloneInfo) { - if ($cloneInfo->dominates($newInfo, $wordBegin - $index)) { - // we already have a dominating clone, so ignore - return; - } - } - } - } - - // add clone to $otherClones to avoid getting more duplicates - for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { - $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); - } + if ($cloneInfo->dominates($newInfo, $wordBegin - $index)) { + // we already have a dominating clone, so ignore + return; + } + } + } + } + + // add clone to $otherClones to avoid getting more duplicates + for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { + $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); + } /** @var PhpToken */ $t = $this->word[$wordBegin]; - for ($clone = 0; $clone < $otherClones->size(); ++$clone) { - $start = $otherClones->getFirst($clone); - $otherLength = $otherClones->getSecond($clone); + + for ($clone = 0; $clone < $otherClones->size(); $clone++) { + $start = $otherClones->getFirst($clone); + $otherLength = 
$otherClones->getSecond($clone); + for ($j = 0; $j < $otherLength; $j++) { /** @var PhpToken */ $r = $this->word[$j + $start]; } - for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { - //$this->cloneInfos.add($start + $i, new CloneInfo($otherLength - $i, $wordBegin, occurrences, $t, $otherClones)); - $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); - } - } - } - - - /** - * Fills the edit distance buffer at position (i,j). - * - * @param int $i the first index of the buffer. - * @param int $j the second index of the buffer. - * @param int $iOffset the offset where the word described by $i starts. - * @param int $jOffset the offset where the word described by $j starts. - * @return int the value inserted into the buffer. - */ + + for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { + //$this->cloneInfos.add($start + $i, new CloneInfo($otherLength - $i, $wordBegin, occurrences, $t, $otherClones)); + $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); + } + } + } + + /** + * Fills the edit distance buffer at position (i,j). + * + * @param int $i the first index of the buffer + * @param int $j the second index of the buffer + * @param int $iOffset the offset where the word described by $i starts + * @param int $jOffset the offset where the word described by $j starts + * + * @return int the value inserted into the buffer + */ private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) { /** @var JavaObjectInterface */ - $iChar = $this->word[$iOffset + $i - 1]; + $iChar = $this->word[$iOffset + $i - 1]; /** @var JavaObjectInterface */ - $jChar = $this->word[$jOffset + $j - 1]; - - $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); - $change = $this->edBuffer[$i - 1][$j - 1] + ($iChar->equals($jChar) ? 
0 : 1); - return $this->edBuffer[$i][$j] = min($insertDelete, $change); - } - - /** - * Fills a list of pairs giving the start positions and lengths of the - * remaining clones. - * - * @param array $clonePositions the clone positions being filled (start position and length) - * @param int $nodeWordLength the length of the word along the nodes. - * @param int $currentNode the node we are currently at. - * @param int $distance the distance along the word leading to the current node. - * @param int $wordStart the start of the currently searched word. - * @return void - */ + $jChar = $this->word[$jOffset + $j - 1]; + + $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); + $change = $this->edBuffer[$i - 1][$j - 1] + ($iChar->equals($jChar) ? 0 : 1); + + return $this->edBuffer[$i][$j] = min($insertDelete, $change); + } + + /** + * Fills a list of pairs giving the start positions and lengths of the + * remaining clones. + * + * @param array $clonePositions the clone positions being filled (start position and length) + * @param int $nodeWordLength the length of the word along the nodes + * @param int $currentNode the node we are currently at + * @param int $distance the distance along the word leading to the current node + * @param int $wordStart the start of the currently searched word + */ private function findRemainingClones( PairList $clonePositions, int $nodeWordLength, int $currentNode, int $distance, - int $wordStart) - { - for ($nextNode = $this->nodeChildFirst[$currentNode]; $nextNode >= 0; $nextNode = $this->nodeChildNext[$nextNode]) { - $node = $this->nodeChildNode[$nextNode]; - $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance - + $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $wordStart); - } - - if ($this->nodeChildFirst[$currentNode] < 0) { - $start = count($this->word) - $distance - $nodeWordLength; - if ($start != $wordStart) { - $clonePositions->add($start, $nodeWordLength); - } - } - } 
- - /** - * This should return true, if the provided character is not allowed to - * match with anything else (e.g. is a sentinel). - */ - protected function mayNotMatch(JavaObjectInterface $character) + int $wordStart + ): void { - return $character instanceof Sentinel; - } + for ($nextNode = $this->nodeChildFirst[$currentNode]; $nextNode >= 0; $nextNode = $this->nodeChildNext[$nextNode]) { + $node = $this->nodeChildNode[$nextNode]; + $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance + + $this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $wordStart); + } - /** - * This method is called whenever the {@link #MAX_LENGTH} is to small and - * hence the {@link #edBuffer} was not large enough. This may cause that a - * really large clone is reported in multiple chunks of size - * {@link #MAX_LENGTH} and potentially minor parts of such a clone might be - * lost. - */ - protected function reportBufferShortage(int $leafStart, int $leafLength) { - echo "Encountered buffer shortage: " . $leafStart . " " . $leafLength . "\n"; + if ($this->nodeChildFirst[$currentNode] < 0) { + $start = count($this->word) - $distance - $nodeWordLength; + + if ($start != $wordStart) { + $clonePositions->add($start, $nodeWordLength); + } + } } } diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php index 82f10efb..490442a7 100644 --- a/src/Detector/Strategy/SuffixTree/CloneInfo.php +++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php @@ -1,22 +1,12 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** Stores information on a clone. */ @@ -24,52 +14,54 @@ class CloneInfo { /** * Length of the clone in tokens. + * * @var int */ public $length; /** - * Position in word list + * Position in word list. 
+ * * @var int */ public $position; - /** - * Number of occurrences of the clone. - * @var int - */ - private $occurrences; - /** * @var PhpToken */ public $token; /** - * Related clones + * Related clones. + * * @var PairList */ public $otherClones; + /** + * Number of occurrences of the clone. + * + * @var int + */ + private $occurrences; + /** Constructor. */ public function __construct(int $length, int $position, int $occurrences, PhpToken $token, PairList $otherClones) { - $this->length = $length; - $this->position = $position; + $this->length = $length; + $this->position = $position; $this->occurrences = $occurrences; - $this->token = $token; + $this->token = $token; $this->otherClones = $otherClones; } /** * Returns whether this clone info dominates the given one, i.e. whether * both {@link #length} and {@link #occurrences} s not smaller. - * - * @param CloneInfo $ci - * @param later The amount the given clone starts later than the "this" clone. - * @return boolean + * + * @param later the amount the given clone starts later than the "this" clone */ - public function dominates(CloneInfo $ci, int $later): bool + public function dominates(self $ci, int $later): bool { return $this->length - $later >= $ci->length && $this->occurrences >= $ci->occurrences; } diff --git a/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php b/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php index 2227696a..e6b12f96 100644 --- a/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php +++ b/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php @@ -1,9 +1,17 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; interface JavaObjectInterface { public function hashCode(): int; - public function equals(JavaObjectInterface $obj): bool; + + public function equals(self $obj): bool; } diff --git a/src/Detector/Strategy/SuffixTree/PairList.php b/src/Detector/Strategy/SuffixTree/PairList.php index ab2a2939..c4e45ee2 100644 --- a/src/Detector/Strategy/SuffixTree/PairList.php +++ b/src/Detector/Strategy/SuffixTree/PairList.php @@ -1,28 +1,19 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** * A list for storing pairs in a specific order. - * + * * @author $Author: hummelb $ + * * @version $Rev: 51770 $ * @ConQAT.Rating GREEN Hash: 7459D6D0F59028B37DD23DD091BDCEEA */ @@ -30,231 +21,247 @@ class PairList { /** * Version used for serialization. + * * @var int */ - private $serialVersionUID = 1; + private $serialVersionUID = 1; /** * The current size. + * * @var int */ - private $size = 0; + private $size = 0; /** * The array used for storing the S. + * * @var object[] */ - private $firstElements; + private $firstElements; /** * The array used for storing the T. - * @var object[] + * + * @var object[] */ - private $secondElements; + private $secondElements; public function __construct(int $initialCapacity = 16) { - if ($initialCapacity < 1) { - $initialCapacity = 1; - } - $this->firstElements = array_fill(0, $initialCapacity, null); + if ($initialCapacity < 1) { + $initialCapacity = 1; + } + $this->firstElements = array_fill(0, $initialCapacity, null); $this->secondElements = array_fill(0, $initialCapacity, null); - } + } - - /** Returns whether the list is empty. */ + /** Returns whether the list is empty. */ public function isEmpty(): bool { - return $this->size == 0; - } + return $this->size == 0; + } - /** Returns the size of the list. 
*/ + /** Returns the size of the list. */ public function size(): int { - return $this->size; - } + return $this->size; + } /** * Add the given pair to the list. - * @return void */ public function add($first, $second): void { - $this->firstElements[$this->size] = $first; - $this->secondElements[$this->size] = $second; - ++$this->size; - } + $this->firstElements[$this->size] = $first; + $this->secondElements[$this->size] = $second; + $this->size++; + } - /** Adds all pairs from another list. */ - public function addAll(PairList $other): void - { - // we have to store this in a local var, as other.$this->size may change if - // other == this - $otherSize = $other->size; - - for ($i = 0; $i < $otherSize; ++$i) { - $this->firstElements[$this->size] = $other->firstElements[$i]; - $this->secondElements[$this->size] = $other->secondElements[$i]; - ++$this->size; - } - } - - /** Make sure there is space for at least the given amount of elements. */ - protected function ensureSpace(int $space): void + /** Adds all pairs from another list. */ + public function addAll(self $other): void { - if ($space <= count($this->firstElements)) { - return; - } - - $oldFirst = $this->firstElements; - $oldSecond = $this->secondElements; - $newSize = count($this->firstElements) * 2; - while ($newSize < $space) { - $newSize *= 2; - } - } - - /** Returns the first element at given index. */ + // we have to store this in a local var, as other.$this->size may change if + // other == this + $otherSize = $other->size; + + for ($i = 0; $i < $otherSize; $i++) { + $this->firstElements[$this->size] = $other->firstElements[$i]; + $this->secondElements[$this->size] = $other->secondElements[$i]; + $this->size++; + } + } + + /** Returns the first element at given index. */ public function getFirst(int $i) { - $this->checkWithinBounds($i); - return $this->firstElements[$i]; - } - - /** - * Checks whether the given $i is within the bounds. Throws an - * exception otherwise. 
- */ - private function checkWithinBounds(int $i): void - { - if ($i < 0 || $i >= $this->size) { - throw new Exception("Out of bounds: " + $i); - } - } + $this->checkWithinBounds($i); - /** Sets the first element at given index. */ + return $this->firstElements[$i]; + } + + /** Sets the first element at given index. */ public function setFirst(int $i, $value): void { - $this->checkWithinBounds($i); - $this->firstElements[$i] = $value; - } + $this->checkWithinBounds($i); + $this->firstElements[$i] = $value; + } - /** Returns the second element at given index. */ + /** Returns the second element at given index. */ public function getSecond(int $i) { - $this->checkWithinBounds($i); - return $this->secondElements[$i]; - } + $this->checkWithinBounds($i); + + return $this->secondElements[$i]; + } - /** Sets the first element at given index. */ + /** Sets the first element at given index. */ public function setSecond(int $i, $value): void { - $this->checkWithinBounds($i); - $this->secondElements[$i] = $value; - } + $this->checkWithinBounds($i); + $this->secondElements[$i] = $value; + } - /** Creates a new list containing all first elements. */ + /** Creates a new list containing all first elements. */ public function extractFirstList(): array { - //array $result = new ArrayList($this->size + 1); - $result = []; - for ($i = 0; $i < $this->size; ++$i) { - $result[] = $this->firstElements[$i]; - } - return $result; - } - - /** Creates a new list containing all second elements. */ + //array $result = new ArrayList($this->size + 1); + $result = []; + + for ($i = 0; $i < $this->size; $i++) { + $result[] = $this->firstElements[$i]; + } + + return $result; + } + + /** Creates a new list containing all second elements. */ public function extractSecondList(): array { - //$result = new ArrayList($this->size + 1); - $result = []; - for ($i = 0; $i < $this->size; ++$i) { - $result[] = $this->secondElements[$i]; - } - return $result; - } - - /** - * Swaps the pairs of this list. 
Is S and T are different types, this will - * be extremely dangerous. - */ + //$result = new ArrayList($this->size + 1); + $result = []; + + for ($i = 0; $i < $this->size; $i++) { + $result[] = $this->secondElements[$i]; + } + + return $result; + } + + /** + * Swaps the pairs of this list. Is S and T are different types, this will + * be extremely dangerous. + */ public function swapPairs(): void { - $temp = $this->firstElements; - $this->firstElements = $this->secondElements; - $this->secondElements = $temp; - } + $temp = $this->firstElements; + $this->firstElements = $this->secondElements; + $this->secondElements = $temp; + } - /** Swaps the entries located at indexes $i and $j. */ + /** Swaps the entries located at indexes $i and $j. */ public function swapEntries(int $i, int $j): void { - $tmp1 = $this->getFirst($i); - $tmp2 = $this->getSecond($i); - $this->setFirst($i, $this->getFirst($j)); - $this->setSecond($i, $this->getSecond($j)); - $this->setFirst($j, $tmp1); - $this->setSecond($j, $tmp2); - } - - /** Clears this list. */ + $tmp1 = $this->getFirst($i); + $tmp2 = $this->getSecond($i); + $this->setFirst($i, $this->getFirst($j)); + $this->setSecond($i, $this->getSecond($j)); + $this->setFirst($j, $tmp1); + $this->setSecond($j, $tmp2); + } + + /** Clears this list. */ public function clear(): void { - $this->size = 0; - } + $this->size = 0; + } - /** Removes the last element of the list. */ + /** Removes the last element of the list. 
*/ public function removeLast(): void { - $this->size -= 1; - } + $this->size--; + } public function toString(): string { - $result = ''; - $result += ('['); - for ($i = 0; $i < $this->size; $i++) { - if ($i != 0) { - $result .= ','; - } - $result .= '('; - $result .= (string) $this->firstElements[$i]; - $result .= ','; - $result .= (string) $this->secondElements[$i]; - $result .= ')'; - } - $result .= ']'; - return $result; - } + $result = ''; + $result += ('['); + + for ($i = 0; $i < $this->size; $i++) { + if ($i != 0) { + $result .= ','; + } + $result .= '('; + $result .= (string) $this->firstElements[$i]; + $result .= ','; + $result .= (string) $this->secondElements[$i]; + $result .= ')'; + } + $result .= ']'; + + return $result; + } public function hashCode(): int { - $prime = 31; - $hash = $this->size; - $hash = $prime * $hash + crc32(serialize($this->firstElements)); - return $prime * $hash + crc32(serialize($this->secondElements)); - } + $prime = 31; + $hash = $this->size; + $hash = $prime * $hash + crc32(serialize($this->firstElements)); - public function equals(PairList $obj): bool + return $prime * $hash + crc32(serialize($this->secondElements)); + } + + public function equals(self $obj): bool { // TODO: Doesn't work in PHP - if ($this === $obj) { - return true; - } - if (!($obj instanceof PairList)) { - return false; - } - - $other = $obj; - if ($this->size !== $other->size) { - return false; - } - for ($i = 0; $i < $this->size; $i++) { - if (!($this->firstElements[$i] == $other->firstElements[$i]) - || !($this->secondElements[$i] != $this->secondElements[$i])) { - return false; - } - } - return true; - } + if ($this === $obj) { + return true; + } + + if (!($obj instanceof self)) { + return false; + } + + $other = $obj; + + if ($this->size !== $other->size) { + return false; + } + + for ($i = 0; $i < $this->size; $i++) { + if (!($this->firstElements[$i] == $other->firstElements[$i]) || + !($this->secondElements[$i] != $this->secondElements[$i])) { + 
return false; + } + } + + return true; + } + + /** Make sure there is space for at least the given amount of elements. */ + protected function ensureSpace(int $space): void + { + if ($space <= count($this->firstElements)) { + return; + } + + $oldFirst = $this->firstElements; + $oldSecond = $this->secondElements; + $newSize = count($this->firstElements) * 2; + + while ($newSize < $space) { + $newSize *= 2; + } + } + + /** + * Checks whether the given $i is within the bounds. Throws an + * exception otherwise. + */ + private function checkWithinBounds(int $i): void + { + if ($i < 0 || $i >= $this->size) { + throw new Exception('Out of bounds: ' + $i); + } + } } diff --git a/src/Detector/Strategy/SuffixTree/PhpToken.php b/src/Detector/Strategy/SuffixTree/PhpToken.php index bb8a559d..ba84ef25 100644 --- a/src/Detector/Strategy/SuffixTree/PhpToken.php +++ b/src/Detector/Strategy/SuffixTree/PhpToken.php @@ -1,13 +1,24 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; class PhpToken implements JavaObjectInterface { public $tokenCode; + public $line; + public $file; + public $tokenName; + public $content; public function __construct( @@ -19,14 +30,16 @@ public function __construct( ) { $this->tokenCode = $tokenCode; $this->tokenName = $tokenName; - $this->line = $line; - $this->content = $content; - $this->file = $file; + $this->line = $line; + $this->content = $content; + $this->file = $file; + } + + public function __toString() + { + return $this->tokenName; } - /** - * @return int - */ public function hashCode(): int { return (int) crc32($this->content); @@ -64,21 +77,16 @@ public function hashCode(): int //return $tokenCode; } - /** - * @return boolean - */ - public function equals(JavaObjectInterface $token): bool { + public function equals(JavaObjectInterface $token): bool + { return $token->hashCode() === $this->hashCode(); } /** * @return string */ - public function toString() { - return $this->tokenName; - } - - public function __tostring() { + public function toString() + { return $this->tokenName; } } diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index 378dff0f..aa30af34 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -1,22 +1,12 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** @@ -40,14 +30,14 @@ public function hashCode(): int return $this->hash; } - public function equals(object $obj): bool + public function equals(JavaObjectInterface $obj): bool { // Original code uses physical object equality, not present in PHP. 
- return $obj instanceof Sentinel; + return $obj instanceof self; } public function toString(): string { - return "$"; + return '$'; } } diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php index d96571d9..b912f765 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php @@ -1,22 +1,12 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** @@ -41,10 +31,10 @@ *

* Everything but the construction itself is protected to simplify increasing * its functionality by subclassing but without introducing new method calls. - * + * * @author Benjamin Hummel * @author $Author: kinnen $ - * + * * @version $Revision: 41751 $ * @ConQAT.Rating GREEN Hash: 4B2EF0606B3085A6831764ED042FF20D */ @@ -52,297 +42,301 @@ class SuffixTree { /** * Infinity in this context. - * @var int + * + * @var int */ - protected $INFTY; + protected $INFTY; /** The word we are working on. - * @var array */ - protected $word; + * @var array */ + protected $word; /** The number of nodes created so far. - * @var int */ - protected $numNodes = 0; + * @var int */ + protected $numNodes = 0; - /** - * For each node this holds the index of the first character of - * {@link #word} labeling the transition to this node. This - * corresponds to the k for a transition used in Ukkonen's paper. + /** + * For each node this holds the index of the first character of + * {@link #word} labeling the transition to this node. This + * corresponds to the k for a transition used in Ukkonen's paper. * * @var int[] - */ - protected $nodeWordBegin; + */ + protected $nodeWordBegin; - /** - * For each node this holds the index of the one after the last character of - * {@link #word} labeling the transition to this node. This - * corresponds to the p for a transition used in Ukkonen's paper. + /** + * For each node this holds the index of the one after the last character of + * {@link #word} labeling the transition to this node. This + * corresponds to the p for a transition used in Ukkonen's paper. * * @var int[] - */ - protected $nodeWordEnd; + */ + protected $nodeWordEnd; /** For each node its suffix link (called function f by Ukkonen). - * @var int[] */ - protected $suffixLink; + * @var int[] */ + protected $suffixLink; - /** - * The next node function realized as a hash table. This corresponds to the - * g function used in Ukkonen's paper. 
+ /** + * The next node function realized as a hash table. This corresponds to the + * g function used in Ukkonen's paper. * * @var SuffixTreeHashTable - */ - protected $nextNode; - - /** - * An array giving for each node the index where the first child will be - * stored (or -1 if it has no children). It is initially empty and will be - * filled "on demand" using - * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} - * . + */ + protected $nextNode; + + /** + * An array giving for each node the index where the first child will be + * stored (or -1 if it has no children). It is initially empty and will be + * filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . * * @var int[] - */ - protected $nodeChildFirst; - - /** - * This array gives the next index of the child list or -1 if this is the - * last one. It is initially empty and will be filled "on demand" using - * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} - * . + */ + protected $nodeChildFirst; + + /** + * This array gives the next index of the child list or -1 if this is the + * last one. It is initially empty and will be filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . * * @var int[] - */ - protected $nodeChildNext; - - /** - * This array stores the actual name (=number) of the mode in the child - * list. It is initially empty and will be filled "on demand" using - * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} - * . + */ + protected $nodeChildNext; + + /** + * This array stores the actual name (=number) of the mode in the child + * list. 
It is initially empty and will be filled "on demand" using + * {@link org.conqat.engine.code_clones.detection.suffixtree.SuffixTreeHashTable#extractChildLists(int[], int[], int[])} + * . * * @var int[] - */ - protected $nodeChildNode; + */ + protected $nodeChildNode; - /** - * The node we are currently at as a "global" variable (as it is always - * passed unchanged). This is called s in Ukkonen's paper. + /** + * The node we are currently at as a "global" variable (as it is always + * passed unchanged). This is called s in Ukkonen's paper. * * @var int - */ - private $currentNode = 0; + */ + private $currentNode = 0; - /** - * Beginning of the word part of the reference pair. This is kept "global" - * (in constrast to the end) as this is passed unchanged to all functions. - * Ukkonen calls this k. + /** + * Beginning of the word part of the reference pair. This is kept "global" + * (in constrast to the end) as this is passed unchanged to all functions. + * Ukkonen calls this k. * * @var int - */ - private $refWordBegin = 0; + */ + private $refWordBegin = 0; - /** - * This is the new (or old) explicit state as returned by - * {@link #testAndSplit(int, Object)}. Ukkonen calls this r. + /** + * This is the new (or old) explicit state as returned by + * {@link #testAndSplit(int, Object)}. Ukkonen calls this r. * * @var int - */ - private $explicitNode; + */ + private $explicitNode; - /** - * Create a new suffix tree from a given word. The word given as parameter - * is used internally and should not be modified anymore, so copy it before - * if required. - * - * @param array $word - */ + /** + * Create a new suffix tree from a given word. The word given as parameter + * is used internally and should not be modified anymore, so copy it before + * if required. 
+ */ public function __construct(array $word) { - $this->word = $word; - $size = count($word); - $this->INFTY = $size; + $this->word = $word; + $size = count($word); + $this->INFTY = $size; - $expectedNodes = 2 * $size; + $expectedNodes = 2 * $size; $this->nodeWordBegin = array_fill(0, $expectedNodes, 0); - $this->nodeWordEnd = array_fill(0, $expectedNodes, 0); - $this->suffixLink = array_fill(0, $expectedNodes, 0); - $this->nextNode = new SuffixTreeHashTable($expectedNodes); + $this->nodeWordEnd = array_fill(0, $expectedNodes, 0); + $this->suffixLink = array_fill(0, $expectedNodes, 0); + $this->nextNode = new SuffixTreeHashTable($expectedNodes); - $this->createRootNode(); + $this->createRootNode(); - for ($i = 0; $i < $size; ++$i) { - $this->update($i); + for ($i = 0; $i < $size; $i++) { + $this->update($i); $this->canonize($i + 1); - } - } + } + } /** - * Creates the root node. + * Returns whether the given word is contained in the string given at + * construction time. * - * @return void + * @return bool */ - private function createRootNode() + public function containsWord(array $find) { - $this->numNodes = 1; - $this->nodeWordBegin[0] = 0; - $this->nodeWordEnd[0] = 0; - $this->suffixLink[0] = -1; - } - - /** - * The update function as defined in Ukkonen's paper. This inserts - * the character at charPos into the tree. It works on the canonical - * reference pair ({@link #currentNode}, ({@link #refWordBegin}, charPos)). 
- * - * @param int $charPos - * @return void - */ - private function update(int $charPos) { - $lastNode = 0; - while (!$this->testAndSplit($charPos, $this->word[$charPos])) { - $newNode = $this->numNodes++; - $this->nodeWordBegin[$newNode] = $charPos; - $this->nodeWordEnd[$newNode] = $this->INFTY; - $this->nextNode->put($this->explicitNode, $this->word[$charPos], $newNode); - - if ($lastNode != 0) { - $this->suffixLink[$lastNode] = $this->explicitNode; - } - $lastNode = $this->explicitNode; - $this->currentNode = $this->suffixLink[$this->currentNode]; - $this->canonize($charPos); - } - if ($lastNode != 0) { - $this->suffixLink[$lastNode] = $this->currentNode; - } - } - - /** - * The test-and-split function as defined in Ukkonen's paper. This - * checks whether the state given by the canonical reference pair ( - * {@link #currentNode}, ({@link #refWordBegin}, refWordEnd)) is the end - * point (by checking whether a transition for the - * nextCharacter exists). Additionally the state is made - * explicit if it not already is and this is not the end-point. It returns - * true if the end-point was reached. The newly created (or reached) - * explicit node is returned in the "global" variable. + $node = 0; + $findSize = count($find); + + for ($i = 0; $i < $findSize;) { + /** @var int */ + $next = $this->nextNode->get($node, $find[$i]); + + if ($next < 0) { + return false; + } + + for ($j = $this->nodeWordBegin[$next]; $j < $this->nodeWordEnd[$next] && $i < $findSize; ++$i, ++$j) { + if (!$this->word[$j]->equals($find[$i])) { + return false; + } + } + $node = $next; + } + + return true; + } + + /** + * This method makes sure the child lists are filled (required for + * traversing the tree). 
+ */ + protected function ensureChildLists(): void + { + if ($this->nodeChildFirst == null || count($this->nodeChildFirst) < $this->numNodes) { + $this->nodeChildFirst = array_fill(0, $this->numNodes, 0); + $this->nodeChildNext = array_fill(0, $this->numNodes, 0); + $this->nodeChildNode = array_fill(0, $this->numNodes, 0); + $this->nextNode->extractChildLists($this->nodeChildFirst, $this->nodeChildNext, $this->nodeChildNode); + } + } + + /** + * Creates the root node. + */ + private function createRootNode(): void + { + $this->numNodes = 1; + $this->nodeWordBegin[0] = 0; + $this->nodeWordEnd[0] = 0; + $this->suffixLink[0] = -1; + } + + /** + * The update function as defined in Ukkonen's paper. This inserts + * the character at charPos into the tree. It works on the canonical + * reference pair ({@link #currentNode}, ({@link #refWordBegin}, charPos)). + */ + private function update(int $charPos): void + { + $lastNode = 0; + + while (!$this->testAndSplit($charPos, $this->word[$charPos])) { + $newNode = $this->numNodes++; + $this->nodeWordBegin[$newNode] = $charPos; + $this->nodeWordEnd[$newNode] = $this->INFTY; + $this->nextNode->put($this->explicitNode, $this->word[$charPos], $newNode); + + if ($lastNode != 0) { + $this->suffixLink[$lastNode] = $this->explicitNode; + } + $lastNode = $this->explicitNode; + $this->currentNode = $this->suffixLink[$this->currentNode]; + $this->canonize($charPos); + } + + if ($lastNode != 0) { + $this->suffixLink[$lastNode] = $this->currentNode; + } + } + + /** + * The test-and-split function as defined in Ukkonen's paper. This + * checks whether the state given by the canonical reference pair ( + * {@link #currentNode}, ({@link #refWordBegin}, refWordEnd)) is the end + * point (by checking whether a transition for the + * nextCharacter exists). Additionally the state is made + * explicit if it not already is and this is not the end-point. It returns + * true if the end-point was reached. 
The newly created (or reached) + * explicit node is returned in the "global" variable. * - * @param int $refWordEnd * @param object $nextCharacter - * @return boolean - */ + * + * @return bool + */ private function testAndSplit(int $refWordEnd, JavaObjectInterface $nextCharacter) { - if ($this->currentNode < 0) { - // trap state is always end state - return true; - } - - if ($refWordEnd <= $this->refWordBegin) { - if ($this->nextNode->get($this->currentNode, $nextCharacter) < 0) { - $this->explicitNode = $this->currentNode; - return false; - } - return true; - } + if ($this->currentNode < 0) { + // trap state is always end state + return true; + } + + if ($refWordEnd <= $this->refWordBegin) { + if ($this->nextNode->get($this->currentNode, $nextCharacter) < 0) { + $this->explicitNode = $this->currentNode; + + return false; + } + + return true; + } /** @var int */ - $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); - if ($nextCharacter->equals($this->word[$this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin])) { - return true; - } - - // not an end-point and not explicit, so make it explicit. - $this->explicitNode = $this->numNodes++; - $this->nodeWordBegin[$this->explicitNode] = $this->nodeWordBegin[$next]; - $this->nodeWordEnd[$this->explicitNode] = $this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin; - $this->nextNode->put($this->currentNode, $this->word[$this->refWordBegin], $this->explicitNode); - - $this->nodeWordBegin[$next] += $refWordEnd - $this->refWordBegin; - $this->nextNode->put($this->explicitNode, $this->word[$this->nodeWordBegin[$next]], $next); - return false; - } - - /** - * The canonize function as defined in Ukkonen's paper. Changes the - * reference pair (currentNode, (refWordBegin, refWordEnd)) into a canonical - * reference pair. It works on the "global" variables {@link #currentNode} - * and {@link #refWordBegin} and the parameter, writing the result back to - * the globals. 
- * - * @param int $refWordEnd one after the end index for the word of the reference pair. - * @return void - */ + $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); + + if ($nextCharacter->equals($this->word[$this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin])) { + return true; + } + + // not an end-point and not explicit, so make it explicit. + $this->explicitNode = $this->numNodes++; + $this->nodeWordBegin[$this->explicitNode] = $this->nodeWordBegin[$next]; + $this->nodeWordEnd[$this->explicitNode] = $this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin; + $this->nextNode->put($this->currentNode, $this->word[$this->refWordBegin], $this->explicitNode); + + $this->nodeWordBegin[$next] += $refWordEnd - $this->refWordBegin; + $this->nextNode->put($this->explicitNode, $this->word[$this->nodeWordBegin[$next]], $next); + + return false; + } + + /** + * The canonize function as defined in Ukkonen's paper. Changes the + * reference pair (currentNode, (refWordBegin, refWordEnd)) into a canonical + * reference pair. It works on the "global" variables {@link #currentNode} + * and {@link #refWordBegin} and the parameter, writing the result back to + * the globals. 
+ * + * @param int $refWordEnd one after the end index for the word of the reference pair + */ private function canonize(int $refWordEnd): void { - if ($this->currentNode === -1) { - // explicitly handle trap state - $this->currentNode = 0; - $this->refWordBegin++; - } + if ($this->currentNode === -1) { + // explicitly handle trap state + $this->currentNode = 0; + $this->refWordBegin++; + } - if ($refWordEnd <= $this->refWordBegin) { - // empty word, so already canonical - return; - } + if ($refWordEnd <= $this->refWordBegin) { + // empty word, so already canonical + return; + } /** @var int */ $next = $this->nextNode->get( $this->currentNode, $this->word[$this->refWordBegin] ); - while ($this->nodeWordEnd[$next] - $this->nodeWordBegin[$next] <= $refWordEnd - - $this->refWordBegin) { - $this->refWordBegin += $this->nodeWordEnd[$next] - $this->nodeWordBegin[$next]; - $this->currentNode = $next; - if ($refWordEnd > $this->refWordBegin) { - $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); - } else { - break; - } + + while ($this->nodeWordEnd[$next] - $this->nodeWordBegin[$next] <= $refWordEnd + - $this->refWordBegin) { + $this->refWordBegin += $this->nodeWordEnd[$next] - $this->nodeWordBegin[$next]; + $this->currentNode = $next; + + if ($refWordEnd > $this->refWordBegin) { + $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); + } else { + break; + } } } - - /** - * This method makes sure the child lists are filled (required for - * traversing the tree). 
- * - * @return void - */ - protected function ensureChildLists() - { - if ($this->nodeChildFirst == null || count($this->nodeChildFirst) < $this->numNodes) { - $this->nodeChildFirst = array_fill(0, $this->numNodes, 0); - $this->nodeChildNext = array_fill(0, $this->numNodes, 0); - $this->nodeChildNode = array_fill(0, $this->numNodes, 0); - $this->nextNode->extractChildLists($this->nodeChildFirst, $this->nodeChildNext, $this->nodeChildNode); - } - } - - /** - * Returns whether the given word is contained in the string given at - * construction time. - * - * @param array $find - * @return boolean - */ - public function containsWord(array $find) { - $node = 0; - $findSize = count($find); - for ($i = 0; $i < $findSize;) { - /** @var int */ - $next = $this->nextNode->get($node, $find[$i]); - if ($next < 0) { - return false; - } - for ($j = $this->nodeWordBegin[$next]; $j < $this->nodeWordEnd[$next] && $i < $findSize; ++$i, ++$j) { - if (!$this->word[$j]->equals($find[$i])) { - return false; - } - } - $node = $next; - } - return true; - } } diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php index 44827705..2eeb90d8 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php @@ -1,22 +1,12 @@ - + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; /** @@ -27,215 +17,229 @@ * It hashes from (node, character) pairs to the next node, where nodes are * represented by integers and the type of characters is determined by the * generic parameter. 
- * + * * @author Benjamin Hummel * @author $Author: juergens $ - * + * * @version $Revision: 34670 $ * @ConQAT.Rating GREEN Hash: 6A7A830078AF0CA9C2D84C148F336DF4 */ class SuffixTreeHashTable { - /** - * These numbers were taken from - * http://planetmath.org/encyclopedia/GoodHashTablePrimes.html + /** + * These numbers were taken from + * http://planetmath.org/encyclopedia/GoodHashTablePrimes.html. + * * @var int[] - */ - private $allowedSizes = [ 53, 97, 193, 389, 769, 1543, - 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, - 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, - 201326611, 402653189, 805306457, 1610612741 ]; + */ + private $allowedSizes = [53, 97, 193, 389, 769, 1543, + 3079, 6151, 12289, 24593, 49157, 98317, 196613, 393241, 786433, + 1572869, 3145739, 6291469, 12582917, 25165843, 50331653, 100663319, + 201326611, 402653189, 805306457, 1610612741, ]; /** * The size of the hash table. + * * @var int */ - private $tableSize; + private $tableSize; /** - * Storage space for the node part of the key + * Storage space for the node part of the key. + * * @var int[] */ - private $keyNodes; + private $keyNodes; /** * Storage space for the character part of the key. + * * @var object[] */ - private $keyChars; + private $keyChars; /** * Storage space for the result node. + * * @var int[] */ - private $resultNodes; + private $resultNodes; /** * Debug info: number of stored nodes. - * @var int + * + * @var int */ - private $_numStoredNodes = 0; + private $_numStoredNodes = 0; /** * Debug info: number of calls to find so far. + * * @var int */ - private $_numFind = 0; + private $_numFind = 0; /** * Debug info: number of collisions (i.e. wrong finds) during find so far. + * * @var int */ - private $_numColl = 0; + private $_numColl = 0; - /** - * Creates a new hash table for the given number of nodes. Trying to add - * more nodes will result in worse performance down to entering an infinite - * loop on some operations. 
- * - * @param int $numNodes - */ + /** + * Creates a new hash table for the given number of nodes. Trying to add + * more nodes will result in worse performance down to entering an infinite + * loop on some operations. + */ public function __construct(int $numNodes) { - $minSize = (int) ceil(1.5 * $numNodes); - $sizeIndex = 0; - while ($this->allowedSizes[$sizeIndex] < $minSize) { - $sizeIndex++; - } - $this->tableSize = $this->allowedSizes[$sizeIndex]; - - $this->keyNodes = array_fill(0, $this->tableSize, 0); - $this->keyChars = array_fill(0, $this->tableSize, null); - $this->resultNodes = array_fill(0, $this->tableSize, 0); - } - - /** - * Returns the position of the (node,char) key in the hash map or the - * position to insert it into if it is not yet in. + $minSize = (int) ceil(1.5 * $numNodes); + $sizeIndex = 0; + + while ($this->allowedSizes[$sizeIndex] < $minSize) { + $sizeIndex++; + } + $this->tableSize = $this->allowedSizes[$sizeIndex]; + + $this->keyNodes = array_fill(0, $this->tableSize, 0); + $this->keyChars = array_fill(0, $this->tableSize, null); + $this->resultNodes = array_fill(0, $this->tableSize, 0); + } + + /** + * Returns the next node for the given (node, character) key pair or a + * negative value if no next node is stored for this key. + */ + public function get(int $keyNode, JavaObjectInterface $keyChar): int + { + $pos = $this->hashFind($keyNode, $keyChar); + + if ($this->keyChars[$pos] === null) { + return -1; + } + + return $this->resultNodes[$pos]; + } + + /** + * Inserts the given result node for the (node, character) key pair. 
+ */ + public function put(int $keyNode, JavaObjectInterface $keyChar, int $resultNode): void + { + $pos = $this->hashFind($keyNode, $keyChar); + + if ($this->keyChars[$pos] == null) { + $this->_numStoredNodes++; + $this->keyChars[$pos] = $keyChar; + $this->keyNodes[$pos] = $keyNode; + } + $this->resultNodes[$pos] = $resultNode; + } + + /** + * Extracts the list of child nodes for each node from the hash table + * entries as a linked list. All arrays are expected to be initially empty + * and of suitable size (i.e. for n nodes it should have size + * n given that nodes are numbered 0 to n-1). Those arrays will be + * filled from this method. + *

+ * The method is package visible, as it is tighly coupled to the + * {@link SuffixTree} class. + * + * @param int[] nodeFirstIndex an array giving for each node the index where the first child + * will be stored (or -1 if it has no children) + * @param int[] nodeNextIndex this array gives the next index of the child list or -1 if + * this is the last one + * @param int[] nodeChild this array stores the actual name (=number) of the mode in the + * child list + * + * @throws ArrayIndexOutOfBoundsException if any of the given arrays was too small + */ + public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild): void + { + // Instead of Arrays.fill($nodeFirstIndex, -1); + foreach ($nodeFirstIndex as $k => $v) { + $nodeFirstIndex[$k] = -1; + } + $free = 0; + + for ($i = 0; $i < $this->tableSize; $i++) { + if ($this->keyChars[$i] !== null) { + // insert $this->keyNodes[$i] -> $this->resultNodes[$i] + $nodeChild[$free] = $this->resultNodes[$i]; + $nodeNextIndex[$free] = $nodeFirstIndex[$this->keyNodes[$i]]; + $nodeFirstIndex[$this->keyNodes[$i]] = $free++; + } + } + } + + /** + * Returns the position of the (node,char) key in the hash map or the + * position to insert it into if it is not yet in. 
* - * @param int $keyNode - * @param JavaObjectInterface $keyChar * @return int - */ + */ private function hashFind(int $keyNode, JavaObjectInterface $keyChar) { - ++$this->_numFind; + $this->_numFind++; /** @var int */ - $hash = $keyChar->hashCode(); + $hash = $keyChar->hashCode(); /** @var int */ - $pos = $this->posMod($this->primaryHash($keyNode, $hash)); + $pos = $this->posMod($this->primaryHash($keyNode, $hash)); /** @var int */ - $secondary = $this->secondaryHash($keyNode, $hash); - while ($this->keyChars[$pos] !== null) { - if ($this->keyNodes[$pos] === $keyNode && $keyChar->equals($this->keyChars[$pos])) { - break; - } - ++$this->_numColl; - $pos = ($pos + $secondary) % $this->tableSize; - } - return $pos; - } - - /** - * Returns the next node for the given (node, character) key pair or a - * negative value if no next node is stored for this key. - * - * @return int - */ - public function get(int $keyNode, JavaObjectInterface $keyChar): int - { - $pos = $this->hashFind($keyNode, $keyChar); - if ($this->keyChars[$pos] === null) { - return -1; - } - return $this->resultNodes[$pos]; - } + $secondary = $this->secondaryHash($keyNode, $hash); + + while ($this->keyChars[$pos] !== null) { + if ($this->keyNodes[$pos] === $keyNode && $keyChar->equals($this->keyChars[$pos])) { + break; + } + $this->_numColl++; + $pos = ($pos + $secondary) % $this->tableSize; + } - /** - * Inserts the given result node for the (node, character) key pair. - * @return void - */ - public function put(int $keyNode, JavaObjectInterface $keyChar, int $resultNode) - { - $pos = $this->hashFind($keyNode, $keyChar); - if ($this->keyChars[$pos] == null) { - ++$this->_numStoredNodes; - $this->keyChars[$pos] = $keyChar; - $this->keyNodes[$pos] = $keyNode; - } - $this->resultNodes[$pos] = $resultNode; - } + return $pos; + } /** * Returns the primary hash value for a (node, character) key pair. 
+ * * @return int */ private function primaryHash(int $keyNode, int $keyCharHash) { - $res = $keyCharHash ^ (13 * $keyNode); - return $res; - } + return $keyCharHash ^ (13 * $keyNode); + } /** * Returns the secondary hash value for a (node, character) key pair. + * * @return int */ private function secondaryHash(int $keyNode, int $keyCharHash) { - $result = $this->posMod(($keyCharHash ^ (1025 * $keyNode))); - if ($result == 0) { - return 2; - } - return $result; - } - - /** - * Returns the smallest non-negative number congruent to x modulo - * {@link #tableSize}. + $result = $this->posMod(($keyCharHash ^ (1025 * $keyNode))); + + if ($result == 0) { + return 2; + } + + return $result; + } + + /** + * Returns the smallest non-negative number congruent to x modulo + * {@link #tableSize}. + * * @return int - */ + */ private function posMod(int $x) { - $x %= $this->tableSize; - if ($x < 0) { - $x += $this->tableSize; - } - return $x; - } - - /** - * Extracts the list of child nodes for each node from the hash table - * entries as a linked list. All arrays are expected to be initially empty - * and of suitable size (i.e. for n nodes it should have size - * n given that nodes are numbered 0 to n-1). Those arrays will be - * filled from this method. - *

- * The method is package visible, as it is tighly coupled to the - * {@link SuffixTree} class. - * - * @param int[] nodeFirstIndex an array giving for each node the index where the first child - * will be stored (or -1 if it has no children). - * @param int[] nodeNextIndex this array gives the next index of the child list or -1 if - * this is the last one. - * @param int[] nodeChild this array stores the actual name (=number) of the mode in the - * child list. - * @return void - * @throws ArrayIndexOutOfBoundsException if any of the given arrays was too small. - */ - public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild) - { - // Instead of Arrays.fill($nodeFirstIndex, -1); - foreach ($nodeFirstIndex as $k => $v) { - $nodeFirstIndex[$k] = -1; + $x %= $this->tableSize; + + if ($x < 0) { + $x += $this->tableSize; } - $free = 0; - for ($i = 0; $i < $this->tableSize; ++$i) { - if ($this->keyChars[$i] !== null) { - // insert $this->keyNodes[$i] -> $this->resultNodes[$i] - $nodeChild[$free] = $this->resultNodes[$i]; - $nodeNextIndex[$free] = $nodeFirstIndex[$this->keyNodes[$i]]; - $nodeFirstIndex[$this->keyNodes[$i]] = $free++; - } - } - } + + return $x; + } } diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index e7283d4e..82634bf1 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -1,4 +1,4 @@ -config = $config; - $content = file_get_contents($file); - $tokens = token_get_all($content); + $content = file_get_contents($file); + $tokens = token_get_all($content); foreach (array_keys($tokens) as $key) { $token = $tokens[$key]; @@ -79,6 +74,7 @@ public function postProcess(): void foreach ($cloneInfos as $cloneInfo) { /** @var int[] */ $others = $cloneInfo->otherClones->extractFirstList(); + for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; /** @var PhpToken */ From 
eea6450515c638976099d6b0e24463ce107bf751 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Wed, 23 Jun 2021 21:20:54 +0200 Subject: [PATCH 18/29] Psalm fixes (WIP) --- src/Detector/Strategy/AbstractStrategy.php | 9 ++++++++- src/Detector/Strategy/DefaultStrategy.php | 16 ++++++++-------- src/Detector/Strategy/SuffixTree/Sentinel.php | 6 ++++++ src/Detector/Strategy/SuffixTreeStrategy.php | 17 ++++++++++------- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index ae33870f..1d30cc20 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -37,7 +37,14 @@ abstract class AbstractStrategy T_NS_SEPARATOR => true, ]; - abstract public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void; + protected $config; + + public function __construct(StrategyConfiguration $config) + { + $this->config = $config; + } + + abstract public function processFile(string $file, CodeCloneMap $result): void; public function postProcess(): void { diff --git a/src/Detector/Strategy/DefaultStrategy.php b/src/Detector/Strategy/DefaultStrategy.php index d2cf020b..7a90dd05 100644 --- a/src/Detector/Strategy/DefaultStrategy.php +++ b/src/Detector/Strategy/DefaultStrategy.php @@ -44,7 +44,7 @@ final class DefaultStrategy extends AbstractStrategy */ protected $hashes = []; - public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void + public function processFile(string $file, CodeCloneMap $result): void { $buffer = file_get_contents($file); $currentTokenPositions = []; @@ -72,7 +72,7 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur $currentTokenRealPositions[$tokenNr++] = $token[2]; - if ($config->getFuzzy() && $token[0] === T_VARIABLE) { + if ($this->config->getFuzzy() && $token[0] === T_VARIABLE) { $token[1] = 'variable'; } @@ 
-90,7 +90,7 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur $found = false; $tokenNr = 0; - while ($tokenNr <= $count - $config->getMinTokens()) { + while ($tokenNr <= $count - $this->config->getMinTokens()) { $line = $currentTokenPositions[$tokenNr]; $realLine = $currentTokenRealPositions[$tokenNr]; @@ -99,7 +99,7 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur substr( $currentSignature, $tokenNr * 5, - $config->getMinTokens() * 5 + $this->config->getMinTokens() * 5 ), true ), @@ -120,13 +120,13 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $config->getMinTokens() - 1; + $lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $config->getMinLines() && + if ($numLines >= $this->config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( @@ -152,13 +152,13 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur if ($found) { $fileA = $this->hashes[$firstHash][0]; $firstLineA = $this->hashes[$firstHash][1]; - $lastToken = ($tokenNr - 1) + $config->getMinTokens() - 1; + $lastToken = ($tokenNr - 1) + $this->config->getMinTokens() - 1; $lastLine = $currentTokenPositions[$lastToken]; $lastRealLine = $currentTokenRealPositions[$lastToken]; $numLines = $lastLine + 1 - $firstLine; $realNumLines = $lastRealLine + 1 - $firstRealLine; - if ($numLines >= $config->getMinLines() && + if ($numLines >= $this->config->getMinLines() && ($fileA !== $file || $firstLineA !== $firstRealLine)) { $result->add( new CodeClone( diff --git 
a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index aa30af34..569fe42a 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -20,6 +20,12 @@ class Sentinel implements JavaObjectInterface /** The hash value used. */ private $hash; + /** @var int Needed for compatiblity with PhpToken */ + public $line = -1; + + /** @var string Needed for compatiblity with PhpToken */ + public $file = "" + public function __construct() { $this->hash = (int) rand(0, PHP_INT_MAX); diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 82634bf1..1e9a41e2 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -13,6 +13,7 @@ use function file_get_contents; use function is_array; use function token_get_all; +use Exception; use SebastianBergmann\PHPCPD\CodeClone; use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; @@ -20,22 +21,22 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\JavaObjectInterface; final class SuffixTreeStrategy extends AbstractStrategy { /** - * @var PhpToken[] + * @var Token[] */ private $word = []; /** - * @var StrategyConfiguration + * @var ?CodeCloneMap */ - private $config; + private $result; - public function processFile(string $file, CodeCloneMap $result, StrategyConfiguration $config): void + public function processFile(string $file, CodeCloneMap $result): void { - $this->config = $config; $content = file_get_contents($file); $tokens = token_get_all($content); @@ -60,6 +61,10 @@ public function processFile(string $file, CodeCloneMap $result, StrategyConfigur public function postProcess(): void { + 
if (empty($this->result)) { + throw new Exception('Missing result'); + } + // Sentinel = End of word $this->word[] = new Sentinel(); @@ -77,9 +82,7 @@ public function postProcess(): void for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; - /** @var PhpToken */ $t = $this->word[$otherStart]; - /** @var PhpToken */ $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 1]; // If we stumbled upon the Sentinel, rewind one step. if ($lastToken instanceof Sentinel) { From e24e902217ea06f967a36b9dd0f2fca8469e3b1a Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Wed, 23 Jun 2021 21:45:18 +0200 Subject: [PATCH 19/29] Psalm fixes (WIP) --- .psalm/baseline.xml | 4 ++- src/CLI/Application.php | 8 +++--- src/CLI/ArgumentsBuilder.php | 2 +- src/Detector/Detector.php | 3 +-- .../Strategy/SuffixTree/AbstractToken.php | 27 +++++++++++++++++++ .../ApproximateCloneDetectingSuffixTree.php | 14 ++++------ src/Detector/Strategy/SuffixTree/Sentinel.php | 2 +- .../Strategy/SuffixTree/SuffixTree.php | 8 +++--- .../SuffixTree/{PhpToken.php => Token.php} | 20 +------------- src/Detector/Strategy/SuffixTreeStrategy.php | 6 ++--- 10 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 src/Detector/Strategy/SuffixTree/AbstractToken.php rename src/Detector/Strategy/SuffixTree/{PhpToken.php => Token.php} (88%) diff --git a/.psalm/baseline.xml b/.psalm/baseline.xml index 3725f50d..4305d6d9 100644 --- a/.psalm/baseline.xml +++ b/.psalm/baseline.xml @@ -15,13 +15,15 @@ $argv - + $option[0] $option[1] $option[1] $option[1] $option[1] $option[1] + $option[1] + $option[1] $directories diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 26974701..8e1b09ea 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -68,7 +68,7 @@ public function run(array $argv): int $config = new StrategyConfiguration($arguments); - $strategy = $this->pickStrategy($arguments->algorithm()); + $strategy = 
$this->pickStrategy($arguments->algorithm(), $config); $timer = new Timer; $timer->start(); @@ -97,15 +97,15 @@ private function printVersion(): void ); } - private function pickStrategy(?string $algorithm): AbstractStrategy + private function pickStrategy(?string $algorithm, StrategyConfiguration $config): AbstractStrategy { switch ($algorithm) { case null: case 'rabin-karp': - return new DefaultStrategy(); + return new DefaultStrategy($config); case 'suffixtree': - return new SuffixTreeStrategy(); + return new SuffixTreeStrategy($config); default: throw new Exception('Unsupported algorithm: ' . $algorithm); diff --git a/src/CLI/ArgumentsBuilder.php b/src/CLI/ArgumentsBuilder.php index babf87c3..a92c5a52 100644 --- a/src/CLI/ArgumentsBuilder.php +++ b/src/CLI/ArgumentsBuilder.php @@ -120,7 +120,7 @@ public function build(array $argv): Arguments break; case '--algorithm': - $algorithm = $option[1]; + $algorithm = (string) $option[1]; break; } diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index 8c2e737d..25a55b97 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -36,8 +36,7 @@ public function copyPasteDetection(iterable $files, StrategyConfiguration $confi $this->strategy->processFile( $file, - $result, - $config + $result ); } diff --git a/src/Detector/Strategy/SuffixTree/AbstractToken.php b/src/Detector/Strategy/SuffixTree/AbstractToken.php new file mode 100644 index 00000000..23764564 --- /dev/null +++ b/src/Detector/Strategy/SuffixTree/AbstractToken.php @@ -0,0 +1,27 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ +namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; + +abstract class AbstractToken +{ + /** @var string */ + public $tokenCode; + /** @var int */ + public $line; + /** @var string */ + public $file; + /** @var string */ + public $tokenName; + /** @var string */ + public $content; + abstract public function hashCode(): int; + abstract public function equals(AbstractToken $obj): bool; + abstract public function __toString(): string; +} diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index 97ae12b1..189c5283 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -25,14 +25,14 @@ class ApproximateCloneDetectingSuffixTree extends SuffixTree * * @var int */ - protected $minLength; + protected $minLength = 70; /** * The number of leaves reachable from the given node (1 for leaves). * * @var int[] * */ - private $leafCount; + private $leafCount = []; /** * This is the distance between two entries in the {@link #cloneInfos} map. @@ -69,7 +69,7 @@ class ApproximateCloneDetectingSuffixTree extends SuffixTree * * @var int */ - private $headEquality; + private $headEquality = 10; /** * Create a new suffix tree from a given word. 
The word given as parameter @@ -106,10 +106,9 @@ public function __construct(array $word) * @param int $minLength the minimal length of a clone in tokens (not lines) * @param int $maxErrors the maximal number of errors/gaps allowed * @param int $headEquality the number of elements which have to be the same at the beginning of a clone - * - * @throws ConQATException + * @return CloneInfo[] */ - public function findClones(int $minLength, int $maxErrors, int $headEquality) + public function findClones(int $minLength, int $maxErrors, int $headEquality): array { $this->minLength = $minLength; $this->headEquality = $headEquality; @@ -236,8 +235,6 @@ private function initLeafCount(int $node): void * different from the length along the input word due to gaps) * @param int $maxErrors the number of errors still allowed * - * @throws ConQATException - * * @return bool whether some clone was reported */ private function matchWord(int $wordStart, int $wordPosition, int $node, int $nodeWordLength, int $maxErrors) @@ -451,7 +448,6 @@ private function calculateMaxLength( } /** - * @throws ConQATException */ private function reportClone( int $wordBegin, diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index 569fe42a..a74854a8 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -15,7 +15,7 @@ * it to the suffix tree. For the sentinel equality and object identity are * the same! */ -class Sentinel implements JavaObjectInterface +class Sentinel extends AbstractToken { /** The hash value used. */ private $hash; diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php index b912f765..64e5d083 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php @@ -48,7 +48,7 @@ class SuffixTree protected $INFTY; /** The word we are working on. 
- * @var array */ + * @var AbstractToken[] */ protected $word; /** The number of nodes created so far. @@ -94,7 +94,7 @@ class SuffixTree * * @var int[] */ - protected $nodeChildFirst; + protected $nodeChildFirst = []; /** * This array gives the next index of the child list or -1 if this is the @@ -104,7 +104,7 @@ class SuffixTree * * @var int[] */ - protected $nodeChildNext; + protected $nodeChildNext = []; /** * This array stores the actual name (=number) of the mode in the child @@ -114,7 +114,7 @@ class SuffixTree * * @var int[] */ - protected $nodeChildNode; + protected $nodeChildNode = []; /** * The node we are currently at as a "global" variable (as it is always diff --git a/src/Detector/Strategy/SuffixTree/PhpToken.php b/src/Detector/Strategy/SuffixTree/Token.php similarity index 88% rename from src/Detector/Strategy/SuffixTree/PhpToken.php rename to src/Detector/Strategy/SuffixTree/Token.php index ba84ef25..01836851 100644 --- a/src/Detector/Strategy/SuffixTree/PhpToken.php +++ b/src/Detector/Strategy/SuffixTree/Token.php @@ -9,18 +9,8 @@ */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; -class PhpToken implements JavaObjectInterface +class PhpToken extends AbstractToken { - public $tokenCode; - - public $line; - - public $file; - - public $tokenName; - - public $content; - public function __construct( int $tokenCode, string $tokenName, @@ -81,12 +71,4 @@ public function equals(JavaObjectInterface $token): bool { return $token->hashCode() === $this->hashCode(); } - - /** - * @return string - */ - public function toString() - { - return $this->tokenName; - } } diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 1e9a41e2..52f27ce8 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -19,14 +19,14 @@ use SebastianBergmann\PHPCPD\CodeCloneMap; use 
SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PhpToken; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\AbstractToken; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\JavaObjectInterface; final class SuffixTreeStrategy extends AbstractStrategy { /** - * @var Token[] + * @var AbstractToken[] */ private $word = []; From fe9e37b559a4b956aacf5859b6cb77547a63b74c Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 24 Jun 2021 20:45:47 +0200 Subject: [PATCH 20/29] Replace JavaObjectInterface with AbstractToken --- src/CLI/Application.php | 5 +---- src/Detector/Detector.php | 2 +- .../ApproximateCloneDetectingSuffixTree.php | 16 ++++++++-------- src/Detector/Strategy/SuffixTree/CloneInfo.php | 4 ++-- .../Strategy/SuffixTree/JavaObjectInterface.php | 17 ----------------- src/Detector/Strategy/SuffixTree/Sentinel.php | 8 +------- src/Detector/Strategy/SuffixTree/SuffixTree.php | 6 +----- .../Strategy/SuffixTree/SuffixTreeHashTable.php | 6 +++--- src/Detector/Strategy/SuffixTree/Token.php | 4 ++-- src/Detector/Strategy/SuffixTreeStrategy.php | 2 +- 10 files changed, 20 insertions(+), 50 deletions(-) delete mode 100644 src/Detector/Strategy/SuffixTree/JavaObjectInterface.php diff --git a/src/CLI/Application.php b/src/CLI/Application.php index 8e1b09ea..5a4a6ebd 100644 --- a/src/CLI/Application.php +++ b/src/CLI/Application.php @@ -73,10 +73,7 @@ public function run(array $argv): int $timer = new Timer; $timer->start(); - $clones = (new Detector($strategy))->copyPasteDetection( - $files, - $config - ); + $clones = (new Detector($strategy))->copyPasteDetection($files); (new Text)->printResult($clones, $arguments->verbose()); diff --git 
a/src/Detector/Detector.php b/src/Detector/Detector.php index 25a55b97..eb2a89d7 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -25,7 +25,7 @@ public function __construct(AbstractStrategy $strategy) $this->strategy = $strategy; } - public function copyPasteDetection(iterable $files, StrategyConfiguration $config): CodeCloneMap + public function copyPasteDetection(iterable $files): CodeCloneMap { $result = new CodeCloneMap; diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index 189c5283..ca5ab9fd 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -167,7 +167,7 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; - /** @var PhpToken */ + /** @var AbstractToken */ $t = $this->word[$otherStart]; } } @@ -188,9 +188,9 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a * This should return true, if the provided character is not allowed to * match with anything else (e.g. is a sentinel). 
*/ - protected function mayNotMatch(JavaObjectInterface $character) + protected function mayNotMatch(AbstractToken $token) { - return $character instanceof Sentinel; + return $token instanceof Sentinel; } /** @@ -477,7 +477,7 @@ private function reportClone( $occurrences = 1 + $otherClones->size(); // check whether we may start from here - /** @var PhpToken */ + /** @var AbstractToken */ $t = $this->word[$wordBegin]; /** @var CloneInfo */ $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); @@ -501,7 +501,7 @@ private function reportClone( for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); } - /** @var PhpToken */ + /** @var AbstractToken */ $t = $this->word[$wordBegin]; for ($clone = 0; $clone < $otherClones->size(); $clone++) { @@ -509,7 +509,7 @@ private function reportClone( $otherLength = $otherClones->getSecond($clone); for ($j = 0; $j < $otherLength; $j++) { - /** @var PhpToken */ + /** @var AbstractToken */ $r = $this->word[$j + $start]; } @@ -532,9 +532,9 @@ private function reportClone( */ private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) { - /** @var JavaObjectInterface */ + /** @var AbstractToken */ $iChar = $this->word[$iOffset + $i - 1]; - /** @var JavaObjectInterface */ + /** @var AbstractToken */ $jChar = $this->word[$jOffset + $j - 1]; $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php index 490442a7..687dfbd5 100644 --- a/src/Detector/Strategy/SuffixTree/CloneInfo.php +++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php @@ -27,7 +27,7 @@ class CloneInfo public $position; /** - * @var PhpToken + * @var AbstractToken */ public $token; @@ -46,7 +46,7 @@ class CloneInfo private $occurrences; /** Constructor. 
*/ - public function __construct(int $length, int $position, int $occurrences, PhpToken $token, PairList $otherClones) + public function __construct(int $length, int $position, int $occurrences, Token $token, PairList $otherClones) { $this->length = $length; $this->position = $position; diff --git a/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php b/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php deleted file mode 100644 index e6b12f96..00000000 --- a/src/Detector/Strategy/SuffixTree/JavaObjectInterface.php +++ /dev/null @@ -1,17 +0,0 @@ - - * - * For the full copyright and license information, please view the LICENSE - * file that was distributed with this source code. - */ -namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; - -interface JavaObjectInterface -{ - public function hashCode(): int; - - public function equals(self $obj): bool; -} diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index a74854a8..e5861121 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -20,12 +20,6 @@ class Sentinel extends AbstractToken /** The hash value used. */ private $hash; - /** @var int Needed for compatiblity with PhpToken */ - public $line = -1; - - /** @var string Needed for compatiblity with PhpToken */ - public $file = "" - public function __construct() { $this->hash = (int) rand(0, PHP_INT_MAX); @@ -36,7 +30,7 @@ public function hashCode(): int return $this->hash; } - public function equals(JavaObjectInterface $obj): bool + public function equals(AbstractToken $obj): bool { // Original code uses physical object equality, not present in PHP. 
return $obj instanceof self; diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php index 64e5d083..91898223 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php @@ -258,12 +258,8 @@ private function update(int $charPos): void * explicit if it not already is and this is not the end-point. It returns * true if the end-point was reached. The newly created (or reached) * explicit node is returned in the "global" variable. - * - * @param object $nextCharacter - * - * @return bool */ - private function testAndSplit(int $refWordEnd, JavaObjectInterface $nextCharacter) + private function testAndSplit(int $refWordEnd, AbstractToken $nextCharacter): bool { if ($this->currentNode < 0) { // trap state is always end state diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php index 2eeb90d8..7993fa24 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php @@ -110,7 +110,7 @@ public function __construct(int $numNodes) * Returns the next node for the given (node, character) key pair or a * negative value if no next node is stored for this key. */ - public function get(int $keyNode, JavaObjectInterface $keyChar): int + public function get(int $keyNode, AbstractToken $keyChar): int { $pos = $this->hashFind($keyNode, $keyChar); @@ -124,7 +124,7 @@ public function get(int $keyNode, JavaObjectInterface $keyChar): int /** * Inserts the given result node for the (node, character) key pair. 
*/ - public function put(int $keyNode, JavaObjectInterface $keyChar, int $resultNode): void + public function put(int $keyNode, AbstractToken $keyChar, int $resultNode): void { $pos = $this->hashFind($keyNode, $keyChar); @@ -179,7 +179,7 @@ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, * * @return int */ - private function hashFind(int $keyNode, JavaObjectInterface $keyChar) + private function hashFind(int $keyNode, AbstractToken $keyChar) { $this->_numFind++; /** @var int */ diff --git a/src/Detector/Strategy/SuffixTree/Token.php b/src/Detector/Strategy/SuffixTree/Token.php index 01836851..e12cca83 100644 --- a/src/Detector/Strategy/SuffixTree/Token.php +++ b/src/Detector/Strategy/SuffixTree/Token.php @@ -9,7 +9,7 @@ */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; -class PhpToken extends AbstractToken +class Token extends AbstractToken { public function __construct( int $tokenCode, @@ -67,7 +67,7 @@ public function hashCode(): int //return $tokenCode; } - public function equals(JavaObjectInterface $token): bool + public function equals(AbstractToken $token): bool { return $token->hashCode() === $this->hashCode(); } diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 52f27ce8..aecb6a65 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -45,7 +45,7 @@ public function processFile(string $file, CodeCloneMap $result): void if (is_array($token)) { if (!isset($this->tokensIgnoreList[$token[0]])) { - $this->word[] = new PhpToken( + $this->word[] = new Token( $token[0], token_name($token[0]), $token[2], From 6775c1dd53d5225d18b2293eee07b0c4bc48272c Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 24 Jun 2021 20:50:31 +0200 Subject: [PATCH 21/29] Make it run --- .../SuffixTree/ApproximateCloneDetectingSuffixTree.php | 10 +++++----- src/Detector/Strategy/SuffixTree/Sentinel.php | 2 +- 
src/Detector/Strategy/SuffixTree/Token.php | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index ca5ab9fd..5cf18100 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -46,7 +46,6 @@ class ApproximateCloneDetectingSuffixTree extends SuffixTree * * @var array */ - //private final ListMap cloneInfos = new ListMap(); private $cloneInfos = []; /** @@ -79,14 +78,14 @@ class ApproximateCloneDetectingSuffixTree extends SuffixTree * This only word correctly if the given word is closed using a sentinel * character. * - * @param array $word List of tokens to analyze + * @param AbstractToken[] $word List of tokens to analyze */ public function __construct(array $word) { + parent::__construct($word); + $arr = array_fill(0, $this->MAX_LENGTH, 0); $this->edBuffer = array_fill(0, $this->MAX_LENGTH, $arr); - - parent::__construct($word); $this->ensureChildLists(); $this->leafCount = array_fill(0, $this->numNodes, 0); $this->initLeafCount(0); @@ -147,9 +146,10 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a $map = []; for ($index = 0; $index <= count($this->word); $index++) { + /** @var AbstractToken|null */ $existingClones = $this->cloneInfos[$index] ?? 
null; - if ($existingClones != null) { + if ($existingClones !== null) { foreach ($existingClones as $ci) { // length = number of tokens // TODO: min token length diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index e5861121..04ea7232 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -36,7 +36,7 @@ public function equals(AbstractToken $obj): bool return $obj instanceof self; } - public function toString(): string + public function __toString(): string { return '$'; } diff --git a/src/Detector/Strategy/SuffixTree/Token.php b/src/Detector/Strategy/SuffixTree/Token.php index e12cca83..412936d9 100644 --- a/src/Detector/Strategy/SuffixTree/Token.php +++ b/src/Detector/Strategy/SuffixTree/Token.php @@ -25,7 +25,7 @@ public function __construct( $this->file = $file; } - public function __toString() + public function __toString(): string { return $this->tokenName; } From 197cde3122dc985cb737b76fecdc02a8f6bb6dd9 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 24 Jun 2021 21:48:05 +0200 Subject: [PATCH 22/29] Psalm fixes, done --- .../Strategy/SuffixTree/AbstractToken.php | 2 +- .../ApproximateCloneDetectingSuffixTree.php | 52 ++----- .../Strategy/SuffixTree/CloneInfo.php | 2 +- src/Detector/Strategy/SuffixTree/PairList.php | 131 ++++++------------ src/Detector/Strategy/SuffixTree/Sentinel.php | 9 +- .../Strategy/SuffixTree/SuffixTree.php | 51 ++----- .../SuffixTree/SuffixTreeHashTable.php | 31 ++--- src/Detector/Strategy/SuffixTree/Token.php | 34 +---- src/Detector/Strategy/SuffixTreeStrategy.php | 1 - 9 files changed, 90 insertions(+), 223 deletions(-) diff --git a/src/Detector/Strategy/SuffixTree/AbstractToken.php b/src/Detector/Strategy/SuffixTree/AbstractToken.php index 23764564..b54f25a1 100644 --- a/src/Detector/Strategy/SuffixTree/AbstractToken.php +++ b/src/Detector/Strategy/SuffixTree/AbstractToken.php @@ -11,7 +11,7 @@ abstract 
class AbstractToken { - /** @var string */ + /** @var int */ public $tokenCode; /** @var int */ public $line; diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index 5cf18100..93b75c79 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -44,7 +44,7 @@ class ApproximateCloneDetectingSuffixTree extends SuffixTree /** * This map stores for each position the relevant clone infos. * - * @var array + * @var array */ private $cloneInfos = []; @@ -146,30 +146,21 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a $map = []; for ($index = 0; $index <= count($this->word); $index++) { - /** @var AbstractToken|null */ + /** @var CloneInfo[] */ $existingClones = $this->cloneInfos[$index] ?? null; - if ($existingClones !== null) { + if (!empty($existingClones)) { foreach ($existingClones as $ci) { // length = number of tokens // TODO: min token length if ($ci->length > $minLength) { - /** @var CloneInfo */ $previousCi = $map[$ci->token->line] ?? 
null; - if ($previousCi == null) { + if ($previousCi === null) { $map[$ci->token->line] = $ci; } elseif ($ci->length > $previousCi->length) { $map[$ci->token->line] = $ci; } - /** @var int[] */ - $others = $ci->otherClones->extractFirstList(); - - for ($j = 0; $j < count($others); $j++) { - $otherStart = $others[$j]; - /** @var AbstractToken */ - $t = $this->word[$otherStart]; - } } } } @@ -177,7 +168,7 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a /** @var CloneInfo[] */ $values = array_values($map); - usort($values, static function ($a, $b) { + usort($values, static function (CloneInfo $a, CloneInfo $b): int { return $b->length - $a->length; }); @@ -188,7 +179,7 @@ public function findClones(int $minLength, int $maxErrors, int $headEquality): a * This should return true, if the provided character is not allowed to * match with anything else (e.g. is a sentinel). */ - protected function mayNotMatch(AbstractToken $token) + protected function mayNotMatch(AbstractToken $token): bool { return $token instanceof Sentinel; } @@ -250,8 +241,7 @@ private function matchWord(int $wordStart, int $wordPosition, int $node, int $no $currentNodeWordLength = min($this->nodeWordEnd[$node] - $this->nodeWordBegin[$node], $this->MAX_LENGTH - 1); - // do min edit distance - /** @var int */ + // Do min edit distance $currentLength = $this->calculateMaxLength( $wordStart, $wordPosition, @@ -457,15 +447,14 @@ private function reportClone( int $nodeWordLength ): void { - /** @var int */ $length = $wordEnd - $wordBegin; if ($length < $this->minLength || $nodeWordLength < $this->minLength) { return; } - /** @var PairList */ - $otherClones = new PairList(); + // NB: 0 and 0 are two indicate the template S and T for Psalm, in lack of generics. 
+ $otherClones = new PairList(16, 0, 0); $this->findRemainingClones( $otherClones, $nodeWordLength, @@ -477,13 +466,10 @@ private function reportClone( $occurrences = 1 + $otherClones->size(); // check whether we may start from here - /** @var AbstractToken */ $t = $this->word[$wordBegin]; - /** @var CloneInfo */ $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; $index++) { - /** @var CloneInfo */ $existingClones = $this->cloneInfos[$index] ?? null; if ($existingClones != null) { @@ -501,20 +487,12 @@ private function reportClone( for ($i = $wordBegin; $i < $wordEnd; $i += $this->INDEX_SPREAD) { $this->cloneInfos[$i][] = new CloneInfo($length - ($i - $wordBegin), $wordBegin, $occurrences, $t, $otherClones); } - /** @var AbstractToken */ $t = $this->word[$wordBegin]; for ($clone = 0; $clone < $otherClones->size(); $clone++) { $start = $otherClones->getFirst($clone); $otherLength = $otherClones->getSecond($clone); - - for ($j = 0; $j < $otherLength; $j++) { - /** @var AbstractToken */ - $r = $this->word[$j + $start]; - } - for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { - //$this->cloneInfos.add($start + $i, new CloneInfo($otherLength - $i, $wordBegin, occurrences, $t, $otherClones)); $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); } } @@ -532,9 +510,7 @@ private function reportClone( */ private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) { - /** @var AbstractToken */ $iChar = $this->word[$iOffset + $i - 1]; - /** @var AbstractToken */ $jChar = $this->word[$jOffset + $j - 1]; $insertDelete = 1 + min($this->edBuffer[$i - 1][$j], $this->edBuffer[$i][$j - 1]); @@ -547,11 +523,11 @@ private function fillEDBuffer(int $i, int $j, int $iOffset, int $jOffset) * Fills a list of pairs giving the start positions and lengths of the * remaining clones. 
* - * @param array $clonePositions the clone positions being filled (start position and length) - * @param int $nodeWordLength the length of the word along the nodes - * @param int $currentNode the node we are currently at - * @param int $distance the distance along the word leading to the current node - * @param int $wordStart the start of the currently searched word + * @param PairList $clonePositions the clone positions being filled (start position and length) + * @param int $nodeWordLength the length of the word along the nodes + * @param int $currentNode the node we are currently at + * @param int $distance the distance along the word leading to the current node + * @param int $wordStart the start of the currently searched word */ private function findRemainingClones( PairList $clonePositions, diff --git a/src/Detector/Strategy/SuffixTree/CloneInfo.php b/src/Detector/Strategy/SuffixTree/CloneInfo.php index 687dfbd5..4187996c 100644 --- a/src/Detector/Strategy/SuffixTree/CloneInfo.php +++ b/src/Detector/Strategy/SuffixTree/CloneInfo.php @@ -46,7 +46,7 @@ class CloneInfo private $occurrences; /** Constructor. */ - public function __construct(int $length, int $position, int $occurrences, Token $token, PairList $otherClones) + public function __construct(int $length, int $position, int $occurrences, AbstractToken $token, PairList $otherClones) { $this->length = $length; $this->position = $position; diff --git a/src/Detector/Strategy/SuffixTree/PairList.php b/src/Detector/Strategy/SuffixTree/PairList.php index c4e45ee2..5051c384 100644 --- a/src/Detector/Strategy/SuffixTree/PairList.php +++ b/src/Detector/Strategy/SuffixTree/PairList.php @@ -9,6 +9,8 @@ */ namespace SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree; +use Exception; + /** * A list for storing pairs in a specific order. 
* @@ -16,6 +18,9 @@ * * @version $Rev: 51770 $ * @ConQAT.Rating GREEN Hash: 7459D6D0F59028B37DD23DD091BDCEEA + * + * @template T + * @template S */ class PairList { @@ -36,18 +41,22 @@ class PairList /** * The array used for storing the S. * - * @var object[] + * @var S[] */ private $firstElements; /** * The array used for storing the T. * - * @var object[] + * @var T[] */ private $secondElements; - public function __construct(int $initialCapacity = 16) + /** + * @param S $firstType + * @param T $secondType + */ + public function __construct(int $initialCapacity = 16, $firstType, $secondType) { if ($initialCapacity < 1) { $initialCapacity = 1; @@ -70,6 +79,9 @@ public function size(): int /** * Add the given pair to the list. + * + * @param S $first + * @param T $second */ public function add($first, $second): void { @@ -92,7 +104,11 @@ public function addAll(self $other): void } } - /** Returns the first element at given index. */ + /** + * Returns the first element at given index. + * + * @return S + */ public function getFirst(int $i) { $this->checkWithinBounds($i); @@ -100,14 +116,22 @@ public function getFirst(int $i) return $this->firstElements[$i]; } - /** Sets the first element at given index. */ + /** + * Sets the first element at given index. + * + * @param S $value + */ public function setFirst(int $i, $value): void { $this->checkWithinBounds($i); $this->firstElements[$i] = $value; } - /** Returns the second element at given index. */ + /** + * Returns the second element at given index. + * + * @return T + */ public function getSecond(int $i) { $this->checkWithinBounds($i); @@ -115,17 +139,23 @@ public function getSecond(int $i) return $this->secondElements[$i]; } - /** Sets the first element at given index. */ + /** + * Sets the first element at given index. + * @param T $value + */ public function setSecond(int $i, $value): void { $this->checkWithinBounds($i); $this->secondElements[$i] = $value; } - /** Creates a new list containing all first elements. 
*/ + /** + * Creates a new list containing all first elements. + * + * @return S[] + */ public function extractFirstList(): array { - //array $result = new ArrayList($this->size + 1); $result = []; for ($i = 0; $i < $this->size; $i++) { @@ -135,10 +165,13 @@ public function extractFirstList(): array return $result; } - /** Creates a new list containing all second elements. */ + /** + * Creates a new list containing all second elements. + * + * @return T[] + */ public function extractSecondList(): array { - //$result = new ArrayList($this->size + 1); $result = []; for ($i = 0; $i < $this->size; $i++) { @@ -148,17 +181,6 @@ public function extractSecondList(): array return $result; } - /** - * Swaps the pairs of this list. Is S and T are different types, this will - * be extremely dangerous. - */ - public function swapPairs(): void - { - $temp = $this->firstElements; - $this->firstElements = $this->secondElements; - $this->secondElements = $temp; - } - /** Swaps the entries located at indexes $i and $j. 
*/ public function swapEntries(int $i, int $j): void { @@ -182,26 +204,6 @@ public function removeLast(): void $this->size--; } - public function toString(): string - { - $result = ''; - $result += ('['); - - for ($i = 0; $i < $this->size; $i++) { - if ($i != 0) { - $result .= ','; - } - $result .= '('; - $result .= (string) $this->firstElements[$i]; - $result .= ','; - $result .= (string) $this->secondElements[$i]; - $result .= ')'; - } - $result .= ']'; - - return $result; - } - public function hashCode(): int { $prime = 31; @@ -211,49 +213,6 @@ public function hashCode(): int return $prime * $hash + crc32(serialize($this->secondElements)); } - public function equals(self $obj): bool - { - // TODO: Doesn't work in PHP - if ($this === $obj) { - return true; - } - - if (!($obj instanceof self)) { - return false; - } - - $other = $obj; - - if ($this->size !== $other->size) { - return false; - } - - for ($i = 0; $i < $this->size; $i++) { - if (!($this->firstElements[$i] == $other->firstElements[$i]) || - !($this->secondElements[$i] != $this->secondElements[$i])) { - return false; - } - } - - return true; - } - - /** Make sure there is space for at least the given amount of elements. */ - protected function ensureSpace(int $space): void - { - if ($space <= count($this->firstElements)) { - return; - } - - $oldFirst = $this->firstElements; - $oldSecond = $this->secondElements; - $newSize = count($this->firstElements) * 2; - - while ($newSize < $space) { - $newSize *= 2; - } - } - /** * Checks whether the given $i is within the bounds. Throws an * exception otherwise. @@ -261,7 +220,7 @@ protected function ensureSpace(int $space): void private function checkWithinBounds(int $i): void { if ($i < 0 || $i >= $this->size) { - throw new Exception('Out of bounds: ' + $i); + throw new Exception('Out of bounds: ' . 
$i); } } } diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index 04ea7232..3f0599cd 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -17,12 +17,17 @@ */ class Sentinel extends AbstractToken { - /** The hash value used. */ + /** @var int The hash value used. */ private $hash; public function __construct() { - $this->hash = (int) rand(0, PHP_INT_MAX); + $this->hash = rand(0, PHP_INT_MAX); + $this->tokenCode = -1; + $this->line = -1; + $this->file = ''; + $this->tokenName = ''; + $this->content = ''; } public function hashCode(): int diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php index 91898223..13c54339 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php @@ -42,17 +42,20 @@ class SuffixTree { /** * Infinity in this context. - * * @var int */ protected $INFTY; - /** The word we are working on. - * @var AbstractToken[] */ + /** + * The word we are working on. + * @var AbstractToken[] + */ protected $word; - /** The number of nodes created so far. - * @var int */ + /** + * The number of nodes created so far. + * @var int + */ protected $numNodes = 0; /** @@ -139,14 +142,16 @@ class SuffixTree * * @var int */ - private $explicitNode; + private $explicitNode = 0; /** * Create a new suffix tree from a given word. The word given as parameter * is used internally and should not be modified anymore, so copy it before * if required. + * + * @param AbstractToken[] $word */ - public function __construct(array $word) + public function __construct($word) { $this->word = $word; $size = count($word); @@ -166,36 +171,6 @@ public function __construct(array $word) } } - /** - * Returns whether the given word is contained in the string given at - * construction time. 
- * - * @return bool - */ - public function containsWord(array $find) - { - $node = 0; - $findSize = count($find); - - for ($i = 0; $i < $findSize;) { - /** @var int */ - $next = $this->nextNode->get($node, $find[$i]); - - if ($next < 0) { - return false; - } - - for ($j = $this->nodeWordBegin[$next]; $j < $this->nodeWordEnd[$next] && $i < $findSize; ++$i, ++$j) { - if (!$this->word[$j]->equals($find[$i])) { - return false; - } - } - $node = $next; - } - - return true; - } - /** * This method makes sure the child lists are filled (required for * traversing the tree). @@ -276,7 +251,6 @@ private function testAndSplit(int $refWordEnd, AbstractToken $nextCharacter): bo return true; } - /** @var int */ $next = $this->nextNode->get($this->currentNode, $this->word[$this->refWordBegin]); if ($nextCharacter->equals($this->word[$this->nodeWordBegin[$next] + $refWordEnd - $this->refWordBegin])) { @@ -317,7 +291,6 @@ private function canonize(int $refWordEnd): void return; } - /** @var int */ $next = $this->nextNode->get( $this->currentNode, $this->word[$this->refWordBegin] diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php index 7993fa24..588e7f8b 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php @@ -54,7 +54,7 @@ class SuffixTreeHashTable /** * Storage space for the character part of the key. * - * @var object[] + * @var array */ private $keyChars; @@ -146,19 +146,17 @@ public function put(int $keyNode, AbstractToken $keyChar, int $resultNode): void * The method is package visible, as it is tighly coupled to the * {@link SuffixTree} class. 
* - * @param int[] nodeFirstIndex an array giving for each node the index where the first child + * @param int[] $nodeFirstIndex an array giving for each node the index where the first child * will be stored (or -1 if it has no children) - * @param int[] nodeNextIndex this array gives the next index of the child list or -1 if + * @param int[] $nodeNextIndex this array gives the next index of the child list or -1 if * this is the last one - * @param int[] nodeChild this array stores the actual name (=number) of the mode in the + * @param int[] $nodeChild this array stores the actual name (=number) of the mode in the * child list - * - * @throws ArrayIndexOutOfBoundsException if any of the given arrays was too small */ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild): void { // Instead of Arrays.fill($nodeFirstIndex, -1); - foreach ($nodeFirstIndex as $k => $v) { + foreach (array_keys($nodeFirstIndex) as $k) { $nodeFirstIndex[$k] = -1; } $free = 0; @@ -176,17 +174,12 @@ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, /** * Returns the position of the (node,char) key in the hash map or the * position to insert it into if it is not yet in. - * - * @return int */ - private function hashFind(int $keyNode, AbstractToken $keyChar) + private function hashFind(int $keyNode, AbstractToken $keyChar): int { $this->_numFind++; - /** @var int */ $hash = $keyChar->hashCode(); - /** @var int */ $pos = $this->posMod($this->primaryHash($keyNode, $hash)); - /** @var int */ $secondary = $this->secondaryHash($keyNode, $hash); while ($this->keyChars[$pos] !== null) { @@ -202,20 +195,16 @@ private function hashFind(int $keyNode, AbstractToken $keyChar) /** * Returns the primary hash value for a (node, character) key pair. 
- * - * @return int */ - private function primaryHash(int $keyNode, int $keyCharHash) + private function primaryHash(int $keyNode, int $keyCharHash): int { return $keyCharHash ^ (13 * $keyNode); } /** * Returns the secondary hash value for a (node, character) key pair. - * - * @return int */ - private function secondaryHash(int $keyNode, int $keyCharHash) + private function secondaryHash(int $keyNode, int $keyCharHash): int { $result = $this->posMod(($keyCharHash ^ (1025 * $keyNode))); @@ -229,10 +218,8 @@ private function secondaryHash(int $keyNode, int $keyCharHash) /** * Returns the smallest non-negative number congruent to x modulo * {@link #tableSize}. - * - * @return int */ - private function posMod(int $x) + private function posMod(int $x): int { $x %= $this->tableSize; diff --git a/src/Detector/Strategy/SuffixTree/Token.php b/src/Detector/Strategy/SuffixTree/Token.php index 412936d9..d34f9074 100644 --- a/src/Detector/Strategy/SuffixTree/Token.php +++ b/src/Detector/Strategy/SuffixTree/Token.php @@ -32,39 +32,7 @@ public function __toString(): string public function hashCode(): int { - return (int) crc32($this->content); - - //static $cashedHashCode = null; - //if ($cashedHashCode !== null) { - //return $cashedHashCode; - //} - - // Code below mimics 32-bit integer. Probably not needed. - /* - $value = $this->content; - $hashCode = 0; - $offset= 0; - $limit = strlen($value) + $offset; - for ($i = $offset; $i < $limit; $i++) { - $hashCode = $hashCode * 31 + ord($value[$i]); - //if (is_float($hashCode)) { - //die('nooo'); - //} - // NB: Simulate 32-bit int. 
- // @see https://stackoverflow.com/questions/15557407/how-to-use-a-32bit-integer-on-a-64bit-installation-of-php - //$hashCode = $hashCode & 0xFFFFFFFF; - $hashCode = $hashCode & 0xFFFFFFFF; - if ($hashCode & 0x80000000) { - $hashCode = $hashCode & ~0x80000000; - $hashCode = -2147483648 + $hashCode; - } - - } - //$cashedHashCode = $hashCode; - return $hashCode; - */ - //return $this->content->hashCode(); - //return $tokenCode; + return crc32($this->content); } public function equals(AbstractToken $token): bool diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index aecb6a65..326926cd 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -69,7 +69,6 @@ public function postProcess(): void $this->word[] = new Sentinel(); $tree = new ApproximateCloneDetectingSuffixTree($this->word); - /** @var CloneInfo[] */ $cloneInfos = $tree->findClones( $this->config->getMinTokens(), $this->config->getEditDistance(), From 893dbfdca18285c68bfec1f3b59a16b35a911c7d Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Thu, 24 Jun 2021 21:49:34 +0200 Subject: [PATCH 23/29] Run cs-fix --- src/Detector/Detector.php | 1 - .../Strategy/SuffixTree/AbstractToken.php | 11 ++++- .../ApproximateCloneDetectingSuffixTree.php | 45 +++++++++---------- src/Detector/Strategy/SuffixTree/PairList.php | 3 +- src/Detector/Strategy/SuffixTree/Sentinel.php | 18 ++++---- .../Strategy/SuffixTree/SuffixTree.php | 3 ++ .../SuffixTree/SuffixTreeHashTable.php | 16 +++---- src/Detector/Strategy/SuffixTreeStrategy.php | 15 +++---- 8 files changed, 59 insertions(+), 53 deletions(-) diff --git a/src/Detector/Detector.php b/src/Detector/Detector.php index eb2a89d7..a9acbe40 100644 --- a/src/Detector/Detector.php +++ b/src/Detector/Detector.php @@ -11,7 +11,6 @@ use SebastianBergmann\PHPCPD\CodeCloneMap; use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; -use 
SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; final class Detector { diff --git a/src/Detector/Strategy/SuffixTree/AbstractToken.php b/src/Detector/Strategy/SuffixTree/AbstractToken.php index b54f25a1..96425032 100644 --- a/src/Detector/Strategy/SuffixTree/AbstractToken.php +++ b/src/Detector/Strategy/SuffixTree/AbstractToken.php @@ -13,15 +13,22 @@ abstract class AbstractToken { /** @var int */ public $tokenCode; + /** @var int */ public $line; + /** @var string */ public $file; + /** @var string */ public $tokenName; + /** @var string */ public $content; - abstract public function hashCode(): int; - abstract public function equals(AbstractToken $obj): bool; + abstract public function __toString(): string; + + abstract public function hashCode(): int; + + abstract public function equals(self $obj): bool; } diff --git a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php index 93b75c79..896fb4db 100644 --- a/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/ApproximateCloneDetectingSuffixTree.php @@ -105,6 +105,7 @@ public function __construct(array $word) * @param int $minLength the minimal length of a clone in tokens (not lines) * @param int $maxErrors the maximal number of errors/gaps allowed * @param int $headEquality the number of elements which have to be the same at the beginning of a clone + * * @return CloneInfo[] */ public function findClones(int $minLength, int $maxErrors, int $headEquality): array @@ -369,8 +370,7 @@ private function calculateMaxLength( int $node, int $maxErrors, int $currentNodeWordLength - ) - { + ) { $this->edBuffer[0][0] = 0; $currentLength = 1; @@ -397,11 +397,11 @@ private function calculateMaxLength( $best = min( $best, $this->fillEDBuffer( - $k, - $currentLength, - $wordPosition, - $this->nodeWordBegin[$node] - ) + $k, + $currentLength, + $wordPosition, 
+ $this->nodeWordBegin[$node] + ) ); } @@ -409,21 +409,21 @@ private function calculateMaxLength( $best = min( $best, $this->fillEDBuffer( - $currentLength, - $k, - $wordPosition, - $this->nodeWordBegin[$node] - ) + $currentLength, + $k, + $wordPosition, + $this->nodeWordBegin[$node] + ) ); } $best = min( $best, $this->fillEDBuffer( - $currentLength, - $currentLength, - $wordPosition, - $this->nodeWordBegin[$node] - ) + $currentLength, + $currentLength, + $wordPosition, + $this->nodeWordBegin[$node] + ) ); if ($best > $maxErrors || @@ -437,16 +437,13 @@ private function calculateMaxLength( return $currentLength; } - /** - */ private function reportClone( int $wordBegin, int $wordEnd, int $currentNode, int $nodeWordPos, int $nodeWordLength - ): void - { + ): void { $length = $wordEnd - $wordBegin; if ($length < $this->minLength || $nodeWordLength < $this->minLength) { @@ -466,7 +463,7 @@ private function reportClone( $occurrences = 1 + $otherClones->size(); // check whether we may start from here - $t = $this->word[$wordBegin]; + $t = $this->word[$wordBegin]; $newInfo = new CloneInfo($length, $wordBegin, $occurrences, $t, $otherClones); for ($index = max(0, $wordBegin - $this->INDEX_SPREAD + 1); $index <= $wordBegin; $index++) { @@ -492,6 +489,7 @@ private function reportClone( for ($clone = 0; $clone < $otherClones->size(); $clone++) { $start = $otherClones->getFirst($clone); $otherLength = $otherClones->getSecond($clone); + for ($i = 0; $i < $otherLength; $i += $this->INDEX_SPREAD) { $this->cloneInfos[$start + $i][] = new CloneInfo($otherLength - $i, $wordBegin, $occurrences, $t, $otherClones); } @@ -535,8 +533,7 @@ private function findRemainingClones( int $currentNode, int $distance, int $wordStart - ): void - { + ): void { for ($nextNode = $this->nodeChildFirst[$currentNode]; $nextNode >= 0; $nextNode = $this->nodeChildNext[$nextNode]) { $node = $this->nodeChildNode[$nextNode]; $this->findRemainingClones($clonePositions, $nodeWordLength, $node, $distance diff 
--git a/src/Detector/Strategy/SuffixTree/PairList.php b/src/Detector/Strategy/SuffixTree/PairList.php index 5051c384..c0b851ec 100644 --- a/src/Detector/Strategy/SuffixTree/PairList.php +++ b/src/Detector/Strategy/SuffixTree/PairList.php @@ -56,7 +56,7 @@ class PairList * @param S $firstType * @param T $secondType */ - public function __construct(int $initialCapacity = 16, $firstType, $secondType) + public function __construct(int $initialCapacity, $firstType, $secondType) { if ($initialCapacity < 1) { $initialCapacity = 1; @@ -141,6 +141,7 @@ public function getSecond(int $i) /** * Sets the first element at given index. + * * @param T $value */ public function setSecond(int $i, $value): void diff --git a/src/Detector/Strategy/SuffixTree/Sentinel.php b/src/Detector/Strategy/SuffixTree/Sentinel.php index 3f0599cd..ad241485 100644 --- a/src/Detector/Strategy/SuffixTree/Sentinel.php +++ b/src/Detector/Strategy/SuffixTree/Sentinel.php @@ -22,12 +22,17 @@ class Sentinel extends AbstractToken public function __construct() { - $this->hash = rand(0, PHP_INT_MAX); + $this->hash = rand(0, PHP_INT_MAX); $this->tokenCode = -1; - $this->line = -1; - $this->file = ''; + $this->line = -1; + $this->file = ''; $this->tokenName = ''; - $this->content = ''; + $this->content = ''; + } + + public function __toString(): string + { + return '$'; } public function hashCode(): int @@ -40,9 +45,4 @@ public function equals(AbstractToken $obj): bool // Original code uses physical object equality, not present in PHP. return $obj instanceof self; } - - public function __toString(): string - { - return '$'; - } } diff --git a/src/Detector/Strategy/SuffixTree/SuffixTree.php b/src/Detector/Strategy/SuffixTree/SuffixTree.php index 13c54339..73085e13 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTree.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTree.php @@ -42,18 +42,21 @@ class SuffixTree { /** * Infinity in this context. 
+ * * @var int */ protected $INFTY; /** * The word we are working on. + * * @var AbstractToken[] */ protected $word; /** * The number of nodes created so far. + * * @var int */ protected $numNodes = 0; diff --git a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php index 588e7f8b..4bf807ee 100644 --- a/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php +++ b/src/Detector/Strategy/SuffixTree/SuffixTreeHashTable.php @@ -54,7 +54,7 @@ class SuffixTreeHashTable /** * Storage space for the character part of the key. * - * @var array + * @var array */ private $keyChars; @@ -147,11 +147,11 @@ public function put(int $keyNode, AbstractToken $keyChar, int $resultNode): void * {@link SuffixTree} class. * * @param int[] $nodeFirstIndex an array giving for each node the index where the first child - * will be stored (or -1 if it has no children) - * @param int[] $nodeNextIndex this array gives the next index of the child list or -1 if - * this is the last one - * @param int[] $nodeChild this array stores the actual name (=number) of the mode in the - * child list + * will be stored (or -1 if it has no children) + * @param int[] $nodeNextIndex this array gives the next index of the child list or -1 if + * this is the last one + * @param int[] $nodeChild this array stores the actual name (=number) of the mode in the + * child list */ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, array &$nodeChild): void { @@ -178,8 +178,8 @@ public function extractChildLists(array &$nodeFirstIndex, array &$nodeNextIndex, private function hashFind(int $keyNode, AbstractToken $keyChar): int { $this->_numFind++; - $hash = $keyChar->hashCode(); - $pos = $this->posMod($this->primaryHash($keyNode, $hash)); + $hash = $keyChar->hashCode(); + $pos = $this->posMod($this->primaryHash($keyNode, $hash)); $secondary = $this->secondaryHash($keyNode, $hash); while ($this->keyChars[$pos] !== null) { diff 
--git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 326926cd..614cee14 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -17,11 +17,10 @@ use SebastianBergmann\PHPCPD\CodeClone; use SebastianBergmann\PHPCPD\CodeCloneFile; use SebastianBergmann\PHPCPD\CodeCloneMap; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\AbstractToken; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token; final class SuffixTreeStrategy extends AbstractStrategy { @@ -37,8 +36,8 @@ final class SuffixTreeStrategy extends AbstractStrategy public function processFile(string $file, CodeCloneMap $result): void { - $content = file_get_contents($file); - $tokens = token_get_all($content); + $content = file_get_contents($file); + $tokens = token_get_all($content); foreach (array_keys($tokens) as $key) { $token = $tokens[$key]; @@ -68,7 +67,7 @@ public function postProcess(): void // Sentinel = End of word $this->word[] = new Sentinel(); - $tree = new ApproximateCloneDetectingSuffixTree($this->word); + $tree = new ApproximateCloneDetectingSuffixTree($this->word); $cloneInfos = $tree->findClones( $this->config->getMinTokens(), $this->config->getEditDistance(), @@ -81,8 +80,8 @@ public function postProcess(): void for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; - $t = $this->word[$otherStart]; - $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 1]; + $t = $this->word[$otherStart]; + $lastToken = $this->word[$cloneInfo->position + 
$cloneInfo->length - 1]; // If we stumbled upon the Sentinel, rewind one step. if ($lastToken instanceof Sentinel) { $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 2]; From 75d1e22f2bf6042cce2579b8ef994aaa325701fd Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Sat, 26 Jun 2021 12:28:06 +0200 Subject: [PATCH 24/29] Phpunit (WIP) --- src/Detector/Strategy/AbstractStrategy.php | 5 + src/Detector/Strategy/SuffixTreeStrategy.php | 2 +- tests/unit/DetectorTest.php | 115 +++++++++++-------- 3 files changed, 72 insertions(+), 50 deletions(-) diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index 1d30cc20..bd4e923a 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -44,6 +44,11 @@ public function __construct(StrategyConfiguration $config) $this->config = $config; } + public function setConfig(StrategyConfiguration $config) + { + $this->config = $config; + } + abstract public function processFile(string $file, CodeCloneMap $result): void; public function postProcess(): void diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 614cee14..839c277a 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -81,7 +81,7 @@ public function postProcess(): void for ($j = 0; $j < count($others); $j++) { $otherStart = $others[$j]; $t = $this->word[$otherStart]; - $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 1]; + $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length]; // If we stumbled upon the Sentinel, rewind one step. 
if ($lastToken instanceof Sentinel) { $lastToken = $this->word[$cloneInfo->position + $cloneInfo->length - 2]; diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index c7d61813..816151ce 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -13,7 +13,11 @@ use function next; use function sort; use PHPUnit\Framework\TestCase; +use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; +use SebastianBergmann\PHPCPD\ArgumentsBuilder; /** * @covers \SebastianBergmann\PHPCPD\Detector\Detector @@ -28,11 +32,11 @@ final class DetectorTest extends TestCase /** * @dataProvider strategyProvider * - * @psalm-param class-string $strategy + * @psalm-param AbstractStrategy $strategy */ - public function testDetectingSimpleClonesWorks(string $strategy): void + public function testDetectingSimpleClonesWorks(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $clones = (new Detector($strategy))->copyPasteDetection( [__DIR__ . '/../fixture/Math.php'] ); @@ -117,18 +121,19 @@ public function testDetectingSimpleClonesWorks(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testDetectingExactDuplicateFilesWorks(string $strategy): void + public function testDetectingExactDuplicateFilesWorks(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '50']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . 
'/../fixture/b.php', - ], - 20, - 60 + ] ); $clones = $clones->clones(); @@ -149,22 +154,24 @@ public function testDetectingExactDuplicateFilesWorks(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void + public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . '/../fixture/b.php', __DIR__ . '/../fixture/c.php', - ], - 20, - 60 + ] ); $clones = $clones->clones(); + //var_dump($clones); $files = $clones[0]->files(); sort($files); @@ -187,18 +194,18 @@ public function testDetectingClonesInMoreThanTwoFiles(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $strategy): void + public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '61']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . 
'/../fixture/b.php', - ], - 20, - 61 + ] ); $this->assertCount(0, $clones->clones()); @@ -206,18 +213,18 @@ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(string $st /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $strategy): void + public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '21', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . '/../fixture/b.php', - ], - 21, - 60 + ] ); $this->assertCount(0, $clones->clones()); @@ -225,19 +232,18 @@ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(string $stra /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testFuzzyClonesAreFound(string $strategy): void + public function testFuzzyClonesAreFound(AbstractStrategy $strategy): void { - $clones = (new Detector(new $strategy))->copyPasteDetection( + $argv = [1 => '.', '--min-lines', '5', '--min-tokens', '20', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/a.php', __DIR__ . 
'/../fixture/d.php', - ], - 5, - 20, - true + ] ); $this->assertCount(1, $clones->clones()); @@ -245,25 +251,30 @@ public function testFuzzyClonesAreFound(string $strategy): void /** * @dataProvider strategyProvider - * - * @psalm-param class-string $strategy */ - public function testStripComments(string $strategy): void + public function testStripComments(AbstractStrategy $strategy): void { - $detector = new Detector(new $strategy); + $argv = [1 => '.', '--min-lines', '8', '--min-tokens', '10', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + + $detector = new Detector($strategy); $clones = $detector->copyPasteDetection( [ __DIR__ . '/../fixture/e.php', __DIR__ . '/../fixture/f.php', - ], - 8, - 10, - true + ] ); $this->assertCount(0, $clones->clones()); + $argv = [1 => '.', '--min-lines', '7', '--min-tokens', '10', '--fuzzy', 'true']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy->setConfig($config); + $clones = $detector->copyPasteDetection( [ __DIR__ . '/../fixture/e.php', @@ -278,12 +289,18 @@ public function testStripComments(string $strategy): void } /** - * @psalm-return list + * @psalm-return list */ public function strategyProvider(): array { + // Build default config. 
+ $argv = [1 => '.']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + return [ - [DefaultStrategy::class], + //[new DefaultStrategy($config)], + [new SuffixTreeStrategy($config)] ]; } } From 873a934ad8023328af5f7e37d062cd81bbc17cc5 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Sat, 26 Jun 2021 12:35:03 +0200 Subject: [PATCH 25/29] Psalm + cs fix --- src/Detector/Strategy/AbstractStrategy.php | 2 +- tests/unit/DetectorTest.php | 40 +++++++++++----------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/Detector/Strategy/AbstractStrategy.php b/src/Detector/Strategy/AbstractStrategy.php index bd4e923a..ff6bb4d9 100644 --- a/src/Detector/Strategy/AbstractStrategy.php +++ b/src/Detector/Strategy/AbstractStrategy.php @@ -44,7 +44,7 @@ public function __construct(StrategyConfiguration $config) $this->config = $config; } - public function setConfig(StrategyConfiguration $config) + public function setConfig(StrategyConfiguration $config): void { $this->config = $config; } diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index 816151ce..3bc012c7 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -13,11 +13,11 @@ use function next; use function sort; use PHPUnit\Framework\TestCase; +use SebastianBergmann\PHPCPD\ArgumentsBuilder; use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; -use SebastianBergmann\PHPCPD\ArgumentsBuilder; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; /** * @covers \SebastianBergmann\PHPCPD\Detector\Detector @@ -124,9 +124,9 @@ public function testDetectingSimpleClonesWorks(AbstractStrategy $strategy): void */ public function testDetectingExactDuplicateFilesWorks(AbstractStrategy $strategy): 
void { - $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '50']; + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '50']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = (new Detector($strategy))->copyPasteDetection( @@ -157,9 +157,9 @@ public function testDetectingExactDuplicateFilesWorks(AbstractStrategy $strategy */ public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy): void { - $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '60']; + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '60']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = (new Detector($strategy))->copyPasteDetection( @@ -172,7 +172,7 @@ public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy $clones = $clones->clones(); //var_dump($clones); - $files = $clones[0]->files(); + $files = $clones[0]->files(); sort($files); $file = current($files); @@ -197,9 +197,9 @@ public function testDetectingClonesInMoreThanTwoFiles(AbstractStrategy $strategy */ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(AbstractStrategy $strategy): void { - $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '61']; + $argv = [1 => '.', '--min-lines', '20', '--min-tokens', '61']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = (new Detector($strategy))->copyPasteDetection( [ @@ -216,9 +216,9 @@ public function testClonesAreIgnoredIfTheySpanLessTokensThanMinTokens(AbstractSt */ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(AbstractStrategy $strategy): void { - $argv = 
[1 => '.', '--min-lines', '21', '--min-tokens', '60']; + $argv = [1 => '.', '--min-lines', '21', '--min-tokens', '60']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = (new Detector($strategy))->copyPasteDetection( [ @@ -235,9 +235,9 @@ public function testClonesAreIgnoredIfTheySpanLessLinesThanMinLines(AbstractStra */ public function testFuzzyClonesAreFound(AbstractStrategy $strategy): void { - $argv = [1 => '.', '--min-lines', '5', '--min-tokens', '20', '--fuzzy', 'true']; + $argv = [1 => '.', '--min-lines', '5', '--min-tokens', '20', '--fuzzy', 'true']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = (new Detector($strategy))->copyPasteDetection( [ @@ -254,9 +254,9 @@ public function testFuzzyClonesAreFound(AbstractStrategy $strategy): void */ public function testStripComments(AbstractStrategy $strategy): void { - $argv = [1 => '.', '--min-lines', '8', '--min-tokens', '10', '--fuzzy', 'true']; + $argv = [1 => '.', '--min-lines', '8', '--min-tokens', '10', '--fuzzy', 'true']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $detector = new Detector($strategy); @@ -270,9 +270,9 @@ public function testStripComments(AbstractStrategy $strategy): void $this->assertCount(0, $clones->clones()); - $argv = [1 => '.', '--min-lines', '7', '--min-tokens', '10', '--fuzzy', 'true']; + $argv = [1 => '.', '--min-lines', '7', '--min-tokens', '10', '--fuzzy', 'true']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); $strategy->setConfig($config); $clones = 
$detector->copyPasteDetection( @@ -294,13 +294,13 @@ public function testStripComments(AbstractStrategy $strategy): void public function strategyProvider(): array { // Build default config. - $argv = [1 => '.']; + $argv = [1 => '.']; $arguments = (new ArgumentsBuilder)->build($argv); - $config = new StrategyConfiguration($arguments); + $config = new StrategyConfiguration($arguments); return [ //[new DefaultStrategy($config)], - [new SuffixTreeStrategy($config)] + [new SuffixTreeStrategy($config)], ]; } } From 44b9df9a48e2a88215e2bc8600e2825b612566f6 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Sat, 26 Jun 2021 14:21:21 +0200 Subject: [PATCH 26/29] Some small notes --- src/Detector/Strategy/SuffixTreeStrategy.php | 3 + tests/fixture/editdistance1.php | 27 ++++++++ tests/fixture/editdistance2.php | 24 +++++++ tests/unit/DetectorTest.php | 3 +- tests/unit/EditDistanceTest.php | 68 ++++++++++++++++++++ 5 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 tests/fixture/editdistance1.php create mode 100644 tests/fixture/editdistance2.php create mode 100644 tests/unit/EditDistanceTest.php diff --git a/src/Detector/Strategy/SuffixTreeStrategy.php b/src/Detector/Strategy/SuffixTreeStrategy.php index 839c277a..73528e9c 100644 --- a/src/Detector/Strategy/SuffixTreeStrategy.php +++ b/src/Detector/Strategy/SuffixTreeStrategy.php @@ -22,6 +22,9 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token; +/** + * For the design of the algorithm, all credits go to the authors of "Do Code Clones Matter?". + */ final class SuffixTreeStrategy extends AbstractStrategy { /** diff --git a/tests/fixture/editdistance1.php b/tests/fixture/editdistance1.php new file mode 100644 index 00000000..61a13c3a --- /dev/null +++ b/tests/fixture/editdistance1.php @@ -0,0 +1,27 @@ +question_l10ns->rows->row)) { + // Edit difference here. 
+ if ($bTranslateLinksFields) { + $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']); + $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']); + } + $oQuestionL10n = new QuestionL10n(); + $oQuestionL10n->question = $insertdata['question']; + $oQuestionL10n->help = $insertdata['help']; + $oQuestionL10n->language = $insertdata['language']; + unset($insertdata['question']); + unset($insertdata['help']); + unset($insertdata['language']); +} + +// For some reason, two exact files will lead to one 0-line clone. +$a = 10; diff --git a/tests/fixture/editdistance2.php b/tests/fixture/editdistance2.php new file mode 100644 index 00000000..14b44676 --- /dev/null +++ b/tests/fixture/editdistance2.php @@ -0,0 +1,24 @@ +question_l10ns->rows->row)) { + // Edit difference here. + if ($options['translinkfields']) { + $insertdata['question'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['question']); + $insertdata['help'] = translateLinks('survey', $iOldSID, $iNewSID, $insertdata['help']); + } + $oQuestionL10n = new QuestionL10n(); + $oQuestionL10n->question = $insertdata['question']; + $oQuestionL10n->help = $insertdata['help']; + $oQuestionL10n->language = $insertdata['language']; + unset($insertdata['question']); + unset($insertdata['help']); + unset($insertdata['language']); +} + +foo(); diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index 3bc012c7..1d744974 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -299,8 +299,7 @@ public function strategyProvider(): array $config = new StrategyConfiguration($arguments); return [ - //[new DefaultStrategy($config)], - [new SuffixTreeStrategy($config)], + [new DefaultStrategy($config)] ]; } } diff --git a/tests/unit/EditDistanceTest.php b/tests/unit/EditDistanceTest.php new file mode 100644 index 00000000..c3548497 --- /dev/null +++ b/tests/unit/EditDistanceTest.php @@ -0,0 +1,68 @@ + + * + * For 
the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ +namespace SebastianBergmann\PHPCPD\Detector; + +use function current; +use function next; +use function sort; +use PHPUnit\Framework\TestCase; +use SebastianBergmann\PHPCPD\ArgumentsBuilder; +use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; +use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; +use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; + +/** + * @covers \SebastianBergmann\PHPCPD\Detector\Detector + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy + * + * @uses \SebastianBergmann\PHPCPD\CodeClone + * @uses \SebastianBergmann\PHPCPD\CodeCloneFile + * @uses \SebastianBergmann\PHPCPD\CodeCloneMap + */ +final class EditDistanceTest extends TestCase +{ + public function testEditDistanceWithSuffixtree(): void + { + $argv = [1 => '.', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy = new SuffixTreeStrategy($config); + + $clones = (new Detector($strategy))->copyPasteDetection( + [ + __DIR__ . '/../fixture/editdistance1.php', + __DIR__ . '/../fixture/editdistance2.php' + ], + ); + + $clones = $clones->clones(); + $this->assertCount(1, $clones); + } + + public function testEditDistanceWithRabinkarp(): void + { + $argv = [1 => '.', '--min-tokens', '60']; + $arguments = (new ArgumentsBuilder)->build($argv); + $config = new StrategyConfiguration($arguments); + $strategy = new DefaultStrategy($config); + + $clones = (new Detector($strategy))->copyPasteDetection( + [ + __DIR__ . '/../fixture/editdistance1.php', + __DIR__ . 
'/../fixture/editdistance2.php' + ], + ); + + $clones = $clones->clones(); + $this->assertCount(0, $clones); + } +} From bbb32038c44d6a7a1e93cdc9d70228a0999cf354 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Sat, 26 Jun 2021 14:22:51 +0200 Subject: [PATCH 27/29] Apply cs fix --- tests/unit/DetectorTest.php | 3 +-- tests/unit/EditDistanceTest.php | 12 ++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index 1d744974..7189dc2f 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -17,7 +17,6 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; -use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; /** * @covers \SebastianBergmann\PHPCPD\Detector\Detector @@ -299,7 +298,7 @@ public function strategyProvider(): array $config = new StrategyConfiguration($arguments); return [ - [new DefaultStrategy($config)] + [new DefaultStrategy($config)], ]; } } diff --git a/tests/unit/EditDistanceTest.php b/tests/unit/EditDistanceTest.php index c3548497..0107914e 100644 --- a/tests/unit/EditDistanceTest.php +++ b/tests/unit/EditDistanceTest.php @@ -9,12 +9,8 @@ */ namespace SebastianBergmann\PHPCPD\Detector; -use function current; -use function next; -use function sort; use PHPUnit\Framework\TestCase; use SebastianBergmann\PHPCPD\ArgumentsBuilder; -use SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy; use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; @@ -35,12 +31,12 @@ public function testEditDistanceWithSuffixtree(): void $argv = [1 => '.', '--min-tokens', '60']; $arguments = (new ArgumentsBuilder)->build($argv); $config = new 
StrategyConfiguration($arguments); - $strategy = new SuffixTreeStrategy($config); + $strategy = new SuffixTreeStrategy($config); $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/editdistance1.php', - __DIR__ . '/../fixture/editdistance2.php' + __DIR__ . '/../fixture/editdistance2.php', ], ); @@ -53,12 +49,12 @@ public function testEditDistanceWithRabinkarp(): void $argv = [1 => '.', '--min-tokens', '60']; $arguments = (new ArgumentsBuilder)->build($argv); $config = new StrategyConfiguration($arguments); - $strategy = new DefaultStrategy($config); + $strategy = new DefaultStrategy($config); $clones = (new Detector($strategy))->copyPasteDetection( [ __DIR__ . '/../fixture/editdistance1.php', - __DIR__ . '/../fixture/editdistance2.php' + __DIR__ . '/../fixture/editdistance2.php', ], ); From 33a3b95746ac3f17c1e10353a23d8c8c3e8985d3 Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Wed, 30 Jun 2021 21:59:13 +0200 Subject: [PATCH 28/29] Add cover annotations to tests --- tests/unit/DetectorTest.php | 4 ++++ tests/unit/EditDistanceTest.php | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index 7189dc2f..22a3a6e2 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -21,6 +21,10 @@ /** * @covers \SebastianBergmann\PHPCPD\Detector\Detector * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * * @uses \SebastianBergmann\PHPCPD\CodeClone * @uses \SebastianBergmann\PHPCPD\CodeCloneFile diff --git a/tests/unit/EditDistanceTest.php b/tests/unit/EditDistanceTest.php index 0107914e..e80a2b6c 100644 --- a/tests/unit/EditDistanceTest.php +++ b/tests/unit/EditDistanceTest.php @@ -19,6 
+19,17 @@ * @covers \SebastianBergmann\PHPCPD\Detector\Detector * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PairList + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Sentinel + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTree + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTreeHashTable + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * * @uses \SebastianBergmann\PHPCPD\CodeClone * @uses \SebastianBergmann\PHPCPD\CodeCloneFile From 6231b93ca271b808891e261c78d9948139d1ef2d Mon Sep 17 00:00:00 2001 From: Olle Haerstedt Date: Tue, 20 Jul 2021 16:45:53 +0200 Subject: [PATCH 29/29] Apply cs fix --- tests/unit/DetectorTest.php | 6 +++--- tests/unit/EditDistanceTest.php | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/DetectorTest.php b/tests/unit/DetectorTest.php index 22a3a6e2..fefcb0f8 100644 --- a/tests/unit/DetectorTest.php +++ b/tests/unit/DetectorTest.php @@ -19,12 +19,12 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration; /** + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * @covers \SebastianBergmann\PHPCPD\Detector\Detector + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy * 
@covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration - * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy - * @covers \SebastianBergmann\PHPCPD\Arguments - * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * * @uses \SebastianBergmann\PHPCPD\CodeClone * @uses \SebastianBergmann\PHPCPD\CodeCloneFile diff --git a/tests/unit/EditDistanceTest.php b/tests/unit/EditDistanceTest.php index e80a2b6c..51fbf14b 100644 --- a/tests/unit/EditDistanceTest.php +++ b/tests/unit/EditDistanceTest.php @@ -16,11 +16,12 @@ use SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy; /** + * @covers \SebastianBergmann\PHPCPD\Arguments + * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder * @covers \SebastianBergmann\PHPCPD\Detector\Detector + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\DefaultStrategy - * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\StrategyConfiguration - * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\AbstractStrategy * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\ApproximateCloneDetectingSuffixTree * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\CloneInfo * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\PairList @@ -28,8 +29,7 @@ * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTree * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\SuffixTreeHashTable * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTree\Token - * @covers \SebastianBergmann\PHPCPD\Arguments - * @covers \SebastianBergmann\PHPCPD\ArgumentsBuilder + * @covers \SebastianBergmann\PHPCPD\Detector\Strategy\SuffixTreeStrategy * * @uses \SebastianBergmann\PHPCPD\CodeClone * @uses \SebastianBergmann\PHPCPD\CodeCloneFile