Skip to content

Commit

Permalink
Improved tokenizing of context sensitive keywords
Browse files (browse the repository at this point in the history)
  • Loading branch information
kukulich committed Dec 21, 2021
1 parent 67c82d9 commit 5eebcf0
Show file tree
Hide file tree
Showing 5 changed files with 872 additions and 149 deletions.
6 changes: 6 additions & 0 deletions package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
<file baseinstalldir="" name="BackfillNumericSeparatorTest.php" role="test" />
<file baseinstalldir="" name="BitwiseOrTest.inc" role="test" />
<file baseinstalldir="" name="BitwiseOrTest.php" role="test" />
<file baseinstalldir="" name="ContextSensitiveKeywordsTest.inc" role="test" />
<file baseinstalldir="" name="ContextSensitiveKeywordsTest.php" role="test" />
<file baseinstalldir="" name="DefaultKeywordTest.inc" role="test" />
<file baseinstalldir="" name="DefaultKeywordTest.php" role="test" />
<file baseinstalldir="" name="DoubleArrowTest.inc" role="test" />
Expand Down Expand Up @@ -2097,6 +2099,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
<install as="CodeSniffer/Core/Tokenizer/BackfillNumericSeparatorTest.inc" name="tests/Core/Tokenizer/BackfillNumericSeparatorTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.php" name="tests/Core/Tokenizer/BitwiseOrTest.php" />
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.inc" name="tests/Core/Tokenizer/BitwiseOrTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.php" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.php" />
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.php" name="tests/Core/Tokenizer/DefaultKeywordTest.php" />
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.inc" name="tests/Core/Tokenizer/DefaultKeywordTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/DoubleArrowTest.php" name="tests/Core/Tokenizer/DoubleArrowTest.php" />
Expand Down Expand Up @@ -2191,6 +2195,8 @@ http://pear.php.net/dtd/package-2.0.xsd">
<install as="CodeSniffer/Core/Tokenizer/BackfillNumericSeparatorTest.inc" name="tests/Core/Tokenizer/BackfillNumericSeparatorTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.php" name="tests/Core/Tokenizer/BitwiseOrTest.php" />
<install as="CodeSniffer/Core/Tokenizer/BitwiseOrTest.inc" name="tests/Core/Tokenizer/BitwiseOrTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.php" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.php" />
<install as="CodeSniffer/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" name="tests/Core/Tokenizer/ContextSensitiveKeywordsTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.php" name="tests/Core/Tokenizer/DefaultKeywordTest.php" />
<install as="CodeSniffer/Core/Tokenizer/DefaultKeywordTest.inc" name="tests/Core/Tokenizer/DefaultKeywordTest.inc" />
<install as="CodeSniffer/Core/Tokenizer/DoubleArrowTest.php" name="tests/Core/Tokenizer/DoubleArrowTest.php" />
Expand Down
265 changes: 116 additions & 149 deletions src/Tokenizers/PHP.php
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,64 @@ protected function tokenize($string)
echo PHP_EOL;
}

/*
Tokenize context sensitive keyword as string when it should be string.
*/

if ($tokenIsArray === true
&& isset(Util\Tokens::$contextSensitiveKeywords[$token[0]]) === true
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true
) {
$preserveKeyword = false;

// `new class` should be preserved
if ($token[0] === T_CLASS && $finalTokens[$lastNotEmptyToken]['code'] === T_NEW) {
$preserveKeyword = true;
}

// `new class extends` `new class implements` should be preserved
if (($token[0] === T_EXTENDS || $token[0] === T_IMPLEMENTS)
&& $finalTokens[$lastNotEmptyToken]['code'] === T_CLASS
) {
$preserveKeyword = true;
}

// `namespace\` should be preserved
if ($token[0] === T_NAMESPACE) {
for ($i = ($stackPtr + 1); $i < $numTokens; $i++) {
if (is_array($tokens[$i]) === false) {
break;
}

if (isset(Util\Tokens::$emptyTokens[$tokens[$i][0]]) === true) {
continue;
}

if ($tokens[$i][0] === T_NS_SEPARATOR) {
$preserveKeyword = true;
}

break;
}
}

if ($preserveKeyword === false) {
if (PHP_CODESNIFFER_VERBOSITY > 1) {
$type = Util\Tokens::tokenName($token[0]);
echo "\t\t* token $stackPtr changed from $type to T_STRING".PHP_EOL;
}

$finalTokens[$newStackPtr] = [
'code' => T_STRING,
'type' => 'T_STRING',
'content' => $token[1],
];

$newStackPtr++;
continue;
}
}//end if

/*
Parse doc blocks into something that can be easily iterated over.
*/
Expand Down Expand Up @@ -1113,6 +1171,7 @@ protected function tokenize($string)
&& $tokenIsArray === true
&& $token[0] === T_STRING
&& strtolower($token[1]) === 'yield'
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false
) {
if (isset($tokens[($stackPtr + 1)]) === true
&& isset($tokens[($stackPtr + 2)]) === true
Expand Down Expand Up @@ -1446,57 +1505,42 @@ protected function tokenize($string)

if ($tokenIsArray === true
&& $token[0] === T_DEFAULT
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false
) {
if (isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === false) {
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
if ($tokens[$x] === ',') {
// Skip over potential trailing comma (supported in PHP).
continue;
}

if (is_array($tokens[$x]) === false
|| isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
) {
// Non-empty, non-comma content.
break;
}
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
if ($tokens[$x] === ',') {
// Skip over potential trailing comma (supported in PHP).
continue;
}

if (isset($tokens[$x]) === true
&& is_array($tokens[$x]) === true
&& $tokens[$x][0] === T_DOUBLE_ARROW
if (is_array($tokens[$x]) === false
|| isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
) {
// Modify the original token stack for the double arrow so that
// future checks can disregard the double arrow token more easily.
// For match expression "case" statements, this is handled
// in PHP::processAdditional().
$tokens[$x][0] = T_MATCH_ARROW;
if (PHP_CODESNIFFER_VERBOSITY > 1) {
echo "\t\t* token $x changed from T_DOUBLE_ARROW to T_MATCH_ARROW".PHP_EOL;
}

$newToken = [];
$newToken['code'] = T_MATCH_DEFAULT;
$newToken['type'] = 'T_MATCH_DEFAULT';
$newToken['content'] = $token[1];
// Non-empty, non-comma content.
break;
}
}

if (PHP_CODESNIFFER_VERBOSITY > 1) {
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_MATCH_DEFAULT".PHP_EOL;
}
if (isset($tokens[$x]) === true
&& is_array($tokens[$x]) === true
&& $tokens[$x][0] === T_DOUBLE_ARROW
) {
// Modify the original token stack for the double arrow so that
// future checks can disregard the double arrow token more easily.
// For match expression "case" statements, this is handled
// in PHP::processAdditional().
$tokens[$x][0] = T_MATCH_ARROW;
if (PHP_CODESNIFFER_VERBOSITY > 1) {
echo "\t\t* token $x changed from T_DOUBLE_ARROW to T_MATCH_ARROW".PHP_EOL;
}

$finalTokens[$newStackPtr] = $newToken;
$newStackPtr++;
continue;
}//end if
} else {
// Definitely not the "default" keyword.
$newToken = [];
$newToken['code'] = T_STRING;
$newToken['type'] = 'T_STRING';
$newToken['code'] = T_MATCH_DEFAULT;
$newToken['type'] = 'T_MATCH_DEFAULT';
$newToken['content'] = $token[1];

if (PHP_CODESNIFFER_VERBOSITY > 1) {
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_STRING".PHP_EOL;
echo "\t\t* token $stackPtr changed from T_DEFAULT to T_MATCH_DEFAULT".PHP_EOL;
}

$finalTokens[$newStackPtr] = $newToken;
Expand Down Expand Up @@ -1693,52 +1737,16 @@ protected function tokenize($string)
}

/*
The string-like token after a function keyword should always be
tokenized as T_STRING even if it appears to be a different token,
such as when writing code like: function default(): foo
so go forward and change the token type before it is processed.
Note: this should not be done for `function Level\Name` within a
group use statement for the PHP 8 identifier name tokens as it
would interfere with the re-tokenization of those.
This is a special condition for T_ARRAY tokens used for
function return types. We want to keep the parenthesis map clean,
so let's tag these tokens as T_STRING.
*/

if ($tokenIsArray === true
&& ($token[0] === T_FUNCTION
|| $token[0] === T_FN)
&& $finalTokens[$lastNotEmptyToken]['code'] !== T_USE
) {
if ($token[0] === T_FUNCTION) {
for ($x = ($stackPtr + 1); $x < $numTokens; $x++) {
if (is_array($tokens[$x]) === false
|| (isset(Util\Tokens::$emptyTokens[$tokens[$x][0]]) === false
&& $tokens[$x][1] !== '&')
) {
// Non-empty content.
break;
}
}

if ($x < $numTokens
&& is_array($tokens[$x]) === true
&& $tokens[$x][0] !== T_STRING
&& $tokens[$x][0] !== T_NAME_QUALIFIED
) {
if (PHP_CODESNIFFER_VERBOSITY > 1) {
$oldType = Util\Tokens::tokenName($tokens[$x][0]);
echo "\t\t* token $x changed from $oldType to T_STRING".PHP_EOL;
}

$tokens[$x][0] = T_STRING;
}
}//end if

/*
This is a special condition for T_ARRAY tokens used for
function return types. We want to keep the parenthesis map clean,
so let's tag these tokens as T_STRING.
*/

// Go looking for the colon to start the return type hint.
// Start by finding the closing parenthesis of the function.
$parenthesisStack = [];
Expand Down Expand Up @@ -1926,31 +1934,31 @@ function return types. We want to keep the parenthesis map clean,
$newStackPtr++;
}
} else {
if ($tokenIsArray === true && $token[0] === T_STRING) {
// Some T_STRING tokens should remain that way
// due to their context.
if (isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true) {
// Special case for syntax like: return new self
// where self should not be a string.
if ($finalTokens[$lastNotEmptyToken]['code'] === T_NEW
&& strtolower($token[1]) === 'self'
) {
$finalTokens[$newStackPtr] = [
'content' => $token[1],
'code' => T_SELF,
'type' => 'T_SELF',
];
} else {
$finalTokens[$newStackPtr] = [
'content' => $token[1],
'code' => T_STRING,
'type' => 'T_STRING',
];
}
// Some T_STRING tokens should remain that way due to their context.
if ($tokenIsArray === true
&& $token[0] === T_STRING
&& isset($this->tstringContexts[$finalTokens[$lastNotEmptyToken]['code']]) === true
) {
// Special case for syntax like: return new self
// where self should not be a string.
if ($finalTokens[$lastNotEmptyToken]['code'] === T_NEW
&& strtolower($token[1]) === 'self'
) {
$finalTokens[$newStackPtr] = [
'content' => $token[1],
'code' => T_SELF,
'type' => 'T_SELF',
];
} else {
$finalTokens[$newStackPtr] = [
'content' => $token[1],
'code' => T_STRING,
'type' => 'T_STRING',
];
}

$newStackPtr++;
continue;
}//end if
$newStackPtr++;
continue;
}//end if

$newToken = null;
Expand Down Expand Up @@ -2114,16 +2122,6 @@ function return types. We want to keep the parenthesis map clean,
$newToken['type'] = 'T_FINALLY';
}

// This is a special case for the PHP 5.5 classname::class syntax
// where "class" should be T_STRING instead of T_CLASS.
if (($newToken['code'] === T_CLASS
|| $newToken['code'] === T_FUNCTION)
&& $finalTokens[$lastNotEmptyToken]['code'] === T_DOUBLE_COLON
) {
$newToken['code'] = T_STRING;
$newToken['type'] = 'T_STRING';
}

// This is a special case for PHP 5.6 use function and use const
// where "function" and "const" should be T_STRING instead of T_FUNCTION
// and T_CONST.
Expand Down Expand Up @@ -2819,34 +2817,11 @@ protected function processAdditional()
$this->tokens[$i]['code'] = T_STRING;
$this->tokens[$i]['type'] = 'T_STRING';
}
} else if ($this->tokens[$i]['code'] === T_CONST) {
// Context sensitive keywords support.
for ($x = ($i + 1); $i < $numTokens; $x++) {
if (isset(Util\Tokens::$emptyTokens[$this->tokens[$x]['code']]) === false) {
// Non-whitespace content.
break;
}
}

if ($this->tokens[$x]['code'] !== T_STRING) {
if (PHP_CODESNIFFER_VERBOSITY > 1) {
$line = $this->tokens[$x]['line'];
$type = $this->tokens[$x]['type'];
echo "\t* token $x on line $line changed from $type to T_STRING".PHP_EOL;
}

$this->tokens[$x]['code'] = T_STRING;
$this->tokens[$x]['type'] = 'T_STRING';
}
} else if ($this->tokens[$i]['code'] === T_READONLY
|| ($this->tokens[$i]['code'] === T_STRING
&& strtolower($this->tokens[$i]['content']) === 'readonly')
} else if ($this->tokens[$i]['code'] === T_STRING
&& strtolower($this->tokens[$i]['content']) === 'readonly'
) {
/*
Adds "readonly" keyword support:
PHP < 8.1: Converts T_STRING to T_READONLY
PHP >= 8.1: Converts some T_READONLY to T_STRING because token_get_all()
without the TOKEN_PARSE flag cannot distinguish between them in some situations.
Adds "readonly" keyword support for PHP < 8.1.
*/

$allowedAfter = [
Expand Down Expand Up @@ -2890,22 +2865,14 @@ protected function processAdditional()
}
}

if ($this->tokens[$i]['code'] === T_STRING && $shouldBeReadonly === true) {
if ($shouldBeReadonly === true) {
if (PHP_CODESNIFFER_VERBOSITY > 1) {
$line = $this->tokens[$i]['line'];
echo "\t* token $i on line $line changed from T_STRING to T_READONLY".PHP_EOL;
}

$this->tokens[$i]['code'] = T_READONLY;
$this->tokens[$i]['type'] = 'T_READONLY';
} else if ($this->tokens[$i]['code'] === T_READONLY && $shouldBeReadonly === false) {
if (PHP_CODESNIFFER_VERBOSITY > 1) {
$line = $this->tokens[$i]['line'];
echo "\t* token $i on line $line changed from T_READONLY to T_STRING".PHP_EOL;
}

$this->tokens[$i]['code'] = T_STRING;
$this->tokens[$i]['type'] = 'T_STRING';
}

continue;
Expand Down
Loading

0 comments on commit 5eebcf0

Please sign in to comment.