pmd · oowekyala · Apr 7, 2024 · Apr 8, 2024 · Apr 8, 2024 · Apr 8, 2024
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java
@@ -12,6 +12,7 @@
 
 import net.sourceforge.pmd.cpd.CpdLexer;
 import net.sourceforge.pmd.lang.TokenManager;
+import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrLexerBehavior;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrToken;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrTokenManager;
 import net.sourceforge.pmd.lang.document.TextDocument;
@@ -23,7 +24,15 @@ public abstract class AntlrCpdLexer extends CpdLexerBase<AntlrToken> {
     @Override
     protected final TokenManager<AntlrToken> makeLexerImpl(TextDocument doc) throws IOException {
         CharStream charStream = CharStreams.fromReader(doc.newReader(), doc.getFileId().getAbsolutePath());
-        return new AntlrTokenManager(getLexerForSource(charStream), doc);
+        return new AntlrTokenManager(getLexerForSource(charStream), doc, getLexerBehavior());
+    }
+
+    /**
+     * Returns the behavior handed to the {@link AntlrTokenManager} built for each
+     * document. Override this method to customize some aspects of the lexer.
+     */
+    protected AntlrLexerBehavior getLexerBehavior() {
+        return new AntlrLexerBehavior();
     }
 
     protected abstract Lexer getLexerForSource(CharStream charStream);

diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java
@@ -0,0 +1,32 @@
+/**
+ * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
+ */
+
+package net.sourceforge.pmd.lang.ast.impl.antlr4;
+
+import org.antlr.v4.runtime.Token;
+
+import net.sourceforge.pmd.cpd.CpdLanguageProperties;
+
+/**
+ * Strategy to customize some aspects of the mapping
+ * from Antlr tokens to PMD/CPD tokens.
+ */
+public class AntlrLexerBehavior {
+
+
+    /**
+     * Returns the image that the token should have, possibly after applying
+     * a transformation. The default just returns {@link Token#getText()}.
+     * Transformations here are usually normalizations, for instance mapping the image
+     * of all keywords to uppercase/lowercase to implement case-insensitivity, or replacing
+     * the image of literals by a placeholder to implement {@link CpdLanguageProperties#CPD_ANONYMIZE_LITERALS}.
+     *
+     * @param token A token from the Antlr lexer
+     *
+     * @return The image to use for the token
+     */
+    protected String getTokenImage(Token token) {
+        return token.getText();
+    }
+}
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java
@@ -17,9 +17,13 @@
  */
 public class AntlrToken implements GenericToken<AntlrToken> {
 
-    private final Token token;
     private final AntlrToken previousComment;
     private final TextDocument textDoc;
+    private final String image;
+    private final int endOffset;
+    private final int startOffset;
+    private final int channel;
+    private final int kind;
     AntlrToken next;
 
 
@@ -30,10 +34,22 @@ public class AntlrToken implements GenericToken<AntlrToken> {
      * @param previousComment The previous comment
      * @param textDoc         The text document
      */
-    public AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc) {
-        this.token = token;
+    AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc, AntlrLexerBehavior behavior) {
         this.previousComment = previousComment;
         this.textDoc = textDoc;
+        this.image = behavior.getTokenImage(token);
+        this.startOffset = token.getStartIndex();
+        this.endOffset = token.getStopIndex() + 1; // exclusive
+        this.channel = token.getChannel();
+        this.kind = token.getType();
+    }
+
+    /**
+     * @deprecated Don't create Antlr tokens directly, use an {@link AntlrTokenManager}.
+     */
+    @Deprecated
+    public AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc) {
+        this(token, previousComment, textDoc, new AntlrLexerBehavior());
     }
 
     @Override
@@ -48,13 +64,13 @@ public AntlrToken getPreviousComment() {
 
     @Override
     public CharSequence getImageCs() {
-        return token.getText();
+        return image;
     }
 
     /** Returns a text region with the coordinates of this token. */
     @Override
     public TextRegion getRegion() {
-        return TextRegion.fromBothOffsets(token.getStartIndex(), token.getStopIndex() + 1);
+        return TextRegion.fromBothOffsets(startOffset, endOffset);
     }
 
     @Override
@@ -74,14 +90,14 @@ public int compareTo(AntlrToken o) {
 
     @Override
     public int getKind() {
-        return token.getType();
+        return kind;
     }
 
     public boolean isHidden() {
         return !isDefault();
     }
 
     public boolean isDefault() {
-        return token.getChannel() == Lexer.DEFAULT_TOKEN_CHANNEL;
+        return channel == Lexer.DEFAULT_TOKEN_CHANNEL;
     }
 }
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java
@@ -20,12 +20,20 @@ public class AntlrTokenManager implements TokenManager<AntlrToken> {
 
     private final Lexer lexer;
     private final TextDocument textDoc;
+    private final AntlrLexerBehavior behavior;
     private AntlrToken previousToken;
 
 
     public AntlrTokenManager(final Lexer lexer, final TextDocument textDocument) {
+        this(lexer, textDocument, new AntlrLexerBehavior());
+    }
+
+    public AntlrTokenManager(final Lexer lexer,
+                             final TextDocument textDocument,
+                             final AntlrLexerBehavior behavior) {
         this.lexer = lexer;
         this.textDoc = textDocument;
+        this.behavior = behavior;
         resetListeners();
     }
 
@@ -40,7 +48,7 @@ public AntlrToken getNextToken() {
 
     private AntlrToken getNextTokenFromAnyChannel() {
         final AntlrToken previousComment = previousToken != null && previousToken.isHidden() ? previousToken : null;
-        final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc);
+        final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc, this.behavior);
         if (previousToken != null) {
             previousToken.next = currentToken;
         }

diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/javacc/JavaccToken.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/javacc/JavaccToken.java
@@ -147,6 +147,11 @@ public String getImage() {
         return image.toString();
     }
 
+    /** Returns the original text of the token, as found in the document. The image, in contrast, may be normalized. */
+    public Chars getText() {
+        return document.getTextDocument().sliceOriginalText(getRegion());
+    }
+
     @Override
     public final TextRegion getRegion() {
         return TextRegion.fromBothOffsets(startOffset, endOffset);