-
Notifications
You must be signed in to change notification settings - Fork 768
Description
I think there is an issue with the TeraTerm lexer which causes Pygments to guess it for many code snippets (more than 50% of the random snippets I try).
Here's an example of text that is correctly identified as C in Pygments 2.3.1 but identified as TeraTerm in Pygments 2.4.0 and above (including 2.4.2):
from pygments.lexers import guess_lexer
TEST_C = '''
#include <stdio.h>
#include <stdlib.h>
int main(void);
int main(void) {
uint8_t x = 42;
uint8_t y = x + 1;
/* exit 1 for success! */
return 1;
}
'''
print(guess_lexer(TEST_C))
I think it might be because of this special scoring logic:
pygments/pygments/lexers/teraterm.py
Lines 152 to 158 in c3fdd7b
| # Turtle and Tera Term macro files share the same file extension | |
| # but each has a recognizable and distinct syntax. | |
| def analyse_text(text): | |
| result = 0.0 | |
| if re.search(TeraTermLexer.tokens['commands'][0][0], text): | |
| result += 0.60 | |
| return result |
I'm not quite sure what the fix is here; it seems like analyse_text is used to pick between filename matches, but in this case, where no filename is provided, a large number of samples will be guessed as TeraTerm, since most languages use some of the commands it's looking for:
pygments/pygments/lexers/teraterm.py
Lines 57 to 97 in c3fdd7b
| 'commands': [ | |
| ( | |
| r'(?i)\b(' | |
| r'basename|beep|bplusrecv|bplussend|break|bringupbox|' | |
| r'callmenu|changedir|checksum16|checksum16file|' | |
| r'checksum32|checksum32file|checksum8|checksum8file|' | |
| r'clearscreen|clipb2var|closesbox|closett|code2str|' | |
| r'connect|continue|crc16|crc16file|crc32|crc32file|' | |
| r'cygconnect|delpassword|dirname|dirnamebox|disconnect|' | |
| r'dispstr|do|else|elseif|enablekeyb|end|endif|enduntil|' | |
| r'endwhile|exec|execcmnd|exit|expandenv|fileclose|' | |
| r'fileconcat|filecopy|filecreate|filedelete|filelock|' | |
| r'filemarkptr|filenamebox|fileopen|fileread|filereadln|' | |
| r'filerename|filesearch|fileseek|fileseekback|filestat|' | |
| r'filestrseek|filestrseek2|filetruncate|fileunlock|' | |
| r'filewrite|filewriteln|findclose|findfirst|findnext|' | |
| r'flushrecv|foldercreate|folderdelete|foldersearch|for|' | |
| r'getdate|getdir|getenv|getfileattr|gethostname|' | |
| r'getipv4addr|getipv6addr|getmodemstatus|getpassword|' | |
| r'getspecialfolder|gettime|gettitle|getttdir|getver|' | |
| r'if|ifdefined|include|inputbox|int2str|intdim|' | |
| r'ispassword|kmtfinish|kmtget|kmtrecv|kmtsend|listbox|' | |
| r'loadkeymap|logautoclosemode|logclose|loginfo|logopen|' | |
| r'logpause|logrotate|logstart|logwrite|loop|makepath|' | |
| r'messagebox|mpause|next|passwordbox|pause|quickvanrecv|' | |
| r'quickvansend|random|recvln|regexoption|restoresetup|' | |
| r'return|rotateleft|rotateright|scprecv|scpsend|send|' | |
| r'sendbreak|sendbroadcast|sendfile|sendkcode|sendln|' | |
| r'sendlnbroadcast|sendlnmulticast|sendmulticast|setbaud|' | |
| r'setdate|setdebug|setdir|setdlgpos|setdtr|setecho|' | |
| r'setenv|setexitcode|setfileattr|setflowctrl|' | |
| r'setmulticastname|setpassword|setrts|setsync|settime|' | |
| r'settitle|show|showtt|sprintf|sprintf2|statusbox|' | |
| r'str2code|str2int|strcompare|strconcat|strcopy|strdim|' | |
| r'strinsert|strjoin|strlen|strmatch|strremove|' | |
| r'strreplace|strscan|strspecial|strsplit|strtrim|' | |
| r'testlink|then|tolower|toupper|unlink|until|uptime|' | |
| r'var2clipb|wait|wait4all|waitevent|waitln|waitn|' | |
| r'waitrecv|waitregex|while|xmodemrecv|xmodemsend|' | |
| r'yesnobox|ymodemrecv|ymodemsend|zmodemrecv|zmodemsend' | |
| r')\b', |