* lang_cpp/parsing/parse_cpp.ml: Use Parse_info.tokenize_all_and_adjust_pos

This will help semgrep/semgrep#1925

The helper tokenize_all_and_adjust_pos correctly intercepts Lexical_error and adjusts the file position of the token inside the Lexical_error. When I introduced this helper function, I forgot to use it for the C/C++ parser (not sure why, maybe because the code was also handling ExpandedTok).

test plan:
$ semgrep -l c -e 'FOO' /tmp/foo.c
ran 1 rules on 1 files: 0 findings
1 files could not be analyzed; run with --verbose for details or run with --strict to exit non-zero if any file cannot be analyzed

does not generate a Python backtrace anymore. Same with

$ /home/pad/semgrep/_build/default/cli/Main.exe -dump_ast /tmp/foo.c
/tmp/foo.c:3:0: Lexical error: unrecognised symbol, in token rule:#
Raised at file "parsing/Parse_code.ml", line 144, characters 24-27
Called from file "parsing/Parse_code.ml", line 236, characters 18-48
Called from file "cli/Main.ml", line 855, characters 6-72
Called from file "pfff/h_program-lang/Error_code.ml", line 388, characters 4-8

no more "NO FILE INFO YET" exn.
semgrep · Nov 9, 2020 · f0b5b5b · f0b5b5b
1 parent 5b7211c
commit f0b5b5b
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 36 deletions.
diff --git a/h_program-lang/Parse_info.ml b/h_program-lang/Parse_info.ml
@@ -588,8 +588,13 @@ let tokenize_all_and_adjust_pos ?(unicode_hack=false)
     { ii with token =
       (* could assert pinfo.filename = file ? *)
        match ii.token with
-       | OriginTok pi -> OriginTok(complete_token_location_large file table pi)
-       | _ -> raise Todo
+       | OriginTok pi -> 
+         OriginTok(complete_token_location_large file table pi)
+       | ExpandedTok (pi,vpi, off) ->
+         ExpandedTok(complete_token_location_large file table pi,vpi,  off)
+       | FakeTokStr (s,vpi_opt) -> 
+         FakeTokStr (s,vpi_opt)
+       | Ab -> raise Impossible
     }      
   in
   let rec tokens_aux acc = 

diff --git a/lang_cpp/parsing/parse_cpp.ml b/lang_cpp/parsing/parse_cpp.ml
@@ -119,40 +119,10 @@ let is_same_line_or_close line tok =
 (*****************************************************************************)
 
 (* called by parse below *)
-let tokens2 file = 
- let table     = Parse_info.full_charpos_to_pos_large file in
-
- Common.with_open_infile file (fun chan -> 
-  let lexbuf = Lexing.from_channel chan in
-
-    let rec tokens_aux () = 
-      let tok = Lexer.token lexbuf in
-      (* fill in the line and col information *)
-      let tok = tok |> TH.visitor_info_of_tok (fun ii -> 
-        { ii with PI.token=
-          (* could assert pinfo.filename = file ? *)
-          match ii.PI.token with
-          |  PI.OriginTok pi ->
-             PI.OriginTok (Parse_info.complete_token_location_large file 
-                             table pi)
-          | PI.ExpandedTok (pi,vpi, off) ->
-              PI.ExpandedTok(
-                (Parse_info.complete_token_location_large file table pi),vpi, 
-                off)
-          | PI.FakeTokStr (s,vpi_opt) -> PI.FakeTokStr (s,vpi_opt)
-          | PI.Ab -> raise Impossible
-      })
-      in
-
-      if TH.is_eof tok
-      then [tok]
-      else tok::(tokens_aux ())
-    in
-    tokens_aux ()
- )
-
-let tokens a = 
-  Common.profile_code "Parse_cpp.tokens" (fun () -> tokens2 a)
+let tokens file = 
+  Parse_info.tokenize_all_and_adjust_pos ~unicode_hack:false
+    file Lexer.token TH.visitor_info_of_tok TH.is_eof
+[@@profiling]
 
 (*****************************************************************************)
 (* Fuzzy parsing *)