Skip to content

Commit

Permalink
Implement "#include".
Browse files Browse the repository at this point in the history
  • Loading branch information
rui314 committed Aug 25, 2018
1 parent e188ffd commit a382606
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 62 deletions.
12 changes: 7 additions & 5 deletions 9cc.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,18 @@ typedef struct {
char len;

// For error reporting
char *buf;
char *filename;
char *start;
} Token;

Vector *tokenize(char *p);
Vector *tokenize(char *path, bool add_eof);
noreturn void bad_token(Token *t, char *msg);

/// preprocess.c

Vector *preprocess(Vector *tokens);

/// parse.c

enum {
Expand Down Expand Up @@ -353,7 +359,3 @@ extern char *regs32[];
extern int num_regs;

void gen_x86(Vector *globals, Vector *fns);

/// main.c

char *filename;
9 changes: 6 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ test: 9cc test/test.c
./9cc -test

@gcc -E -P test/test.c | ./9cc - > tmp-test1.s
@./9cc test/token.c > tmp-test2.s
@gcc -c -o tmp-test2.o test/gcc.c
@gcc -static -o tmp-test tmp-test1.s tmp-test2.s tmp-test2.o
@./tmp-test
@gcc -static -o tmp-test1 tmp-test1.s tmp-test2.o
@./tmp-test1

@./9cc test/token.c > tmp-test2.s
@gcc -static -o tmp-test2 tmp-test2.s
@./tmp-test2

clean:
rm -f 9cc *.o *~ tmp* a.out test/*~
Expand Down
36 changes: 5 additions & 31 deletions main.c
Original file line number Diff line number Diff line change
@@ -1,31 +1,5 @@
#include "9cc.h"

char *filename;

// Reads the entire contents of `path` into a freshly allocated
// buffer. "-" means stdin. The result is guaranteed to end with a
// newline so downstream line-oriented code never runs off the end.
// Exits with a message if the file cannot be opened.
//
// The parameter is named `path` (not `filename`) to avoid shadowing
// the file-scope `char *filename` global.
static char *read_file(char *path) {
  FILE *fp = stdin;
  if (strcmp(path, "-")) {
    fp = fopen(path, "r");
    if (!fp) {
      perror(path);
      exit(1);
    }
  }

  StringBuilder *sb = new_sb();
  char buf[4096];
  for (;;) {
    int nread = fread(buf, 1, sizeof(buf), fp);
    if (nread == 0)
      break;
    sb_append_n(sb, buf, nread);
  }

  // Don't leak the FILE handle for a real file.
  if (fp != stdin)
    fclose(fp);

  // Ensure a trailing newline. The previous check read
  // sb->data[sb->len], one byte past the written contents, and
  // misbehaved on an empty input.
  if (sb->len == 0 || sb->data[sb->len - 1] != '\n')
    sb_add(sb, '\n');
  return sb_get(sb);
}

void usage() { error("Usage: 9cc [-test] [-dump-ir1] [-dump-ir2] <file>"); }

int main(int argc, char **argv) {
Expand All @@ -37,24 +11,24 @@ int main(int argc, char **argv) {
return 0;
}

char *path;
bool dump_ir1 = false;
bool dump_ir2 = false;

if (argc == 3 && !strcmp(argv[1], "-dump-ir1")) {
dump_ir1 = true;
filename = argv[2];
path = argv[2];
} else if (argc == 3 && !strcmp(argv[1], "-dump-ir2")) {
dump_ir2 = true;
filename = argv[2];
path = argv[2];
} else {
if (argc != 2)
usage();
filename = argv[1];
path = argv[1];
}

// Tokenize and parse.
char *input = read_file(filename);
Vector *tokens = tokenize(input);
Vector *tokens = tokenize(path, true);
Vector *nodes = parse(tokens);
Vector *globals = sema(nodes);
Vector *fns = gen_ir(nodes);
Expand Down
35 changes: 35 additions & 0 deletions preprocess.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// C preprocessor

#include "9cc.h"

// Expands preprocessor directives in a token stream.
//
// Currently only `#include "file"` is supported: the named file is
// tokenized (without a trailing TK_EOF token) and its tokens are
// spliced in place of the directive. All other tokens are copied
// through unchanged. The '\n' tokens that delimit directives are kept
// here; the tokenizer strips them afterwards.
Vector *preprocess(Vector *tokens) {
  Vector *v = new_vec();

  for (int i = 0; i < tokens->len;) {
    Token *t = tokens->data[i];

    // Ordinary token: pass it through.
    if (t->ty != '#') {
      i++;
      vec_push(v, t);
      continue;
    }

    // A directive needs three more tokens: `include`, "path", '\n'.
    // Guard against a directive truncated at the very end of the
    // stream (e.g. a stray '#' at the end of an #include'd file,
    // which is tokenized without a trailing EOF token) — the
    // unchecked `++i` accesses below would read past tokens->len.
    if (i + 3 >= tokens->len)
      bad_token(t, "premature end of input after '#'");

    t = tokens->data[++i];
    if (t->ty != TK_IDENT || strcmp(t->name, "include"))
      bad_token(t, "'include' expected");

    t = tokens->data[++i];
    if (t->ty != TK_STR)
      bad_token(t, "string expected");

    char *path = t->str;

    t = tokens->data[++i];
    if (t->ty != '\n')
      bad_token(t, "newline expected");

    // Splice in the included file's tokens. Use a distinct index so
    // it does not shadow the outer loop variable `i` (-Wshadow).
    Vector *nv = tokenize(path, false);
    for (int j = 0; j < nv->len; j++)
      vec_push(v, nv->data[j]);
  }
  return v;
}
7 changes: 7 additions & 0 deletions test/test1.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Fixture for the preprocessor test: the #include directive below is
// expanded to the contents of test/test2.inc (a printf("OK\n") call),
// so the tokenizer/preprocessor must splice tokens mid-function.
int printf();

int main() {
#include "test/test2.inc"
1; 2;
return 0;
}
1 change: 1 addition & 0 deletions test/test2.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
printf("OK\n");
8 changes: 3 additions & 5 deletions test/token.c
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
// This file contains tests for the tokenizer.
//
// Note that we don't actually use the function defined by this file
// because we are interested only in knowing whether the tokenizer can
// tokenize this file or not.
// This file contains tests for the tokenizer and the preprocessor.

// a line comment\
continues\
Expand All @@ -12,3 +8,5 @@ to this line
/* block comment
**
*/

#include "test/test1.inc"
Expand Down
96 changes: 78 additions & 18 deletions token.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@

// Error reporting

static char *input_file;
static char *buf;
static char *filename;

// Finds a line pointed by a given pointer from the input file
// to print it out.
static void print_line(char *pos) {
char *start = input_file;
static void print_line(char *start, char *path, char *pos) {
int line = 0;
int col = 0;

for (char *p = input_file; p; p++) {
for (char *p = start; p; p++) {
if (*p == '\n') {
start = p + 1;
line++;
Expand All @@ -24,7 +24,7 @@ static void print_line(char *pos) {
continue;
}

fprintf(stderr, "error at %s:%d:%d\n\n", filename, line + 1, col + 1);
fprintf(stderr, "error at %s:%d:%d\n\n", path, line + 1, col + 1);

int linelen = strchr(p, '\n') - start;
fprintf(stderr, "%.*s\n", linelen, start);
Expand All @@ -37,7 +37,7 @@ static void print_line(char *pos) {
}

// Reports an error at the location of token `t` and exits: prints
// the offending source line (using the buffer and filename recorded
// on the token, so errors in #include'd files point at the right
// file), then the message.
noreturn void bad_token(Token *t, char *msg) {
  print_line(t->buf, t->filename, t->start);
  error(msg);
}

Expand All @@ -53,6 +53,8 @@ static Token *add(int ty, char *start) {
Token *t = calloc(1, sizeof(Token));
t->ty = ty;
t->start = start;
t->filename = filename;
t->buf = buf;
vec_push(tokens, t);
return t;
}
Expand Down Expand Up @@ -80,6 +82,30 @@ static char escaped[256] = {
['v'] = '\v', ['e'] = '\033', ['E'] = '\033',
};

// Reads the entire contents of `path` into a freshly allocated
// buffer. "-" means stdin. The result is guaranteed to end with a
// newline so the tokenizer never runs off the end of the last line.
// Exits with a message if the file cannot be opened.
static char *read_file(char *path) {
  FILE *fp = stdin;
  if (strcmp(path, "-")) {
    fp = fopen(path, "r");
    if (!fp) {
      perror(path);
      exit(1);
    }
  }

  StringBuilder *sb = new_sb();
  char buf[4096];
  for (;;) {
    int nread = fread(buf, 1, sizeof(buf), fp);
    if (nread == 0)
      break;
    sb_append_n(sb, buf, nread);
  }

  // Don't leak the FILE handle for a real file.
  if (fp != stdin)
    fclose(fp);

  // Ensure a trailing newline. The previous check read
  // sb->data[sb->len], one byte past the written contents, and
  // misbehaved on an empty input.
  if (sb->len == 0 || sb->data[sb->len - 1] != '\n')
    sb_add(sb, '\n');
  return sb_get(sb);
}

static Map *keyword_map() {
Map *map = new_map();
map_puti(map, "_Alignof", TK_ALIGNOF);
Expand All @@ -104,7 +130,7 @@ static char *block_comment(char *pos) {
for (char *p = pos + 2; *p; p++)
if (!strncmp(p, "*/", 2))
return p + 2;
print_line(pos);
print_line(buf, filename, pos);
error("unclosed comment");
}

Expand Down Expand Up @@ -215,10 +241,18 @@ static char *number(char *p) {

// Tokenized input is stored to this array.
static void scan() {
char *p = input_file;
char *p = buf;

loop:
while (*p) {
// New line (preprocessor-only token)
if (*p == '\n') {
add(*p, p);
p++;
continue;
}

// Whitespace
if (isspace(*p)) {
p++;
continue;
Expand Down Expand Up @@ -262,7 +296,7 @@ static void scan() {
}

// Single-letter symbol
if (strchr("+-*/;=(),{}<>[]&.!?:|^%~", *p)) {
if (strchr("+-*/;=(),{}<>[]&.!?:|^%~#", *p)) {
add(*p, p);
p++;
continue;
Expand All @@ -280,15 +314,13 @@ static void scan() {
continue;
}

print_line(p);
print_line(buf, filename, p);
error("cannot tokenize");
}

add(TK_EOF, p);
}

static void canonicalize_newline() {
char *p = input_file;
char *p = buf;
for (char *q = p; *q;) {
if (q[0] == '\r' && q[1] == '\n')
q++;
Expand All @@ -298,7 +330,7 @@ static void canonicalize_newline() {
}

static void remove_backslash_newline() {
char *p = input_file;
char *p = buf;
for (char *q = p; *q;) {
if (q[0] == '\\' && q[1] == '\n')
q += 2;
Expand All @@ -308,6 +340,16 @@ static void remove_backslash_newline() {
*p = '\0';
}

// Drops the '\n' tokens that exist only for the preprocessor's
// benefit, leaving a stream the parser can consume.
static void strip_newlines() {
  Vector *kept = new_vec();
  for (int i = 0; i < tokens->len; i++) {
    Token *tok = tokens->data[i];
    if (tok->ty == '\n')
      continue;
    vec_push(kept, tok);
  }
  tokens = kept;
}

static void append(Token *x, Token *y) {
StringBuilder *sb = new_sb();
sb_append_n(sb, x->str, x->len - 1);
Expand All @@ -333,14 +375,32 @@ static void join_string_literals() {
tokens = v;
}

// Tokenizes the file at `path` ("-" means stdin) and returns the
// resulting token vector, already preprocessed: #include directives
// are expanded, preprocessor-only '\n' tokens are stripped, and
// adjacent string literals are joined. `add_eof` controls whether a
// TK_EOF token is appended; it is false when tokenizing an
// #include'd file so no EOF lands in the middle of the including
// file's stream.
//
// The tokenizer state (tokens, filename, buf) lives in file-scope
// globals; it is saved on entry and restored before returning so
// that preprocess() can call tokenize() recursively.
Vector *tokenize(char *path, bool add_eof) {
  if (!keywords)
    keywords = keyword_map();

  // Save globals for re-entrancy.
  Vector *tokens_ = tokens;
  char *filename_ = filename;
  char *buf_ = buf;

  tokens = new_vec();
  filename = path;
  buf = read_file(path);

  canonicalize_newline();
  remove_backslash_newline();

  scan();
  if (add_eof)
    add(TK_EOF, buf);

  tokens = preprocess(tokens);
  strip_newlines();
  join_string_literals();

  // Restore the saved globals before handing back the result.
  Vector *ret = tokens;
  buf = buf_;
  tokens = tokens_;
  filename = filename_;
  return ret;
}

0 comments on commit a382606

Please sign in to comment.