In [6]:
import re
import pandas as pd
from SymTable import SymTable
from PIF import PIF
from FA import FA

class Scanner:
    """Lexical scanner that turns a source file into a PIF and two symbol tables.

    The scanner reads its reserved words, operators and separators from the
    ``tokens/`` directory, uses two finite automata (``FA/Id.txt`` and
    ``FA/Int.txt``) to recognise identifiers and integer constants, and keeps
    identifiers and constants in separate symbol tables.
    """

    def __init__(self, path, size=5):
        """Create a scanner for the source file at *path*.

        Args:
            path: Path of the source file to scan.
            size: Size passed to both ``SymTable`` constructors.
        """
        self.path = path
        self.__pif = PIF()
        self.__sti = SymTable(size)  # symbol table for identifiers
        self.__stc = SymTable(size)  # symbol table for constants
        self.IdFA = FA("FA/Id.txt")
        self.IntFA = FA("FA/Int.txt")

        def read_tokens(token_path):
            # Longest tokens first, so that multi-character tokens are
            # handled before any shorter token they contain.
            with open(token_path, 'r') as f:
                return sorted(f.read().splitlines(), key=len, reverse=True)

        # Read tokens from files
        self.reserved_words = read_tokens('tokens/reserved_words.in')
        self.operators = read_tokens('tokens/operators.in')
        self.separators = read_tokens('tokens/separators.in')

    def isConstant(self, token):
        """Return True if *token* is an integer or string constant.

        Integers are whatever ``IntFA`` accepts; strings are one or more
        letters (case-insensitive) enclosed in double quotes.
        """
        return bool(
            self.IntFA.check(token)
            or re.match(r'^\"[a-z]+\"$', token, re.IGNORECASE)
        )

    def isIdentifier(self, token):
        """Return the result of running *token* through the identifier FA."""
        return self.IdFA.check(token)

    def tokenize(self, line):
        """Split *line* into a list of token strings.

        Separators are padded with spaces first; then, for each operator,
        spaces are inserted around binary uses (operand on both sides) and
        after a unary use at the start of the line. The padded line is
        finally split on whitespace.
        """
        # Add spaces around separators
        for separator in self.separators:
            line = re.sub(rf"({re.escape(separator)})", r' \1 ', line)

        for operator in self.operators:
            op = re.escape(operator)
            # NOTE: flags must be passed by keyword -- the fourth positional
            # argument of re.sub is ``count``, not ``flags``.
            # Add spaces around binary operators that must be surrounded by
            # identifiers or constants
            line = re.sub(
                rf"([a-z0-9\"]+)\s*({op})\s*([a-z0-9\"-]+)",
                r'\1 \2 \3',
                line,
                flags=re.IGNORECASE,
            )
            # Add spaces around unary operators that must be followed by an
            # identifier or constant
            line = re.sub(
                rf"^\s*({op})\s*([a-z0-9\"-]+)",
                r'\1 \2',
                line,
                flags=re.IGNORECASE,
            )

        return line.split()

    def classify(self, token):
        """Return the category of *token*.

        One of ``'res'``, ``'op'``, ``'sep'``, ``'con'``, ``'id'`` or
        ``'error'``; checks are made in that order.
        """
        if token in self.reserved_words:
            return 'res'
        if token in self.operators:
            return 'op'
        if token in self.separators:
            return 'sep'
        if self.isConstant(token):
            return 'con'
        if self.isIdentifier(token):
            return 'id'
        return 'error'

    def scan(self):
        """
        For each line in the source file:
        1. Tokenize the line
        2. Classify each token
        3. Add the token to the PIF
        4. Add the token to the ST if it is an identifier or constant
        5. Print an error message if the token is invalid

        Every token and its category is also collected into the
        ``self.tokens`` DataFrame (columns ``token`` and ``type``).
        """
        rows = {'token': [], 'type': []}
        with open(self.path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                for token in self.tokenize(line):
                    kind = self.classify(token)
                    rows['token'].append(token)
                    rows['type'].append(kind)
                    if kind == 'id':
                        self.__pif.append((kind, self.__sti.getPosition(token)))
                    elif kind == 'con':
                        self.__pif.append((kind, self.__stc.getPosition(token)))
                    elif kind in ('res', 'op', 'sep'):
                        # Fixed tokens carry themselves and no ST position.
                        self.__pif.append((token, -1))
                    else:
                        print(f"\033[91mLine {line_no}: Lexical error: Invalid token {token}\033[0m")
        self.tokens = pd.DataFrame(rows)

    def save(self):
        """Write the PIF and both symbol tables next to the source file.

        Output files are named ``<source>_PIF.out``, ``<source>_STI.out`` and
        ``<source>_STC.out``, where ``<source>`` is the source path without
        its extension.
        """
        import os.path

        # splitext strips only the final extension, so dots elsewhere in the
        # path (e.g. "./IO/p1.txt") and extension-less paths are preserved.
        source, _ext = os.path.splitext(self.path)
        with open(source + "_PIF.out", 'w') as f:
            for token, pos in self.__pif.getElements():
                f.write(f"{token} {pos}\n")
        with open(source + "_STI.out", 'w') as f:
            f.write("Hash Table\n")
            f.write(str(self.__sti))
        with open(source + "_STC.out", 'w') as f:
            f.write("Hash Table\n")
            f.write(str(self.__stc))

    def printPIF(self):
        """Display the PIF as a two-column (token, pos) table."""
        print('Program Internal Form:')
        # The PIF lives in the private attribute; there is no ``self.pif``.
        display(pd.DataFrame(self.__pif.getElements(), columns=['token', 'pos']))

    def printST(self):
        """Print the identifier and constant symbol tables."""
        print('Symbol Table (identifiers):')
        print(self.__sti)
        print('Symbol Table (constants):')
        print(self.__stc)

    def printTokens(self):
        """Display the token/type table built by ``scan``."""
        display(self.tokens)

In [10]:
# Run the scanner on the sample program IO/p1err.txt.
obj = Scanner('IO/p1err.txt')
# Tokenize and classify every line; invalid tokens are reported on stdout.
obj.scan()
# Write the PIF and both symbol tables to *_PIF.out / *_STI.out / *_STC.out.
obj.save()
# Show the token/type table collected during the scan.
obj.printTokens()

[91mLine 1: Lexical error: Invalid token 1a[0m
[91mLine 18: Lexical error: Invalid token |[0m


Unnamed: 0,token,type
0,1a,error
1,",",sep
2,b,id
3,",",sep
4,c,id
5,",",sep
6,max,id
7,:,op
8,int,res
9,>>,op
