In [5]:
#The Token class represents one individual token identified by the Lexer class
#__init__ is the constructor; it initializes the object with its token identity and its matched value
#__repr__ is the special Python method that returns the string representation of the object

class Token:
    """A single lexical token: the category it belongs to plus the matched text."""

    def __init__(self, token_identity, value):
        """Store the token's category name and the value that was matched for it."""
        self.token_identity, self.value = token_identity, value

    def __repr__(self):
        """Return the token rendered as "<token_identity>: <value>"."""
        identity, lexeme = self.token_identity, self.value
        return f"{identity}: {lexeme}"
    


import re     #python module used for the regular expressions

class Lexer:
    """Regex-driven lexer that turns source text into a list of Token objects.

    Patterns in ``math_operations`` are tried in order at the current position
    and the first one that matches wins, so the order of the table matters.
    """

    # Ordered (regex, token identity) table. First match wins, therefore any
    # pattern that shares a prefix with another must come before the shorter one:
    # '==' before '=', '<=' before '<', '>=' before '>', float before integer.
    # A token identity of None means "match and discard" (whitespace).
    math_operations = [
        (r'\+', 'ADDITION'),
        (r'\-', 'SUBTRACTION'),
        (r'\*', 'MULTIPLICATION'),
        (r'/', 'DIVISION'),
        (r'%', 'MODULO'),
        (r'\(', 'GROUPING_SYMBOL'),
        (r'\)', 'GROUPING_SYMBOL'),
        (r'==', 'EQUALS'),
        (r'=', 'ASSIGNMENT'),
        (r'<=', 'LESS_THAN_OR_EQUAL_TO'),
        (r'<', 'LESS_THAN'),
        (r'>=', 'GREATER_THAN_OR_EQUAL_TO'),
        (r'>', 'GREATER_THAN'),
        (r'&&', 'LOGICAL_AND'),
        (r'\|\|', 'LOGICAL_OR'),
        # NOTE(review): an identifier is exactly one lowercase letter, so "abc"
        # lexes as three tokens -- confirm this is what the language spec wants.
        (r'[a-z]', 'VARIABLE_IDENTIFIER'),
        (r'\d+\.\d+', 'FLOATING_POINT_LITERAL'),
        (r'\d+(i8|i16|i32|i64)?', 'INTEGER_LITERAL'),
        (r'\s+', None),
    ]

    def Tokenization(self, text):
        """Perform lexical analysis on ``text``.

        Args:
            text: The source string to tokenize.

        Returns:
            A list of Token objects in the order they appear in ``text``.
            Whitespace is consumed but produces no token.

        Raises:
            ValueError: If no pattern in ``math_operations`` matches at some
                position (previously this case looped forever).
        """
        # Compile once per call; read through self so an overridden table is honoured.
        compiled_operations = [
            (re.compile(operator), token_identity)
            for operator, token_identity in self.math_operations
        ]

        the_tokens = []
        current_position = 0
        text_length = len(text)

        while current_position < text_length:
            for pattern, token_identity in compiled_operations:
                # match(text, pos) anchors at pos without copying the rest of the text.
                matcher = pattern.match(text, current_position)
                # An empty match would not advance the cursor, so treat it as no match.
                if matcher and matcher.end() > current_position:
                    if token_identity:  # falsy identity (None) marks text to skip
                        the_tokens.append(Token(token_identity, matcher.group(0)))
                    current_position = matcher.end()
                    break
            else:
                # No pattern consumed anything: fail loudly instead of spinning forever.
                raise ValueError(
                    f"Unrecognized character {text[current_position]!r} "
                    f"at position {current_position}"
                )

        return the_tokens


#IPython magics (optional): load the autoreload extension and select mode 2 so edits to imported modules are picked up without restarting the kernel
#this cell imports no local module, so the Token and Lexer classes above do not depend on these two lines
#re-running this cell makes %load_ext print an "already loaded" notice, which is harmless
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Name of the sample input file that will be tokenized
textfile_example = "exampleHW3.txt"

# Load the complete file contents into a single string
with open(textfile_example, mode="r") as source_file:
    text = source_file.read()

# Build the lexer, then run it over the text that was just read
lexer = Lexer()
output_tokens = lexer.Tokenization(text)

# Show every token, followed by how many were found
token_total = len(output_tokens)
print(output_tokens)
print("Total number of tokens identified: ", token_total)

[VARIABLE_IDENTIFIER: a, ASSIGNMENT: =, GROUPING_SYMBOL: (, INTEGER_LITERAL: 12, MULTIPLICATION: *, INTEGER_LITERAL: 3, MULTIPLICATION: *, INTEGER_LITERAL: 4, ADDITION: +, INTEGER_LITERAL: 7, GROUPING_SYMBOL: ), DIVISION: /, INTEGER_LITERAL: 2]
Total number of tokens identified:  13
