In [1]:
#
### CONSTANTS
# 

import string

# Character classes the lexer uses to recognise identifiers and numbers.
LETTERS = string.ascii_lowercase + string.ascii_uppercase
DIGITS = string.digits
LETTERS_DIGITS = ''.join((LETTERS, DIGITS))





#
### ERRORS
#

class Error:
    """Base error type: a category name plus free-form details."""

    def __init__(self, error_name, details):
        self.error_name, self.details = error_name, details

    def as_string(self):
        """Return the error rendered as ``'<error_name> : <details>'``."""
        return '{} : {}'.format(self.error_name, self.details)

class IllegalCharError(Error):
    """Error whose name is fixed to ``'Illegal Character'``."""

    def __init__(self, details):
        super().__init__(error_name='Illegal Character', details=details)
        

#
### TOKENS
#

# Numeric literals.
TT_INT = 'TT_INT'
TT_FLOAT = 'FLOAT'

# Arithmetic operators and the percent sign.
TT_PLUS = 'PLUS'
TT_MINUS = 'MINUS'
TT_MUL = 'MUL'
TT_DIV = 'DIV'
TT_PERCENT = '%'

# Grouping and punctuation.
TT_LPAREN = 'LPAREN'
TT_RPAREN = 'RPAREN'
TT_LBRACKET = 'LBRACKET'
TT_RBRACKET = 'RBRACKET'
TT_COMMA = 'COMMA'
TT_EQ = 'EQ'

# Names, reserved words and photo file names (e.g. ``name.jpg``).
TT_INDENTIFIER = 'INDENTIFIER'
TT_KEYWORD = 'KEYWORD'
TT_PHOTO = 'PHOTO'

# Statement separator and end-of-input marker.
TT_NEWLINE = 'NEWLINE'
TT_EOF = 'EOF'

# Identifiers listed here are lexed as TT_KEYWORD instead of TT_INDENTIFIER.
KEYWORDS = [
    'VAR',
    'extract', 'from',
    'compute',
    'mean', 'stdev', 'max', 'min', 'median', 'mode', 'sum',
    'on', 'each', 'rows', 'to',
]

class Token:
    """A lexical token: a type tag plus an optional value.

    Attributes:
        type:  one of the ``TT_*`` type strings.
        value: payload of the token (number, identifier text, ...), or
               ``None`` for tokens that carry no value.
    """

    def __init__(self, type_, value=None):
        self.type = type_
        self.value = value

    def __repr__(self):
        # Compare against None rather than relying on truthiness, so that
        # falsy payloads such as the integer 0 are still shown.
        if self.value is not None:
            return f'{self.type} : {self.value}'
        return f'{self.type}'

    def matches(self, type_, value):
        """Return True when both the type and the value equal the given ones."""
        return self.type == type_ and self.value == value
    
#
### LEXER
# 

class Lexer:
    """Turns source text into a list of ``Token`` objects.

    The lexer walks ``text`` one character at a time. ``current_char`` holds
    the character under the cursor, or ``None`` once the input is exhausted.
    """

    def __init__(self, text):
        self.text = text
        self.pos = -1
        self.current_char = None
        self.advance()

    def advance(self):
        """Move the cursor one character forward and refresh ``current_char``."""
        self.pos += 1
        self.current_char = self.text[self.pos] if self.pos < len(self.text) else None

    def make_tokens(self):
        """Tokenize the whole input.

        Returns:
            ``(tokens, None)`` on success, where ``tokens`` ends with a
            ``TT_EOF`` token, or ``([], error)`` when an unsupported character
            or an invalid photo extension is encountered.
        """
        # Characters that map one-to-one onto a value-less token.
        single_char_tokens = {
            '+': TT_PLUS,
            ',': TT_COMMA,
            '-': TT_MINUS,
            '*': TT_MUL,
            '/': TT_DIV,
            '%': TT_PERCENT,
            '(': TT_LPAREN,
            ')': TT_RPAREN,
            '[': TT_LBRACKET,
            ']': TT_RBRACKET,
            '=': TT_EQ,
        }
        tokens = []

        while self.current_char is not None:
            char = self.current_char

            if char in '  \t':
                # Blanks only separate tokens.
                self.advance()

            elif char in ';\n':
                # Both ';' and a line break end a statement.
                tokens.append(Token(TT_NEWLINE))
                self.advance()

            elif char in DIGITS:
                tokens.append(self.make_number())

            elif char in LETTERS:
                token = self.make_identifier()
                if isinstance(token, Error):
                    # Propagate lexing failures through the error channel
                    # instead of continuing with a truncated token.
                    return [], token
                tokens.append(token)

            elif char in single_char_tokens:
                tokens.append(Token(single_char_tokens[char]))
                self.advance()

            else:
                self.advance()
                return [], IllegalCharError("'" + char + "'")

        tokens.append(Token(TT_EOF))
        return tokens, None

    def make_number(self):
        """Lex an integer or float literal starting at the cursor.

        Returns a ``TT_INT`` token when no dot was seen, otherwise a
        ``TT_FLOAT`` token.
        """
        number_chars = DIGITS + '.'
        num_str = ''
        dot_count = 0

        while self.current_char is not None and self.current_char in number_chars:
            if self.current_char == '.':
                if dot_count == 1:
                    # The second '.' is left unconsumed, so make_tokens reports
                    # it as an illegal character on its next iteration.
                    print("ERROR: INVALID FLOAT")
                    break
                dot_count += 1
            num_str += self.current_char
            self.advance()

        if dot_count == 0:
            return Token(TT_INT, int(num_str))
        return Token(TT_FLOAT, float(num_str))

    def make_identifier(self):
        """Lex an identifier, keyword or photo file name starting at the cursor.

        A name followed by ``.`` and letters is a photo file name and yields a
        ``TT_PHOTO`` token. Otherwise the result is ``TT_KEYWORD`` when the text
        is listed in ``KEYWORDS`` and ``TT_INDENTIFIER`` if not.

        Returns:
            A ``Token``, or an ``Error`` instance when the extension after the
            dot is longer than three letters.
        """
        identifier_chars = LETTERS_DIGITS + '_' + '.'
        id_str = ''

        while self.current_char is not None and self.current_char in identifier_chars:
            if self.current_char == '.':
                # Collect the extension: the dot plus the letters after it.
                self.advance()
                photo_format = '.'
                while self.current_char is not None and self.current_char in LETTERS:
                    photo_format += self.current_char
                    self.advance()

                if len(photo_format) > 4:
                    print("INVALID PHOTO FORMAT, PLEASE USE ONLY J.jpg or .png")
                    # The extension has already been consumed; returning a bare
                    # identifier here would silently drop it, so report it.
                    return Error('Invalid Photo Format', "'" + id_str + photo_format + "'")

                return Token(TT_PHOTO, id_str + photo_format)

            id_str += self.current_char
            self.advance()

        tok_type = TT_KEYWORD if id_str in KEYWORDS else TT_INDENTIFIER
        return Token(tok_type, id_str)
            
        

def run(text):
    """Lex and parse ``text``.

    Returns ``(ast, None)`` on success, or ``(None, error)`` when the lexer
    reports an error. The token list is printed before parsing starts.
    """
    tokens, error = Lexer(text).make_tokens()
    if error:
        return None, error

    # Generate AST
    print(tokens)
    return Parser(tokens).parse(), None
    

In [2]:
# 
### NODES 
# 

class NumberNode:
    """AST leaf that wraps a single token."""

    def __init__(self, tok):
        self.tok = tok

    def __repr__(self):
        # Rendered exactly like the wrapped token.
        return '{}'.format(self.tok)
    

class BinOpNode:
    """AST node joining a left and a right child with an operator."""

    def __init__(self, left_node, op_tok, right_node):
        self.left_node, self.op_tok, self.right_node = left_node, op_tok, right_node

    def __repr__(self):
        return '({}, {}, {})'.format(self.left_node, self.op_tok, self.right_node)
    
    
class VarAccesNode:
    """AST node holding the name token of a variable."""

    def __init__(self, var_name_tok):
        # Token that names the variable.
        self.var_name_tok = var_name_tok
    
        
class VarAssignNode:
    """AST node pairing a name token with the node or value bound to it."""

    def __init__(self, var_name_tok, value_node):
        self.var_name_tok, self.value_node = var_name_tok, value_node

    def __repr__(self):
        return '({}, {})'.format(self.var_name_tok, self.value_node)
    
    

class ColsAssignNode:
    """AST node for a column selection; its name is always ``'COLUMNS'``."""

    def __init__(self, value_node):
        self.value_node = value_node
        self.var_name_tok = 'COLUMNS'

    def __repr__(self):
        return '({}, {})'.format(self.var_name_tok, self.value_node)
    
    
class RowsAssignNode:
    """AST node for a row selection; its name is always ``'ROWS'``."""

    def __init__(self, value_node):
        self.value_node = value_node
        self.var_name_tok = 'ROWS'

    def __repr__(self):
        return '({}, {})'.format(self.var_name_tok, self.value_node)
     
    
    

class ComputationNode:
    """AST node for a ``compute`` statement.

    Attributes:
        method:       what to compute (a token or node).
        var_name_tok: the target the computation runs on.
        details:      optional extra node; ``None`` when absent.
    """

    def __init__(self, method, var_name_tok, details=None):
        self.method = method
        self.var_name_tok = var_name_tok
        self.details = details

    def __repr__(self):
        # With details the target is shown as a (target, details) pair;
        # either way the result is then paired with the method.
        target = self.var_name_tok
        if self.details is not None:
            target = VarAssignNode(self.var_name_tok, self.details)
        return '{}'.format(VarAssignNode(target, self.method))
        
        
class ComputationDetailsNode:
    """AST node holding two tokens with an integer token between them."""

    def __init__(self, tok1, int_, tok2):
        self.tok1 = tok1
        self.int = int_
        self.tok2 = tok2

    def __repr__(self):
        return '({} {} {})'.format(self.tok1, self.int, self.tok2)
    

    
class PercentMethodNode:
    """AST node pairing a token with an integer token."""

    def __init__(self, tok, int_):
        self.tok = tok
        self.int = int_

    def __repr__(self):
        return '({}, {})'.format(self.tok, self.int)

        
class PercentNode:
    """AST node for one value, or two values joined by a token.

    ``tok`` and ``int_2`` default to ``None`` for the single-value form.
    """

    def __init__(self, int_1, tok=None, int_2=None):
        self.int_1 = int_1
        self.tok = tok
        self.int_2 = int_2

    def __repr__(self):
        if self.tok is None:
            # Single value: rendered like a plain number node.
            return '{}'.format(NumberNode(self.int_1))
        return '({}, {},{})'.format(self.int_1, self.tok, self.int_2)

    
    
    
    
    

#
### PARSER
# 


class Parser:
    """Recursive-descent parser that turns a token list into AST nodes.

    Syntax problems are reported with ``print`` and parsing carries on with
    whatever token is current, so a malformed program can still produce nodes.
    """

    # Keywords accepted as the method of a ``compute <method> on ...`` statement.
    COMPUTE_METHODS = ('mean', 'stdev', 'max', 'min', 'median', 'mode', 'sum')

    def __init__(self, tokens):
        self.tokens = tokens
        self.tok_idx = -1
        self.advance()

    def advance(self):
        """Step to the next token and return the current token.

        Past the end of the list ``current_tok`` is left unchanged, so it keeps
        pointing at the last token.
        """
        self.tok_idx += 1
        if self.tok_idx < len(self.tokens):
            self.current_tok = self.tokens[self.tok_idx]

        return self.current_tok

    def _peek(self):
        """Return the token after the current one without consuming it, or None."""
        next_idx = self.tok_idx + 1
        if next_idx < len(self.tokens):
            return self.tokens[next_idx]
        return None

    def parse(self):
        """Parse all tokens and return the list of statement nodes."""
        return self.statements()

    def statements(self):
        """Parse statements until one fails to produce a node.

        Leading ``TT_NEWLINE`` tokens are skipped before each statement. Every
        parsed statement (including the final ``None``) is printed.
        """
        statements = []

        while True:
            while self.current_tok.type == TT_NEWLINE:
                self.advance()

            statement = self.expr()
            print(statement)
            if not statement:
                break
            statements.append(statement)

        return statements

    def factor(self):
        """Parse a number literal; return None when the current token is not one."""
        tok = self.current_tok

        if tok.type in (TT_INT, TT_FLOAT):
            self.advance()
            return NumberNode(tok)
        return None

    def term(self):
        """Parse factors joined by ``*`` or ``/``."""
        return self.bin_op(self.factor, (TT_MUL, TT_DIV))

    def expr(self):
        """Parse one statement or expression.

        Handles ``VAR <name> = <expr>``, ``extract ...``, ``compute ...`` and
        otherwise falls back to terms joined by ``+`` or ``-``.
        """
        if self.current_tok.matches(TT_KEYWORD, 'VAR'):
            self.advance()

            if self.current_tok.type != TT_INDENTIFIER:
                print ("EXPECTED IDENT!!!!!")
                return None

            var_name = self.current_tok
            self.advance()

            if self.current_tok.type != TT_EQ:
                print( "EXPECTED =")
                return None

            self.advance()
            expr = self.expr()
            return VarAssignNode(var_name, expr)

        if self.current_tok.matches(TT_KEYWORD, 'extract'):
            self.advance()
            return self.data_extraction()

        if self.current_tok.matches(TT_KEYWORD, 'compute'):
            self.advance()
            # An integer right after ``compute`` selects the percent form.
            if self.current_tok.type == TT_INT:
                return self.computation_percent()
            return self.computation()

        return self.bin_op(self.term, (TT_PLUS, TT_MINUS))

    def computation_percent(self):
        """Parse ``<int> % [<rows>] <ident>`` (after ``compute``)."""
        nr_percent = self.current_tok

        self.advance()

        if self.current_tok.type != TT_PERCENT:
            print('ERRRO % pls')
        tok = self.current_tok
        self.advance()

        range_ = self.range_rows_percent()

        if self.current_tok.type != TT_INDENTIFIER:
            print('Expected ident')
        ident = self.current_tok
        self.advance()

        return ComputationNode(PercentMethodNode(tok, nr_percent), ident, range_)

    def range_rows_percent(self):
        """Parse an optional ``[<int>]`` or ``[<int> to <int>]`` row range.

        Returns None when the current token is not ``[``; otherwise a
        ``RowsAssignNode`` wrapping a ``PercentNode``.
        """
        if self.current_tok.type != TT_LBRACKET:
            return None
        self.advance()

        if self.current_tok.type != TT_INT:
            print('Expected int')

        range1 = self.current_tok
        range2 = None
        tok = None
        self.advance()

        if self.current_tok.matches(TT_KEYWORD, 'to'):
            tok = self.current_tok

            self.advance()
            if self.current_tok.type != TT_INT:
                print('Expected int')
            range2 = self.current_tok
            self.advance()

        if self.current_tok.type != TT_RBRACKET:
            print("Close")

        self.advance()

        if self.current_tok.type != TT_INDENTIFIER:
            print('Expected ident')

        return RowsAssignNode(PercentNode(range1, tok, range2))

    def computation(self):
        """Parse ``<method> on <ident> [<cols>] [each <int> rows]`` (after ``compute``).

        Returns a ``ComputationNode``, or None when the method keyword or the
        ``on`` keyword is missing.
        """
        is_method = any(
            self.current_tok.matches(TT_KEYWORD, method)
            for method in self.COMPUTE_METHODS
        )
        if not is_method:
            print("ERR")
            return None

        computation_method = self.current_tok
        self.advance()

        if not self.current_tok.matches(TT_KEYWORD, 'on'):
            print("eerr")
            return None
        self.advance()

        if self.current_tok.type != TT_INDENTIFIER:
            print ("EXPECTED Dataset!!!!!!")

        var_name = self.current_tok
        self.advance()

        # Use the (dataset, columns) pair when a column list follows,
        # otherwise the bare dataset token.
        target = self.check_compute_cols(var_name)
        if target is None:
            target = var_name

        return ComputationNode(computation_method, target, self.check_computation_details())

    def check_compute_cols(self, var_name):
        """Parse an optional ``[<ident>, ...]`` column list.

        Returns None, consuming nothing, unless the current token is ``[`` and
        the token after it is an identifier. Otherwise returns
        ``VarAssignNode(var_name, ColsAssignNode(columns))``.
        """
        if self.current_tok.type != TT_LBRACKET:
            return None

        # Look ahead before consuming '[': a bracket that opens something else
        # (for example ``[each 3 rows]``) must stay in place for
        # check_computation_details to parse.
        next_tok = self._peek()
        if next_tok is None or next_tok.type != TT_INDENTIFIER:
            return None

        self.advance()
        first_col = self.current_tok
        self.advance()
        columns = [first_col]

        self.string_list(columns)

        if self.current_tok.type != TT_RBRACKET:
            print( "EXPECTED ]")

        self.advance()

        return VarAssignNode(var_name, ColsAssignNode(columns))

    def check_computation_details(self):
        """Parse an optional ``[each <int> rows]`` clause.

        Returns None when the current token is not ``[``, a
        ``ComputationDetailsNode`` on success, and None after printing a
        message when the clause is malformed.
        """
        if self.current_tok.type != TT_LBRACKET:
            return None
        self.advance()

        if self.current_tok.matches(TT_KEYWORD, 'each'):
            tok1 = self.current_tok
            self.advance()

            if self.current_tok.type != TT_INT:
                print('Expected int')
            int_ = self.current_tok
            self.advance()

            if self.current_tok.matches(TT_KEYWORD, 'rows'):
                tok2 = self.current_tok
                self.advance()
                if self.current_tok.type != TT_RBRACKET:
                    print('CLOSE THE DOOORS')

                self.advance()

                return ComputationDetailsNode(tok1, int_, tok2)

        print('ERORR in providing the details')
        return None

    def string_list(self, temp_list):
        """Append every ``, <token>`` that follows to ``temp_list`` in place."""
        while self.current_tok.type == TT_COMMA:
            self.advance()
            if self.current_tok.type != TT_INDENTIFIER:
                print ("EXPECTED IDENT!!!!!")
            temp_list.append(self.current_tok)

            self.advance()

    def data_extraction(self):
        """Parse ``[<ident>, ...] from <photo>`` (after ``extract``).

        A single name yields ``VarAssignNode(name, photo)``. With several
        names the last one is paired with the photo token and each earlier
        name wraps the result in a ``BinOpNode`` whose operator is the photo
        token.
        """
        if self.current_tok.type != TT_LBRACKET:
            print("LEFT BRACKET ?????")
        self.advance()

        if self.current_tok.type != TT_INDENTIFIER:
            print ("EXPECTED IDENT!!!!!")

        extract_var_name = self.current_tok
        self.advance()
        names = [extract_var_name]

        self.string_list(names)

        if self.current_tok.type != TT_RBRACKET:
            print( "EXPECTED ]")

        self.advance()

        if not self.current_tok.matches(TT_KEYWORD, 'from'):
            print("WHERE FROM ???????????")

        self.advance()

        if self.current_tok.type != TT_PHOTO:
            print ("PLEASE SPECIFY THE PHOTO WITH FORMAT!!!!!!")

        photo_tok = self.current_tok
        self.advance()

        # Build from the last name outwards; with one name the loop is skipped.
        node = VarAssignNode(names[-1], photo_tok)
        for name_tok in reversed(names[:-1]):
            node = BinOpNode(name_tok, photo_tok, node)
        return node

    def bin_op(self, func, ops):
        """Parse a left-associative chain ``func (op func)*``.

        Args:
            func: zero-argument callable that parses one operand.
            ops:  token types accepted as operators.

        The operator stored in each ``BinOpNode`` is the token's type string.
        """
        left = func()

        while self.current_tok.type in ops:
            op_tok = self.current_tok.type
            self.advance()
            right = func()
            left = BinOpNode(left, op_tok, right)

        return left
            
        

In [3]:
# Read the program to run; the context manager closes the file handle.
with open("test.txt", "r") as source_file:
    text = source_file.read()
#text = 'VAR m = compute mode on b'

result, error = run(text)
# Surface lexer errors instead of silently leaving `result` as None.
if error is not None:
    print(error.as_string())
#print(result)

[KEYWORD : VAR, INDENTIFIER : b, EQ, KEYWORD : compute, KEYWORD : mean, KEYWORD : on, INDENTIFIER : a, LBRACKET, INDENTIFIER : Name, COMMA, INDENTIFIER : Age, RBRACKET, NEWLINE, KEYWORD : VAR, INDENTIFIER : m, EQ, KEYWORD : compute, KEYWORD : max, KEYWORD : on, INDENTIFIER : a, LBRACKET, INDENTIFIER : Name, RBRACKET, LBRACKET, KEYWORD : each, TT_INT : 3, KEYWORD : rows, RBRACKET, NEWLINE, KEYWORD : VAR, INDENTIFIER : m, EQ, KEYWORD : compute, KEYWORD : mode, KEYWORD : on, INDENTIFIER : b, NEWLINE, KEYWORD : VAR, INDENTIFIER : n, EQ, KEYWORD : compute, TT_INT : 30, %, LBRACKET, TT_INT : 2, KEYWORD : to, TT_INT : 4, RBRACKET, INDENTIFIER : a, NEWLINE, KEYWORD : VAR, INDENTIFIER : n, EQ, KEYWORD : compute, TT_INT : 54, %, LBRACKET, TT_INT : 2, RBRACKET, INDENTIFIER : a, NEWLINE, KEYWORD : VAR, INDENTIFIER : n, EQ, KEYWORD : compute, TT_INT : 9, %, INDENTIFIER : a, NEWLINE, EOF]
(INDENTIFIER : b, ((INDENTIFIER : a, (COLUMNS, [INDENTIFIER : Name, INDENTIFIER : Age])), KEYWORD : mean))
(INDE

In [5]:
# Display the list of parsed statements returned by run() in the previous cell.
result

[(INDENTIFIER : b, ((INDENTIFIER : a, (COLUMNS, [INDENTIFIER : Name, INDENTIFIER : Age])), KEYWORD : mean)),
 (INDENTIFIER : m, (((INDENTIFIER : a, (COLUMNS, [INDENTIFIER : Name])), (KEYWORD : each TT_INT : 3 KEYWORD : rows)), KEYWORD : max)),
 (INDENTIFIER : m, (INDENTIFIER : b, KEYWORD : mode)),
 (INDENTIFIER : n, ((INDENTIFIER : a, (ROWS, (TT_INT : 2, KEYWORD : to,TT_INT : 4))), (%, TT_INT : 30))),
 (INDENTIFIER : n, ((INDENTIFIER : a, (ROWS, TT_INT : 2)), (%, TT_INT : 54))),
 (INDENTIFIER : n, (INDENTIFIER : a, (%, TT_INT : 9)))]