# Symbolic Regression Implementation

In [None]:
import numpy as np
from icecream import ic
from gxgp.node import Node as GXNode
from gxgp.draw import draw
from typing import Optional, List, Dict, Any

# Load the regression problem from a NumPy archive; it provides the arrays 'x' and 'y'.
data = np.load('../data/problem_6.npz')  
x_validation = data['x'] 
y_validation = data['y']

# The first axis of x indexes the input variables (the evaluation code reads
# x[idx, :] for variable idx), so one symbolic name "x<i>" is created per row.
num_vars = x_validation.shape[0]
VARIABLES = [f"x{i}" for i in range(num_vars)]


## Tree Node Definition

In [None]:
import random

class Node:
    """Expression-tree node: a payload plus an ordered list of child nodes."""

    def __init__(self, value, children=None):
        """
        Store the payload and the children of this node.

        value may be:
            - a float (constant);
            - a string (a variable name such as 'x0', 'x1', ...);
            - a Python callable (an operator applied to the children).
        When children is omitted, a fresh empty list is used.
        """
        self.value = value
        if children is None:
            children = []
        self.children = children

    def is_leaf(self):
        """Return True when the node has no children."""
        return not len(self.children)

    def __str__(self):
        """Render the tree in prefix form: '(value child child ...)'."""
        if not self.is_leaf():
            rendered_children = ' '.join(str(child) for child in self.children)
            return f"({self.value} {rendered_children})"
        return str(self.value)

## Tree Node Evaluation

In [None]:
def evaluate_children(node: Node, x: np.ndarray) -> np.ndarray:
    """
    Evaluate the expression tree rooted at `node` on every sample of `x`.

    Args:
        node: root of the (sub)tree to evaluate.
        x: input matrix indexed as x[variable_index, sample_index].

    Returns:
        One value per sample (length x.shape[1]) for constant and variable
        leaves; for operator nodes, whatever the operator returns when applied
        to its children's values.

    Raises:
        ValueError: on an unknown variable name, an unsupported leaf value,
            an operator with more than two children, or an operator failure.
    """
    if node.is_leaf():
        val = node.value

        # Constants are broadcast to one value per sample.
        if isinstance(val, (int, float)):
            return np.full(x.shape[1], float(val))

        # Variables: a single lookup both validates the name and yields its row.
        # (Previously a membership pre-check made the "Unknown Variable" error
        # unreachable and scanned VARIABLES twice.)
        if isinstance(val, str):
            try:
                idx = VARIABLES.index(val)
            except ValueError:
                raise ValueError(f"Unknown Variable: {val}") from None
            return x[idx, :]

        raise ValueError(f"Unknown leaf value: {val}")

    op = node.value
    children_values = [evaluate_children(child, x) for child in node.children]

    # Dispatch on the number of children: only unary and binary operators exist.
    if len(children_values) == 1:
        return safe_apply_unary(op, children_values[0])
    if len(children_values) == 2:
        return safe_apply_binary(op, children_values[0], children_values[1])
    raise ValueError("Children number not supported")

def safe_apply_unary(func, arr):
    """
    Apply the unary operator `func` to `arr`.

    Any exception raised by `func` is converted into a ValueError that names
    the operator; the original exception is kept as the explicit cause so the
    real failure stays visible in tracebacks.

    Raises:
        ValueError: if `func(arr)` raises.
    """
    try:
        return func(arr)
    except Exception as e:
        raise ValueError(f"Unary operator {func} failed with error: {e}") from e

def safe_apply_binary(func, arr1, arr2):
    """
    Apply the binary operator `func` to `arr1` and `arr2`.

    Any exception raised by `func` is converted into a ValueError that names
    the operator; the original exception is kept as the explicit cause so the
    real failure stays visible in tracebacks.

    Raises:
        ValueError: if `func(arr1, arr2)` raises.
    """
    try:
        return func(arr1, arr2)
    except Exception as e:
        raise ValueError(f"Binary operator {func} failed with error: {e}") from e


# Default depth used when generating random trees (also used for the subtrees
# created by mutation).
DEPTH_MAX = 6
# Bounds of the uniform range from which random constant leaves are drawn.
CONST_MAX = 10
CONST_MIN = -10

def sin_fn(x):
    """Element-wise sine."""
    return np.sin(x)


def cos_fn(x):
    """Element-wise cosine."""
    return np.cos(x)


def neg_fn(x):
    """Arithmetic negation."""
    return -x


def abs_fn(x):
    """Element-wise absolute value."""
    return np.abs(x)
def log_safe(x):
    """
    Protected natural logarithm: 0.0 where x <= 0, log(x) elsewhere.

    np.where evaluates both branches, so np.log is still computed on the
    non-positive entries before they are masked out. The resulting
    divide/invalid floating-point warnings are silenced locally because those
    values are never returned.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(x <= 0, 0.0, np.log(x))
def sqrt_safe(x):
    """Protected square root: the root of the absolute value of x."""
    magnitude = np.abs(x)
    return np.sqrt(magnitude)


def exp_safe(x):
    """Protected exponential: the argument is clipped to [-700, 700] first."""
    bounded = np.clip(x, -700, 700)
    return np.exp(bounded)

def add_fn(a, b):
    """Sum of the two operands."""
    return a + b


def sub_fn(a, b):
    """Difference: first operand minus second."""
    return a - b


def mul_fn(a, b):
    """Product of the two operands."""
    return a * b
def div_safe(a, b):
    """
    Protected division: 1.0 where |b| < 1e-12, a / b elsewhere.

    np.where evaluates both branches, so a / b is still computed for the
    (near-)zero denominators before they are masked out. The resulting
    divide/invalid floating-point warnings are silenced locally because those
    quotients are never returned.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(np.abs(b) < 1e-12, 1.0, a / b)

def create_function_set_with_weights() -> List[Dict[str, Any]]:
    """
    Build the weighted primitive set used to grow expression trees.

    Returns a list of dictionaries, binary operators first and unary operators
    after them. Each dictionary has the keys 'function' (the callable),
    'arity' (1 or 2), 'symbol' (display string) and 'weight' (relative
    sampling weight).
    """
    binary_specs = (
        (add_fn, '+', 1.0),
        (sub_fn, '-', 1.0),
        (mul_fn, '*', 1.0),
        (div_safe, '/', 0.7),
    )
    unary_specs = (
        (sin_fn, 'sin', 0.6),
        (cos_fn, 'cos', 0.6),
        (neg_fn, 'neg', 0.8),
        (abs_fn, 'abs', 0.9),
        (log_safe, 'log', 0.5),
        (sqrt_safe, 'sqrt', 0.6),
        (exp_safe, 'exp', 0.4),
    )

    entries = [
        {'function': fn, 'arity': 2, 'symbol': symbol, 'weight': weight}
        for fn, symbol, weight in binary_specs
    ]
    entries += [
        {'function': fn, 'arity': 1, 'symbol': symbol, 'weight': weight}
        for fn, symbol, weight in unary_specs
    ]
    return entries


## Tree Generation

In [None]:
# Full primitive set, then split by arity. Entries with a non-positive weight
# are dropped so they are never offered to the weighted sampling below.
FUNCTION_SET = create_function_set_with_weights()
UNARY_FUNCS = [f for f in FUNCTION_SET if f['arity'] == 1 and f['weight'] > 0.0]
BINARY_FUNCS = [f for f in FUNCTION_SET if f['arity'] == 2 and f['weight'] > 0.0]

def generate_subtree(
    depth: int,
    force_var: Optional[str] = None,  
    constant_min=CONST_MIN,
    constant_max=CONST_MAX
) -> Node:
    """
    Recursively grow a random subtree with `depth` operator levels.

    When `force_var` is given it is handed down to the first child at every
    level, so the first-child chain ends in a leaf holding that variable.
    Other leaves are a random variable or a uniform constant in
    [constant_min, constant_max], each with probability 0.5.
    Operator nodes are unary with probability 0.3 and binary otherwise; the
    operator itself is drawn according to the weights of the function set.
    """
    # Terminal level: produce a leaf.
    if depth <= 0:
        if force_var is not None:
            leaf_value = force_var
        elif random.random() < 0.5 and VARIABLES:
            leaf_value = random.choice(VARIABLES)
        else:
            leaf_value = random.uniform(constant_min, constant_max)
        return Node(leaf_value)

    # Decide the arity first, then sample an operator of that arity by weight.
    pick_unary = bool(random.random() < 0.3 and UNARY_FUNCS)
    candidates = UNARY_FUNCS if pick_unary else BINARY_FUNCS
    spec = random.choices(
        candidates,
        weights=[entry['weight'] for entry in candidates],
        k=1
    )[0]

    # The first operand inherits the forced variable; the second one does not.
    operands = [generate_subtree(depth - 1, force_var, constant_min, constant_max)]
    if not pick_unary:
        operands.append(generate_subtree(depth - 1, None, constant_min, constant_max))
    return Node(spec['function'], operands)

def generate_random_tree(
        depth=DEPTH_MAX, 
        constant_min=CONST_MIN, 
        constant_max=CONST_MAX
) -> Node:
    """
    Generate a random tree that is guaranteed to contain one variable leaf.

    The guaranteed variable is the only one available when there is a single
    variable, and a randomly chosen one otherwise; it is passed to
    generate_subtree as `force_var`.

    Raises:
        ValueError: if no variable is defined.
    """
    if not VARIABLES:
        raise ValueError("There is no variable available.")

    anchor = VARIABLES[0] if len(VARIABLES) == 1 else random.choice(VARIABLES)
    return generate_subtree(
        depth,
        force_var=anchor,
        constant_min=constant_min,
        constant_max=constant_max,
    )

## Expression String Representation

In [None]:
# Maps id(callable) -> symbol used when printing or drawing a tree.
#
# The protected primitives (log_safe, sqrt_safe, exp_safe, div_safe) are
# deliberately NOT re-declared here. Re-binding those names to new lambdas
# created objects with different ids from the ones stored in FUNCTION_SET, so
# no operator of a generated tree was ever found in this map and trees were
# printed with raw function names ('add_fn', 'log_safe', ...).
DISPLAY_NAME_MAP = {
    # NumPy ufuncs, kept so that trees built directly from them still render.
    id(np.add): '+',
    id(np.subtract): '-',
    id(np.multiply): '*',
    id(np.sin): 'sin',
    id(np.cos): 'cos',
    id(np.negative): 'neg',
    id(np.abs): 'abs',
}
# The callables actually placed in generated trees come from FUNCTION_SET;
# register each of them under the symbol declared there.
DISPLAY_NAME_MAP.update(
    {id(spec['function']): spec['symbol'] for spec in FUNCTION_SET}
)

def tree_to_string(node: Node) -> str:
    """
    Return an infix string representation of the tree rooted at `node`.

    Numeric leaves are printed with three decimals and variable leaves with
    their full name. Unary operators are rendered as 'op(child)' and binary
    operators as '(left op right)'. The operator label comes from
    DISPLAY_NAME_MAP, falling back to the callable's __name__.

    Raises:
        ValueError: if an operator node has more than two children.
    """
    if node.is_leaf():
        val = node.value
        if isinstance(val, (float, int)) and not isinstance(val, bool):
            return f"{float(val):.3f}"
        if isinstance(val, str):
            # Use the whole name: taking only val[1] turned 'x10' into 'x1'
            # and raised IndexError on one-character names.
            return val
        return str(val)

    op = node.value
    op_name = DISPLAY_NAME_MAP.get(id(op), getattr(op, '__name__', str(op)))
    child_strs = [tree_to_string(child) for child in node.children]
    if len(child_strs) == 1:
        return f"{op_name}({child_strs[0]})"
    if len(child_strs) == 2:
        return f"({child_strs[0]} {op_name} {child_strs[1]})"
    raise ValueError("Unsupported number of children")

## Conversion Function


In [None]:
def make_var_func(var_name: str):
    """
    Build a callable that looks up `var_name` among its keyword arguments.

    The returned function carries `var_name` as its __name__ and raises
    KeyError when called without that keyword.
    """
    def var_func(**bindings):
        return bindings[var_name]

    setattr(var_func, '__name__', var_name)
    return var_func

def make_const_func(const_val: float):
    """
    Build a callable that ignores its keyword arguments and returns `const_val`.

    The returned function's __name__ is the constant formatted with three
    decimals.
    """
    def const_func(**_unused):
        return const_val

    const_func.__name__ = format(const_val, ".3f")
    return const_func

def convert_to_gxgp_node(my_node, subtree=None) -> GXNode:
    """
    It converts a Node object (and, recursively, its whole subtree) to a GXNode object.

    Args:
        my_node: root of the tree to convert.
        subtree: optional set that receives every GXNode created during the
            conversion. A fresh set is used when omitted; it is not returned,
            so pass your own set if you need the collected nodes.

    Returns:
        The GXNode corresponding to `my_node`.

    NOTE(review): this relies on GXNode accepting (callable, children, name=...)
    and on GXNode instances being hashable -- confirm against gxgp.node.
    """
    if subtree is None:
        subtree = set()

    if my_node.is_leaf():
        val = my_node.value
        if isinstance(val, str):
            # Variable leaf: wrap the name in a function that reads it from kwargs.
            var_func = make_var_func(val)
            gx_node = GXNode(var_func, [], name=val)      
        else:
            # Any non-string leaf is treated as a numeric constant.
            const_func = make_const_func(float(val))
            gx_node = GXNode(const_func, [], name=f"{float(val):.3f}") 
        subtree.add(gx_node)
        return gx_node
    
    
    # Operator node: convert the children first, then wrap the operator itself.
    op = my_node.value
    converted_children = []
    # Display label: mapped symbol if known, otherwise the callable's __name__.
    op_name = DISPLAY_NAME_MAP.get(id(op), getattr(op, '__name__', str(op)))
    for child in my_node.children if my_node.children is not None else []:
        converted_child = convert_to_gxgp_node(child, subtree)
        converted_children.append(converted_child)

    gx_node = GXNode(op, converted_children, name=op_name)
    subtree.add(gx_node)
    return gx_node



## Fitness Function

In [None]:
def fitness(individual: Node, x: np.ndarray, y: np.ndarray, penalty_factor: float = 0.001) -> float:
    """
    Return the mean squared error of `individual` on (x, y); lower is better.

    Individuals whose evaluation fails or yields any non-finite prediction
    receive the sentinel score 1e10.

    Args:
        individual: expression tree to score.
        x: input matrix passed to evaluate_children.
        y: target values.
        penalty_factor: accepted for backward compatibility but currently has
            no effect on the returned score.

    NOTE(review): a size penalty (penalty_factor * number of nodes) used to be
    computed here and then thrown away. That dead full-tree traversal has been
    removed from this hot path. If parsimony pressure is actually wanted, add
    the penalty to the returned value deliberately -- reported "MSE" figures
    would then include it.
    """
    try:
        y_pred = evaluate_children(individual, x)
        if not np.all(np.isfinite(y_pred)):
            return 1e10
        return np.mean((y - y_pred) ** 2)
    except Exception:
        # Deliberate best-effort: any evaluation failure marks the individual
        # as unfit instead of aborting the evolutionary run.
        return 1e10

## Selection Function

In [None]:
def tournament_selection(population, x, y, k=3):
    """
    Pick k random individuals and return the one with the lowest fitness.

    When the population holds fewer than k individuals, all of them compete.
    """
    def score(candidate):
        return fitness(candidate, x, y)

    pool = random.sample(population, k) if len(population) >= k else population
    return min(pool, key=score)


## Crossover Function

In [None]:
def crossover(parent1: Node, parent2: Node) -> Node:
    """
    Subtree crossover: return a copy of `parent1` in which one randomly chosen
    internal node is replaced by a copy of a randomly chosen internal node of
    `parent2`.

    Neither parent is modified. If either parent has no internal node (it is
    a single leaf), an unchanged copy of `parent1` is returned.

    Only the grafted subtree of `parent2` is copied. Deep-copying the whole
    second parent and writing the swapped-out subtree back into that discarded
    copy was wasted work; the offspring is identical.
    """
    child = clone_tree(parent1)

    targets = [node for node in get_all_nodes(child) if not node.is_leaf()]
    donors = [node for node in get_all_nodes(parent2) if not node.is_leaf()]

    if targets and donors:
        target = random.choice(targets)
        graft = clone_tree(random.choice(donors))
        # Overwrite the target in place so its parent keeps pointing at it.
        target.value, target.children = graft.value, graft.children

    return child


def get_random_node(tree: Node) -> Node:
    """Pick one node of the tree uniformly at random (leaves included)."""
    return random.choice(get_all_nodes(tree))

def get_all_nodes(tree: Node) -> list:
    """Collect every node of the tree in pre-order (node first, then its children left to right)."""
    ordered = []
    pending = [tree]
    while pending:
        current = pending.pop()
        ordered.append(current)
        # Push children reversed so the leftmost child is visited next.
        pending.extend(reversed(current.children))
    return ordered

def clone_tree(node: Node) -> Node:
    """Return a structural copy of the tree: new Node objects, same payload values."""
    copied_children = [clone_tree(child) for child in node.children]
    return Node(node.value, copied_children)


## Mutation Functions

In [None]:
def mutation(individual: Node, mutation_rate=0.4) -> Node:
    """
    Return a copy of `individual`, mutated with probability `mutation_rate`.

    A mutation is either a subtree mutation (a random internal node is
    overwritten by a freshly generated random tree) or a point mutation of a
    random leaf (set to a random variable or a random constant). When the
    tree has internal nodes, the two kinds are equally likely.
    """
    mutant = clone_tree(individual)
    if not random.random() < mutation_rate:
        return mutant

    # Split the nodes into operators and terminals in a single pass.
    internal_nodes, leaf_nodes = [], []
    for candidate in get_all_nodes(mutant):
        bucket = leaf_nodes if candidate.is_leaf() else internal_nodes
        bucket.append(candidate)

    if internal_nodes and (not leaf_nodes or random.random() < 0.5):
        target = random.choice(internal_nodes)
        replacement = generate_random_tree(depth=DEPTH_MAX, constant_min=CONST_MIN, constant_max=CONST_MAX)
        target.value = replacement.value
        target.children = replacement.children
    elif leaf_nodes:
        target = random.choice(leaf_nodes)
        if random.random() < 0.5:
            target.value = random.choice(VARIABLES)
        else:
            target.value = random.uniform(CONST_MIN, CONST_MAX)
            target.children = []

    return mutant

def hoist_mutation(individual: Node) -> Node:
    """
    Hoist mutation: return a copy of a randomly chosen operator-rooted subtree.

    The root itself is among the candidates. A tree made of a single leaf has
    no candidate and is simply copied.
    """
    candidates = [node for node in get_all_nodes(individual) if not node.is_leaf()]
    chosen = random.choice(candidates) if candidates else individual
    return clone_tree(chosen)

## Tree Depth Control

In [None]:
def enforce_max_depth(node: Node, max_depth: int = 3, current_depth: int = 0):
    """
    Truncate the tree in place so that no node lies deeper than `max_depth`.

    Any operator node found at depth `max_depth` is replaced by a random
    terminal: one of the variables, or a single freshly drawn constant, all
    equally likely. Nodes that are already leaves are left untouched --
    overwriting them does not reduce the depth and would randomly destroy
    valid terminals every time the function is applied.

    Args:
        node: root of the (sub)tree to process; modified in place.
        max_depth: greatest depth allowed, the root being at depth 0.
        current_depth: depth of `node`; used by the recursion.
    """
    if current_depth >= max_depth:
        if node.is_leaf():
            return
        if VARIABLES:
            node.value = random.choice(VARIABLES + [random.uniform(CONST_MIN, CONST_MAX)])
        else:
            node.value = random.uniform(CONST_MIN, CONST_MAX)
        node.children = []
    else:
        for c in node.children:
            enforce_max_depth(c, max_depth, current_depth + 1)


## Genetic Programming Algorithm

In [None]:
def _tournament_from_scores(scored_pop, k=3):
    """Return the fittest of k randomly drawn (individual, fitness) pairs, using the cached fitness."""
    contenders = scored_pop if len(scored_pop) < k else random.sample(scored_pop, k)
    return min(contenders, key=lambda pair: pair[1])[0]


def run_genetic_programming(x: np.ndarray, y: np.ndarray,
                            population_size=50000,
                            generations=200,
                            elite_size=2,
                            max_depth=6):
    """
    Run the genetic programming loop and return the best expression found.

    The initial population is half shallow trees (depth 2) and half deeper
    trees (depth 4). In every generation the population is scored and sorted,
    the `elite_size` best individuals are carried over unchanged, and the rest
    is filled with offspring obtained by tournament selection (k=3),
    crossover, and then either hoist mutation (10% of the time) or regular
    mutation. Offspring are truncated to `max_depth`.

    Args:
        x: input matrix passed to the fitness function.
        y: target values.
        population_size: number of individuals per generation.
        generations: number of generations to run.
        elite_size: number of top individuals copied to the next generation.
        max_depth: maximum tree depth enforced on every individual.

    Returns:
        (best_overall, best_fitness, hall_of_fame) where hall_of_fame holds
        one (tree copy, fitness) pair per generation.
    """
    population = []
    half_pop = population_size // 2
    for init_depth, count in ((2, half_pop), (4, population_size - half_pop)):
        for _ in range(count):
            tree = generate_random_tree(depth=init_depth)
            enforce_max_depth(tree, max_depth=max_depth)
            population.append(tree)

    best_overall = None
    best_fitness = float('inf')

    hall_of_fame = []

    for g in range(generations):
        # Score every individual exactly once per generation.
        scored_pop = [(ind, fitness(ind, x, y)) for ind in population]
        scored_pop.sort(key=lambda pair: pair[1])

        best_current, best_current_fit = scored_pop[0]

        if best_current_fit < best_fitness:
            best_overall = clone_tree(best_current)
            best_fitness = best_current_fit

        best_str = tree_to_string(best_current)
        print(f"[Gen {g}] Best MSE: {best_current_fit:.40f} => {best_str}")

        hall_of_fame.append((clone_tree(best_current), best_current_fit))

        # Elitism: the best individuals survive unchanged.
        new_population = [ind for ind, _ in scored_pop[:elite_size]]

        while len(new_population) < population_size:
            # Tournaments reuse the scores computed above. Re-evaluating the
            # fitness of every contender cost about six extra evaluations per
            # offspring without changing which individual wins.
            p1 = _tournament_from_scores(scored_pop, k=3)
            p2 = _tournament_from_scores(scored_pop, k=3)
            offspring = crossover(p1, p2)
            if random.random() < 0.10:
                offspring = hoist_mutation(offspring)
            else:
                offspring = mutation(offspring)
            enforce_max_depth(offspring, max_depth=max_depth)
            new_population.append(offspring)

        population = new_population

    return best_overall, best_fitness, hall_of_fame


## Training

In [None]:
# Number of samples: the second axis of the loaded input matrix.
N = x_validation.shape[1]
print("\nTraining:\n")
# Train on a random 10% subset of the loaded data.
TRAIN_SIZE = N // 10
print(f"TRAIN_SIZE = {TRAIN_SIZE}")

# Sample indices without replacement (unseeded, so the subset differs between runs).
train_indexes = np.random.choice(N, size=TRAIN_SIZE, replace=False)

x_train = x_validation[:, train_indexes]
y_train = y_validation[train_indexes]

# Hyper-parameter grid: every combination below triggers a full GP run.
population_sizes = [1000, 10_000, 50_000]
generations_list = [100, 200]
elite_sizes = [2, 4]

for pop_size in population_sizes:
    for gens in generations_list:
        for elite in elite_sizes:
            print(f"\nTraining with population_size={pop_size}, generations={gens}, elite_size={elite}")
            # NOTE: these result variables are overwritten on every iteration;
            # only the last configuration's results remain after the loops.
            best_individual_training, best_fit_training, hall_of_fame_training = run_genetic_programming(
                x_train, y_train,
                population_size=pop_size,
                generations=gens,
                elite_size=elite,
                max_depth=6
            )


            expr_str_training = tree_to_string(best_individual_training)
            print(f"\nResults: pop={pop_size}, gen={gens}, elite={elite}")
            print(f"\nBest expression found = {expr_str_training}, MSE = {best_fit_training}")

            # Convert to the gxgp representation and pass it to draw() together
            # with a file name that encodes the configuration.
            gx_best_individual_training = convert_to_gxgp_node(best_individual_training)
            print("Final Expression Tree (GP on training set):")
            file_name = f"tree_pop{pop_size}_gen{gens}_elite{elite}.png"
            draw(gx_best_individual_training, file_name)



## Test

In [None]:
print("Test:\n")
# NOTE(review): this does not evaluate the expression found during training.
# It starts a new GP run from scratch on the complete loaded dataset
# (x_validation / y_validation); max_depth is left at its default.
best_individual, best_fit, hall_of_fame = run_genetic_programming(
    x_validation, y_validation,
    population_size=50_000,
    generations=200,
    elite_size=4
)
# Prefix-form dump via Node.__str__, followed by the infix rendering.
print(best_individual)
expr_str = tree_to_string(best_individual)
print(f"\nBest expression found = {expr_str}, MSE = {best_fit}")

gx_best_individual = convert_to_gxgp_node(best_individual)
print("Final Expression Tree (GP on test set):")
# Unlike the training cell, no file name is passed to draw() here.
draw(gx_best_individual)
