In [1]:
import torch
from torch import nn
import random, numpy
import numpy as np
from copy import deepcopy
from game import Game, Move, RandomPlayer, MyPlayer,translate_number_to_position_direction,translate_number_to_position,TrainedPlayer
import torch.nn.init as init

In [2]:
class Genome:
    def __init__(self, ):
        self.genotype = create_rand_gen()
        self.fitness = float("-inf")

cell_max_values = [
    (1, 8),    # num_matches
    (100, 1_000),  # max_dim_replay_buff
    (2, 80),   # time_to_update
    (0.2, 1.0),   # gamma
    (50, 500)    # batch_size
]

def create_rand_gen(): # creates the genome, referred to the scale of the values we'd like to estimate    
    # Define the maximum values for each cell
    # Generate a random vector
    genome = [random.uniform(cell[0], cell[1]) if isinstance(cell[0], float) else random.randint(cell[0], cell[1]) for cell in cell_max_values]
    genome[4] = min(genome[1], genome[4]) # max_dim_replay_buff > batch_size
    return genome


In [3]:
class ReplayBuffer:     
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size        
        self.buffer = []
        self.position = 0
        
    def add_experience(self, experience):        
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(experience)        
        else:
            self.buffer[self.position] = experience        
            self.position = (self.position + 1) % self.buffer_size
            
    def sample_batch(self, batch_size):
        batch_indices = np.random.choice(len(self.buffer), batch_size, replace=True)
        batch = [self.buffer[i] for i in batch_indices]
        return batch

In [4]:
from game import Player
from tqdm import tqdm


def training(individual: 'Genome', starts_first: bool):
    num_iterations = 100
    num_matches = individual.genotype[0]
    max_dim_replay_buff = individual.genotype[1]
    time_to_update = individual.genotype[2]
    gamma = individual.genotype[3]
    batch_size = individual.genotype[4]
    agent_to_train = MyPlayer()
    opponent = RandomPlayer()

    loss_tot = []
    
    taboo_set = set()
    replay_buff = ReplayBuffer(max_dim_replay_buff) # replay buffer, from which we sample for BATCH learning
    torch.set_grad_enabled(True)
    
    for step in range(num_iterations):  
        # we fill the replay buffer with experiences made with matches
        for match in range(num_matches):
            winner = -1
            g = Game()
            g.current_player_idx = int(starts_first)
            players = [agent_to_train, opponent]

            go = False # needed in order to compute the next state
            while winner < 0:
                    g.current_player_idx += 1
                    g.current_player_idx %= len(players)
                    prev_state=deepcopy(g)
                    from_pos, slide = players[g.current_player_idx].make_move(g)
                    g._Game__move(from_pos, slide, g.current_player_idx)

                    
                    if g.current_player_idx==0:
                        reward=g.compute_reward()
                        go = True
                        
                    elif go and g.current_player_idx==1:
                        if (tuple(prev_state.get_flat_board()), agent_to_train.last_action_number, tuple(deepcopy(g).get_flat_board())) not in taboo_set :
                            replay_row=(prev_state.get_flat_board(), agent_to_train.last_action_number, deepcopy(g), reward, reward==1)
                            taboo_set.add((tuple(prev_state.get_flat_board()),agent_to_train.last_action_number, tuple(deepcopy(g).get_flat_board())))
                            replay_buff.add_experience(replay_row)

                    if g.check_winner() != -1:
                        break
            
        # Now we sample a batch of data from the ReplayBuffer in order to train the Agent
        batch_to_train = replay_buff.sample_batch(batch_size)
      
        #divide the batch
        state_batch, action_num, next_state_batch, reward, done = zip(*batch_to_train) 

        #forward the Generator
        q_values = agent_to_train.GeneratorNet(torch.tensor(state_batch, dtype=torch.float32)).to(agent_to_train.device)
        q_values_target = torch.zeros(batch_size, 44).to(agent_to_train.device)


        #update q_values target by using Bellman Equation  
        for i in range(batch_size): 
            if done[i] == False:
                q_values_target[i, action_num[i]] = reward[i] + gamma * torch.tensor(agent_to_train.compute_target(next_state_batch[i])).to(agent_to_train.device).item()
            else:
               q_values_target[i, action_num[i]] = reward[i] 
          
        agent_to_train.optimizer.zero_grad()
        loss_curr=agent_to_train.criterion(q_values,q_values_target).to(agent_to_train.device)
        loss_curr.backward()
    
        agent_to_train.optimizer.step()

        loss_tot.append(loss_curr)
      
        if (step % time_to_update) == 0:
            # update the parameter of the TargetNet
            agent_to_train.copy_params_TargetNet()
            
    return  agent_to_train.GeneratorNet.state_dict()
            
      

In [5]:
POPULATION_SIZE = 10
OFFSPRING_SIZE = 4
LOCI = 5
MUTATION_PROBABILITY = 0.5
BIT_MUTATION_PROBABILITY = 0.5
NUM_GENERATION = 20

In [6]:
from random import randint
from copy import deepcopy

import tqdm


def new_offspring(parent1: 'Genome', parent2: 'Genome') -> 'Genome':
    if random.random() < MUTATION_PROBABILITY:
        return mutate(parent1)
    return three_cut_xover(parent1, parent2)
    
def mutate(parent: 'Genome') -> 'Genome':
    new_offspring = deepcopy(parent)

    for _ in range(LOCI):
        if random.random() < BIT_MUTATION_PROBABILITY:
            index_to_mutate = randint(0, LOCI-1)
            new_offspring.genotype[index_to_mutate] = random.uniform(cell_max_values[index_to_mutate][0], cell_max_values[index_to_mutate][1]) if isinstance(cell_max_values[index_to_mutate][0], float) else random.randint(cell_max_values[index_to_mutate][0], cell_max_values[index_to_mutate][1])
            new_offspring.genotype[4] = min(new_offspring.genotype[1], new_offspring.genotype[4]) # max_dim_replay_buff > batch_size
    
    return new_offspring

def three_cut_xover(ind1: 'Genome', ind2: 'Genome') -> 'Genome':
    one_cut_point = randint(0, int((LOCI)*0.3))
    two_cut_point = randint(int((LOCI)*0.3), int((LOCI)*0.6))
    three_cut_point = randint(int((LOCI)*0.6), LOCI - 1)
  
    # Order the cut points
    cut_points = sorted([one_cut_point, two_cut_point, three_cut_point])
    
    new_ind = Genome()
    new_ind.genotype = (ind1.genotype[:cut_points[0]] +
                        ind2.genotype[cut_points[0]:cut_points[1]] +
                        ind1.genotype[cut_points[1]:cut_points[2]] +
                        ind2.genotype[cut_points[2]:])
    
    new_ind.genotype[4] = min(new_ind.genotype[1], new_ind.genotype[4]) 
    
    assert len(new_ind.genotype) == LOCI
    return new_ind

def compute_fitness(individual: 'Genome', starts_first: bool):
    TrainedGeneratorNet = TrainedPlayer()
    trained_weights = training(individual, starts_first)
    TrainedGeneratorNet.GeneratorNet.load_state_dict(trained_weights) # TRAINED NETWORK to use in INFERENCE PHASE

    wins = 0
    num_match_test = 100

    for step in range(num_match_test):
        player=RandomPlayer()
        g=Game()
        g.current_player_idx = int(starts_first)
        winner=g.play(TrainedGeneratorNet, player)
        
        if winner==0:
            wins+=1

    individual.fitness = (wins/num_match_test)*100
    

In [7]:

def genetic_algorithm(starts_first: bool):
        population = [Genome() for _ in range(POPULATION_SIZE)]
        for i in range(OFFSPRING_SIZE):
                compute_fitness(population[i], starts_first)

        population.sort(key=lambda i: i.fitness, reverse=True)
        best_fitness = population[0].fitness
        best_individual = population[0]
        gen = 0

        while  gen < NUM_GENERATION:                
                for _ in range(OFFSPRING_SIZE):
                        offspring = new_offspring(population[0], population[1])
                        compute_fitness(offspring, starts_first)

                population.extend([offspring])
                population.sort(key=lambda i: i.fitness, reverse=True)
                population = population[:POPULATION_SIZE]
                best_individual = population[0]
                best_fitness = population[0].fitness
                gen += 1
                print(f"Best individual (until now): {best_individual.genotype}")
                print(f"Best fitness: {best_fitness}")


        print(f"Best individual with fitness: {best_fitness}")
        
        return best_individual



In [8]:
BEST_INDIVIDUAL_FIRST = genetic_algorithm(True)
BEST_INDIVIDUAL_SECOND = genetic_algorithm(False)

  q_values = agent_to_train.GeneratorNet(torch.tensor(state_batch, dtype=torch.float32)).to(agent_to_train.device)


Best individual (until now): [1, 364, 77, 0.8617795894984199, 350]
Best fitness: 71.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 211]
Best fitness: 83.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 350]
Best fitness: 89.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 350]
Best fitness: 89.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 350]
Best fitness: 89.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 348]
Best fitness: 93.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 348]
Best fitness: 93.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 348]
Best fitness: 93.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 348]
Best fitness: 93.0
Best individual (until now): [1, 378, 77, 0.5948568911764831, 348]
Best fitness: 93.0
Best individual (until now): [1, 378, 39, 0.5948568911764831, 348]
Best fitness: 95.0
Best individual (until now): [1, 378, 39, 0.5948568911

In [9]:
print(f"Best individual (starts first turn of the game): {BEST_INDIVIDUAL_FIRST.genotype}")
print(f"Best individual (starts second turn of the game): {BEST_INDIVIDUAL_SECOND.genotype}")

Best individual (starts first turn of the game): [1, 378, 39, 0.5948568911764831, 348]
Best individual (starts second turn of the game): [6, 589, 27, 0.28003986636230477, 409]
