In [25]:
import torch
from torch import nn
import random,numpy
import numpy as np
from copy import deepcopy
from game import Game, Move, RandomPlayer, MyPlayer

In [26]:
class QuixoNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(5*5, 100),
            nn.ReLU(),
            nn.Linear(100, 100),
            nn.ReLU(),
            nn.Linear(100, 44),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

## Training the agent

In [27]:
## Constants definition
num_iterations = 500
num_matches = 10
max_dim_replay_buff = 10_000
time_to_update = 100
gamma = 0.1

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")


Using cpu device


In [28]:
class ReplayBuffer:    
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size        
        self.buffer = []
        self.position = 0
        
    def add_experience(self, experience):        
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(experience)        
        else:
            self.buffer[self.position] = experience        
            self.position = (self.position + 1) % self.buffer_size
            
    def sample_batch(self, batch_size):
        batch_indices = np.random.choice(len(self.buffer), batch_size, replace=True)
        batch = [self.buffer[i] for i in batch_indices]
        return batch

In [29]:
from game import Player
from tqdm import tqdm


def training(player1: 'Player', player2: 'Player'):
    loss_tot = []
    flag=1
    replay_buff = ReplayBuffer(max_dim_replay_buff) # replay buffer, from which we sample for BATCH learning
    # inizializza i modelli -> passali per parametro
    
    for step in tqdm(range(num_iterations), desc="Training Iterations"):  
        # we fill the replay buffer with experiences made with matches
        for match in range(num_matches):
            # lo uso solo in inferenza
            # gioca una intera partita qui dentro e per ogni mossa, metti l'experience dentro il ReplayBuffer
            # ...
            # ... qui descrivere come viene effettivamente usata la rete neurale (magari impacchettarla in qualche funzione)
            winner = -1
            g = Game()
            #player1 = MyPlayer()
            #player2 = RandomPlayer()
            players = [player1, player2]
            
            while winner < 0:
                g.current_player_idx += 1
                g.current_player_idx %= len(players)
                #print(g.current_player_idx)
                prev_state=deepcopy(g)
                from_pos, slide = players[g.current_player_idx].make_move(g)
                   
                if g.current_player_idx==0:
                    ##l'azione è il valore del q-value quindi un numero? o la posizione in questo caso  
                    ##in quel caso replay_row=(prev_state.get_board,GeneratorNet((prev_state.get_board),g.get_board,reward)
                    reward=g.compute_reward(from_pos, slide)
                    replay_row=(prev_state.get_board, player1.last_action_value, deepcopy(g), reward)
                    replay_buff.add_experience(replay_row)
                    
                if g.check_winner() != -1:
                    break
            
            # replay_buffer.push(...) per ogni transizione di stato osservata ()
            # restituisci il risultato sotto forma di tupla (st, a, st+1, r) - (State, action, next_state, reward)
            
        # Now we sample a batch of data from the ReplayBuffer in order to train the Agent
        batch_to_train = replay_buff.sample_batch(50)
      
        TargetNet_targets = []
        GeneratorNet_outputs = []
        player1.myplayer_zero_grad()
        # Now we need to compute the list of targets (made with the TargetNet) and the one with the Q values for the "current" state
        
        for element in batch_to_train:
            _, action_val, new_state, reward = element
            
            # per ogni tupla di 4 elementi prendere l'action (a) (sarà un valore...q value)
            # Rappresenta il primo termine della Loss function (MSE) => output
            # inserirlo dentro GeneratorNet_outputs
            GeneratorNet_outputs.append(action_val)

            # Per calcolare il secondo termine, devo prendere la reward dalla tupla, gamma dai parametri (vedi sopra) e il q value dell'azione migliore dello stato successivo
            # Rappresenta il secondo termine della Loss function (MSE) => target
            # Per calcolare il target, dare in pasto alla NN lo stato+1 contenuto nella tupla element
            # inserirlo dentro TargetNet_targets
            max_action_newstate = player1.compute_target(new_state)
         
            res = reward + gamma*max_action_newstate
            TargetNet_targets.append(res)
        GeneratorNet_outputs =torch.tensor(GeneratorNet_outputs, dtype=torch.float32,requires_grad=True).to(player1.device)
        TargetNet_targets = torch.tensor(TargetNet_targets, dtype=torch.float32,requires_grad=True).to(player1.device)
       
        
        if flag==1:
          
            # print(player1.GeneratorNet.state_dict())  
             print(player1.TargetNet.state_dict()) 
             flag=0
        loss_curr = player1.myplayer_loss_and_update_params(GeneratorNet_outputs, TargetNet_targets) 
       

        loss_tot.append(loss_curr)
        
        if (step % time_to_update) == 0:
            # update the parameter of the TargetNet
            player1.copy_params_TargetNet()
            
       
        #printa come varia 
    print("suca")
    #print(player1.GeneratorNet.state_dict())        
    print(player1.TargetNet.state_dict()) 
    return player1.GeneratorNet.state_dict()
            
      

In [30]:
player1 = MyPlayer()
player2 = RandomPlayer()
trained_model_params = training(player1, player2)
TrainedGeneratorNet = QuixoNet.load_state_dict(trained_model_params) # TRAINED NETWORK to use in INFERENCE PHASE

Training Iterations:   0%|          | 0/500 [00:00<?, ?it/s]

OrderedDict([('linear_relu_stack.0.weight', tensor([[-0.0407,  0.0842, -0.1713,  ...,  0.0183,  0.1666,  0.1921],
        [-0.1550,  0.0843, -0.1830,  ..., -0.0606,  0.0536,  0.1799],
        [-0.1144, -0.1517, -0.1941,  ...,  0.1954,  0.1809, -0.0376],
        ...,
        [ 0.0420, -0.0828,  0.1090,  ..., -0.0485,  0.1563, -0.1397],
        [-0.0472, -0.1840,  0.1153,  ..., -0.1894,  0.1209,  0.1178],
        [ 0.1365,  0.0763, -0.1559,  ...,  0.0941, -0.0328,  0.1177]])), ('linear_relu_stack.0.bias', tensor([-0.0854,  0.1289,  0.0651, -0.0808,  0.1080,  0.1945,  0.1485,  0.0887,
        -0.1669, -0.0744,  0.1381, -0.0841, -0.0822, -0.1427, -0.1954,  0.1152,
        -0.1179, -0.1415,  0.0290, -0.1054, -0.1948,  0.0588,  0.1986,  0.0526,
        -0.1941, -0.0325, -0.0437,  0.1245, -0.0415, -0.1567, -0.1098, -0.0935,
        -0.0637, -0.0373, -0.1815, -0.0030, -0.1112, -0.1717,  0.0632,  0.0011,
         0.0066, -0.1318,  0.1561,  0.0235,  0.1410,  0.1182, -0.0191,  0.0809,
         0.

Training Iterations:  98%|█████████▊| 490/500 [00:31<00:00, 13.25it/s]

## Inference

In [None]:
wins = 0
losts = 0
draws = 0
num_match_test = 1000

for _ in num_match_test:
    # play_match(TrainedGeneratedNet, random)
    # increment wins/losts/draws 
    print("miao")

print(f"Accuracy: {(wins/num_match_test)*100}")
print("Wins: {wins} - Losts: {losts} - Draws {draws}")

