## Initial imports

In [7]:

import gym
import torch
import random
from collections import defaultdict, deque
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque

import neptune
import numpy as np
import copy
from pprint import pprint
import math
from tqdm.autonotebook import tqdm
from torch.utils.data import DataLoader
from torchvision import transforms
from joblib import Parallel, delayed
import gc
import gym_joinemio
from gym_joinemio.envs.player import RandomPlayer

Neptune prep

In [None]:
import neptune.new as neptune
import os

# Neptune project identifier and the API token read from the environment.
# os.environ.get returns None when JOINEMIO_TOKEN is not set.
proj = 'jmolais/joinemio'
token = os.environ.get('JOINEMIO_TOKEN')

# Open the tracking run used by the rest of the notebook.
run = neptune.init(api_token=token, project=proj)

In [10]:
# Build the shared environment instance under the id 'joinemio-v0'.
# NOTE(review): presumably the id is registered as a side effect of
# `import gym_joinemio` above - confirm.
env = gym.make('joinemio-v0')

In [11]:
# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


## Replay memory

In [12]:
# One recorded step: the state, the action taken, the resulting state,
# and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Bounded store of Transition records.

    The records live in a deque created with ``maxlen=capacity``, so once
    the buffer is full every new push drops the oldest record.
    """

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap ``args`` in a Transition and append it to the buffer."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement.

        ``random.sample`` raises ValueError when ``batch_size`` exceeds the
        number of stored transitions.
        """
        stored = self.memory
        return random.sample(stored, batch_size)  # TODO: Batch size delete

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

[DEBUG][client|bravado.client] executeOperations({'experimentId': '15968353-066b-4838-8b31-b9cb927329b3', 'operations': [{'path': 'monitoring/stdout', 'logStrings': {'entries': [{'value': 'cpu', 'step': None, 'timestampMilliseconds': 1628002001980}, {'value': '\n', 'step': None, 'timestampMilliseconds': 1628002001983}]}}], '_request_options': {'connect_timeout': 30, 'timeout': None}})
[DEBUG][connectionpool|urllib3.connectionpool] https://app.neptune.ai:443 "POST /api/leaderboard/v1/attributes/operations?experimentId=15968353-066b-4838-8b31-b9cb927329b3 HTTP/1.1" 200 None


## Training: 
Random vs Random. Deep Q-Learning. Params:

- `n_episodes` (int): maximum number of training episodes
- `max_t` (int): maximum number of timesteps per episode _// Not used: these episodes are short, and we prefer games to run to completion_
- `eps_start` (float): starting value of epsilon, for epsilon-greedy action selection
- `eps_end` (float): minimum value of epsilon
- `eps_decay` (float): multiplicative factor (per episode) for decreasing epsilon

In [None]:
# Hyperparameters consumed by dqn() in the next cell.
BUFFER_SIZE = 200    # capacity passed to ReplayMemory inside dqn()
EPISODES = 10        # default for dqn(n_episodes=...)
EPS_START = 1.0      # default for dqn(eps_start=...): initial epsilon
EPS_END = 0.01       # default for dqn(eps_end=...): lower bound for epsilon
EPS_DECAY = 0.996    # default for dqn(eps_decay=...): multiplicative decay factor

In [None]:
# The two module-level players that dqn() puts into its `players` list.
# Both are RandomPlayer instances from gym_joinemio.
agent1 = RandomPlayer() # Player1
agent2 = RandomPlayer() # Player2

def dqn(n_episodes=EPISODES, eps_start=EPS_START, eps_end=EPS_END, eps_decay=EPS_DECAY):
    """Play games between ``agent1`` and ``agent2`` and record transitions.

    Each episode resets the module-level ``env`` and alternates the two
    players until the game reports ``GameState.finished``. Transitions for
    the second player (index 1) are pushed into a local ReplayMemory.

    Args:
        n_episodes (int): maximum number of episodes (games) to play.
        eps_start (float): starting value of epsilon.
        eps_end (float): minimum value of epsilon.
        eps_decay (float): multiplicative factor applied to epsilon once
            per episode.

    Returns:
        list: the accumulated score of each episode, one entry per episode.

    NOTE(review): epsilon is decayed but not yet used for action selection,
    and the replay buffer is local to this call and is not returned.
    """
    memory_buffer = ReplayMemory(BUFFER_SIZE)
    players = [agent1, agent2]

    scores = []                         # one total score per episode
    scores_window = deque(maxlen=100)   # last 100 episode scores
    eps = eps_start

    for i_episode in range(1, n_episodes + 1):
        print(f"=== EPISODE {i_episode} ===")
        # A single reset per episode; the observation is kept on
        # env.observation_space exactly as the original loop did.
        env.observation_space = env.reset()
        score = 0

        while not env.game.game_state == gym_joinemio.envs.board.GameState.finished:
            current_player = env.game.current_player - 1
            action = players[current_player].get_action(env.observation_space)
            # Snapshot the grid before and after the move. Without a copy,
            # both names may alias one object if the env mutates the grid
            # in place (TODO confirm against gym_joinemio).
            state1 = copy.deepcopy(env.game.board.grid)
            env.observation_space, reward, done, info = env.step(action)
            state2 = copy.deepcopy(env.game.board.grid)

            # Recording data:
            if current_player == 1:  # (?) Do we do it for both players?
                memory_buffer.push(state1, action, state2, reward.value)

            # The reward of every step is accumulated, whichever player moved.
            score += reward.value
            env.recording.append((current_player + 1, action, env.rewarder()))

        print(f"--------\nWinner: {env.game.winner}\n")

        # Per-episode bookkeeping: these were previously updated on every
        # step, which made `scores` a per-step list.
        scores.append(score)
        scores_window.append(score)
        eps = max(eps * eps_decay, eps_end)

        average_score = np.mean(scores_window)
        print('Episode {}\tAverage Score {:.2f}\n'.format(i_episode, average_score), end="")
        if average_score >= 200.0:
            # Clamp so runs shorter than 100 episodes do not report a
            # negative episode count.
            print('\nEnvironment solve in {:d} epsiodes!\tAverage score: {:.2f}'.format(
                max(i_episode - 100, 0), average_score))
            # The original referenced an undefined `agent`. Checkpoint the
            # first player that actually owns a Q-network, if there is one.
            learner = next((p for p in players if hasattr(p, "qnetwork_local")), None)
            if learner is not None:
                torch.save(learner.qnetwork_local.state_dict(), 'checkpoint.pth')
            break  # leaves the episode loop: training stops once solved

    return scores

# Run the loop with the default hyperparameters and keep the returned score list.
scores= dqn()

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected network: 42 (6*7) inputs -> 7 outputs.

    Three hidden Linear(42, 42) layers, each followed by ReLU, feed a
    final Linear(42, 7) layer whose output also passes through ReLU.
    """

    def __init__(self):
        super().__init__()
        n_cells = 6 * 7
        # nn.ReLU is the activation module; `nn.relU` does not exist and
        # raised AttributeError as soon as the network was constructed.
        self.main_layers = nn.Sequential(
            nn.Linear(n_cells, n_cells), nn.ReLU(),
            nn.Linear(n_cells, n_cells), nn.ReLU(),
            nn.Linear(n_cells, n_cells), nn.ReLU(),
        )
        self.output_layer = nn.Linear(n_cells, 7)

    def forward(self, board_flatten_state):
        """Return the 7 output values for a flattened board tensor.

        Args:
            board_flatten_state (torch.Tensor): float tensor whose last
                dimension has 42 elements.

        Returns:
            torch.Tensor: non-negative tensor whose last dimension is 7.
        """
        hidden = self.main_layers(board_flatten_state)
        # NOTE(review): the final ReLU is kept from the original design; it
        # clamps every output to >= 0, so negative values cannot be expressed.
        weights = F.relu(self.output_layer(hidden))
        return weights

class AIPlayer:
    """Player that picks, among the playable columns, the one the network scores highest."""

    @staticmethod
    def possible_moves(board_state):
        """Return the column indices whose cell in the first row equals 0.

        NOTE(review): assumes row 0 is the top of the board and 0 marks an
        empty cell - confirm against gym_joinemio.
        """
        return [col for col, cell in enumerate(board_state[0]) if cell == 0]

    def __init__(self):
        self.net = NeuralNetwork()

    def get_action(self, board_state):
        """Return the playable column with the highest network output.

        Args:
            board_state: 2-D board (nested sequence, array or tensor).

        Returns:
            int: index of the chosen column. Ties go to the lowest index.

        Raises:
            ValueError: if no column is playable.
        """
        legal_cols = self.possible_moves(board_state)
        if not legal_cols:
            raise ValueError("no playable column: the first row of the board is full")

        # The network expects a flat float tensor, not the raw 2-D board.
        flat_board = torch.as_tensor(board_state, dtype=torch.float32).reshape(-1)
        with torch.no_grad():  # inference only, no gradient bookkeeping needed
            weights = self.net(flat_board)

        # Compare playable columns only. The previous loop seeded the search
        # with column 0, so it could return column 0 even when it was full.
        return max(legal_cols, key=lambda col: weights[col].item())

    def train(self, memory_buffer, batch_size):
        """Placeholder: training is not implemented yet; always returns 0."""
        return 0

Evaluation