## Initial imports

In [98]:

import time
import gym
import torch
import random
from collections import defaultdict, deque
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque

import neptune
import numpy as np
import copy
from pprint import pprint
import math
from tqdm.autonotebook import tqdm
from torch.utils.data import DataLoader
from torchvision import transforms
from joblib import Parallel, delayed
import gc
import gym_joinemio
from gym_joinemio.envs.player import RandomPlayer

Neptune prep

In [99]:
# import neptune.new as neptune
# import os

# proj = 'jmolais/joinemio'
# token = os.getenv('JOINEMIO_TOKEN')
# run = neptune.init(project=proj,
#                    api_token=token)

In [100]:
env = gym.make('joinemio-v0')

In [101]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Replay memory

In [102]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size) # TODO: Batch size delete

    def __len__(self):
        return len(self.memory)

## Training: 
Random vs Random. Deep Q-Learning. Params:

- `n_episodes` (int): maximum number of training epsiodes
- `max_t` (int): maximum number of timesteps per episode _// Not used, because these episodes don't take too long and we like when game's are finished_
- `eps_start` (float): starting value of epsilon, for epsilon-greedy action selection
- `eps_end` (float): minimum value of epsilon 
- `eps_decay` (float): mutiplicative factor (per episode) for decreasing epsilon

In [103]:
# agent1 = RandomPlayer() # Player1
# agent2 = RandomPlayer() # Player2

# def dqn(n_episodes= EPISODES, eps_start=EPS_START, eps_end = EPS_END, eps_decay=EPS_DECAY):
#     memory_buffer = ReplayMemory(BUFFER_SIZE)
    
#     scores = []                         # list containing score from each episode
#     scores_window = deque(maxlen=100)   # last 100 scores
#     eps = eps_start

#     for i_episode in range(1, n_episodes+1):
#         print(f"=== EPISODE {i_episode} ===")
#         state = env.reset()
#         score = 0
        
#         # # OPT A:
#         # one_game = env.play_one_game(agent1, agent2, each_step_render=False)
#         # for e in env.recording: print(e)

#         # OPT B:
#         players = [agent1, agent2]
#         env.observation_space = env.reset()
#         action = None
        
#         while not env.game.game_state == gym_joinemio.envs.board.GameState.finished:
#             current_player = env.game.current_player - 1
#             action = players[current_player].get_action(env.observation_space)
#             state1 = env.game.board.grid
#             env.observation_space, reward, done, info = env.step(action)
#             state2 = env.game.board.grid

#             # Recording data:
#             if (current_player == 1): # (?) Do we do it for both players?
#                 memory_buffer.push(state1, action, state2, reward.value) 

#             score +=  reward.value
#             scores.append(score)
#             scores_window.append(score)
#             env.recording.append((current_player + 1, action, env.rewarder()))
            
#             # Analyzing scores
#             eps = max(eps*eps_decay,eps_end)
#             print('Episode {}\tAverage Score {:.2f}\n'.format(i_episode,np.mean(scores_window)), end="")
#             if np.mean(scores_window)>=200.0:
#                 print('\nEnvironment solve in {:d} epsiodes!\tAverage score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
#                 torch.save(agent.qnetwork_local.state_dict(),'checkpoint.pth')
#                 break
        
#         print(f"--------\nWinner: {env.game.winner}\n")
#         # return env.observation_space, reward, done, info  # reward for player1
#     return scores

# scores = dqn()

In [104]:
BUFFER_SIZE = 200
EPISODES = 10
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.996
EPS_DECAY_LAST_FRAME = 10**5
LEARNING_RATE = 1e-4

MEAN_REWARD_BOUND = 20 # TODO evaluate proper default val


# GAMMA = 0.99
# BATCH_SIZE = 32
# REPLAY_SIZE = 10000
# LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000
REPLAY_START_SIZE = 10000

# EPSILON_DECAY_LAST_FRAME = 10**5
# EPSILON_START = 1.0

In [105]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()

        self.main_layers = nn.Sequential(
            nn.Linear(6*7, 6*7),
            nn.ReLU(),
            nn.Linear(6*7, 6*7),
            nn.ReLU(),
            nn.Linear(6*7, 6*7),
            nn.ReLU()
        )
        self.output_layer = nn.Linear(6*7,7)

    def forward(self, board_flatten_state):
        # x1 = torch.tensor(input.astype(float)).type(torch.FloatTensor)
        # x = torch.flatten(x1)
        # for layer in self.net:
        #     x = layer(x)
        weights = nn.relU(self.output_layer(self.main_layers(board_flatten_state)))
        return weights

class AIPlayer:
    @staticmethod
    def possible_moves(board_state):
        available_cols = []
        for i in range(len(board_state[0])):
            if board_state[0][i] == 0:
                available_cols.append(i)
        return available_cols

    def __init__(self, env, replay_memory):
        # self.net = NeuralNetwork()
        self.env = env
        self.replay_memory = replay_memory
        self._reset()

    def _reset(self):
        self.state =  env.reset()
        self.total_reward = 0.0

    def get_action(self, board_state):
        weigths = self.net.forward(board_state)
        pos_nums = self.possible_moves(board_state)
        max_num = 0
        for col in pos_nums:
            if weigths[max_num] < weigths[int(col)]:
                max_num = int(col)
        return max_num

    def play_step(self, net, epsilon, device):
        done_reward = None
        if np.random.random() < epsilon:
            grid = env.game.board.grid
            action = random.choice(RandomPlayer.possible_moves(grid)) # TODO: Clean it up
        else: 
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())  # TODO: Later check if it returns correct action int range.

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward.value

        # exp = Transition(self.state, action, new_state, reward)
        # self.replay_memory.push(exp)
        self.replay_memory.push(self.state, action, new_state, reward)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

    def train(self, memory_buffer, batch_size):
        return 0

def our_main():
    env = gym.make('joinemio-v0')
    net = NeuralNetwork().to(device)
    tgt_net = NeuralNetwork().to(device)
    print(net)
    buffer = ReplayMemory(BUFFER_SIZE)
    agent = AIPlayer(env, buffer) # TODO: params? Whatever agent is...
    epsilon = EPS_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    while True:
        frame_idx += 1
        epsilon = max(EPS_END, EPS_START - frame_idx / EPS_DECAY_LAST_FRAME)

        reward = agent.play_step(net, epsilon, device=device)
        if (reward is not None):
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print(f'done: {frame_idx}, games: {len(total_rewards)}, mean reward: {mean_reward}, eps: {epsilon}, speed: {speed}')
            # TODO: # Neptun logging (write epsilon, speed, reward_100, reward)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), "joinemio-best.dat") # TODO: Extract
                if best_mean_reward is not None:
                    print(f"Best mean reward updated {best_mean_reward}->{mean_reward}; model saved")
                best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print(f"Solved in {frame_idx} frames")
                break
        
        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        # loss_t = calc_loss(batch, net, tgt_net, device=device) # TODO: calc_loss method
        optimizer.step()

our_main()

    

SyntaxError: unmatched ')' (<ipython-input-105-fe7d7b0c2f18>, line 109)