## Initial imports

In [1]:

import time
import gym
import torch
import random
from collections import defaultdict, deque
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple, deque

import numpy as np
import copy
from pprint import pprint
import math
from tqdm.autonotebook import tqdm
from torch.utils.data import DataLoader
from torchvision import transforms
from joblib import Parallel, delayed
import gc
import gym_joinemio
from gym_joinemio.envs.player import RandomPlayer
from gym_joinemio.envs.connect_four_env import Reward

Neptune prep

In [2]:
# import neptune
# import neptune.new as neptune
# import os

# proj = 'jmolais/joinemio'
# token = os.getenv('JOINEMIO_TOKEN')
# run = neptune.init(project=proj,
#                    api_token=token)

In [3]:
# Module-level environment built from the registered 'joinemio-v0' id.
# NOTE(review): our_main() and AIPlayer._reset() each create their own
# environment via gym.make, so this instance looks unused by the active
# cells -- confirm before removing.
env = gym.make('joinemio-v0')

In [4]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
# (To force CPU execution, assign torch.device("cpu") here instead.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


## Replay memory

In [5]:
# One stored experience: the state before the move, the move itself,
# the resulting state and the reward that was received.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory:
    """Bounded FIFO store of Transition records.

    Once `capacity` records are held, appending a new one drops the oldest
    (behaviour of `collections.deque` with `maxlen`).
    """

    def __init__(self, capacity):
        # An empty deque that never grows beyond `capacity` entries.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a Transition from the positional arguments and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises ValueError (from `random.sample`) when `batch_size` is larger
        than the number of stored transitions.
        """
        # TODO: Batch size delete
        drawn = random.sample(self.memory, batch_size)
        return drawn

    def __len__(self):
        """Number of transitions currently stored."""
        stored = self.memory
        return len(stored)

In [6]:
# agent1 = RandomPlayer() # Player1
# agent2 = RandomPlayer() # Player2

# def dqn(n_episodes= EPISODES, eps_start=EPS_START, eps_end = EPS_END, eps_decay=EPS_DECAY):
#     memory_buffer = ReplayMemory(BUFFER_SIZE)
    
#     scores = []                         # list containing score from each episode
#     scores_window = deque(maxlen=100)   # last 100 scores
#     eps = eps_start

#     for i_episode in range(1, n_episodes+1):
#         print(f"=== EPISODE {i_episode} ===")
#         state = env.reset()
#         score = 0
        
#         # # OPT A:
#         # one_game = env.play_one_game(agent1, agent2, each_step_render=False)
#         # for e in env.recording: print(e)

#         # OPT B:
#         players = [agent1, agent2]
#         env.observation_space = env.reset()
#         action = None
        
#         while not env.game.game_state == gym_joinemio.envs.board.GameState.finished:
#             current_player = env.game.current_player - 1
#             action = players[current_player].get_action(env.observation_space)
#             state1 = env.game.board.grid
#             env.observation_space, reward, done, info = env.step(action)
#             state2 = env.game.board.grid

#             # Recording data:
#             if (current_player == 1): # (?) Do we do it for both players?
#                 memory_buffer.push(state1, action, state2, reward.value) 

#             score +=  reward.value
#             scores.append(score)
#             scores_window.append(score)
#             env.recording.append((current_player + 1, action, env.rewarder()))
            
#             # Analyzing scores
#             eps = max(eps*eps_decay,eps_end)
#             print('Episode {}\tAverage Score {:.2f}\n'.format(i_episode,np.mean(scores_window)), end="")
#             if np.mean(scores_window)>=200.0:
#                 print('\nEnvironment solve in {:d} epsiodes!\tAverage score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
#                 torch.save(agent.qnetwork_local.state_dict(),'checkpoint.pth')
#                 break
        
#         print(f"--------\nWinner: {env.game.winner}\n")
#         # return env.observation_space, reward, done, info  # reward for player1
#     return scores

# scores = dqn()

## Training parameters: 
Random vs Random. Deep Q-Learning. Params:

- `n_episodes` (int): maximum number of training episodes
- `max_t` (int): maximum number of timesteps per episode _// Not used, because episodes are short and we prefer games to run to completion_
- `eps_start` (float): starting value of epsilon, for epsilon-greedy action selection
- `eps_end` (float): minimum value of epsilon
- `eps_decay` (float): multiplicative factor (per episode) for decreasing epsilon

In [7]:
# --- Replay buffer ---------------------------------------------------------
BUFFER_SIZE = 200                 # capacity passed to ReplayMemory in our_main()
BATCH_SIZE = 32                   # transitions sampled per optimisation step (read by our_main())
# Number of stored transitions required before optimisation starts.
# Capped at BUFFER_SIZE: the bounded buffer can never hold more than that,
# so a larger threshold would keep the training loop in its warm-up phase forever.
REPLAY_START_SIZE = min(10000, BUFFER_SIZE)

# --- Exploration (epsilon-greedy) -----------------------------------------
EPS_START = 1.0                   # initial probability of playing a random move
EPS_END = 0.01                    # lower bound for epsilon
EPS_DECAY_LAST_FRAME = 10**5      # our_main() decays epsilon linearly by frame_idx / this value
# EPISODES and EPS_DECAY are only referenced by the commented-out dqn() draft.
EPISODES = 10
EPS_DECAY = 0.996

# --- Optimisation ----------------------------------------------------------
LEARNING_RATE = 1e-4              # Adam learning rate
SYNC_TARGET_FRAMES = 1000         # copy net -> tgt_net every this many frames

# --- Stop criterion --------------------------------------------------------
MEAN_REWARD_BOUND = 0.8 # TODO evaluate proper default val

# Not used yet (kept for the planned loss computation):
# GAMMA = 0.99
# REPLAY_SIZE = 10000

# EPSILON_DECAY_LAST_FRAME = 10**5
# EPSILON_START = 1.0

## NeuralNetwork

In [8]:
from gym_joinemio.envs.board import Board, GameState


class NeuralNetwork(nn.Module):
    """Fully connected network mapping a flattened board to one score per column.

    Architecture: three `Linear(n_inputs, n_inputs)` + ReLU blocks followed by
    a linear output layer with `n_actions` units (no activation on the output).

    Args:
        n_inputs: length of the flattened board vector. Defaults to 6 * 7.
        n_actions: number of output scores (one per column). Defaults to 7.

    The sub-module names (`main_layers`, `output_layer`) are unchanged, so
    state dicts saved from the default configuration remain loadable.
    """

    def __init__(self, n_inputs=6 * 7, n_actions=7):
        super().__init__()

        self.main_layers = nn.Sequential(
            nn.Linear(n_inputs, n_inputs),
            nn.ReLU(),
            nn.Linear(n_inputs, n_inputs),
            nn.ReLU(),
            nn.Linear(n_inputs, n_inputs),
            nn.ReLU(),
        )
        self.output_layer = nn.Linear(n_inputs, n_actions)

    def forward(self, board_flatten_state):
        """Return the raw (unbounded) per-column scores for the given input.

        Args:
            board_flatten_state: float tensor whose last dimension has
                `n_inputs` elements.

        Returns:
            Tensor with the last dimension replaced by `n_actions` scores.
        """
        hidden = self.main_layers(board_flatten_state)
        return self.output_layer(hidden)

## AIPlayer

In [9]:
class AIPlayer:
    """Epsilon-greedy agent that plays on a 'joinemio-v0' environment.

    Every step taken through `play_step` is recorded in the supplied replay
    memory as (state, action, next_state, reward).
    """

    @staticmethod
    def possible_moves(board_state):
        """Return the column indices whose top cell (row 0) equals 0."""
        return [col for col, cell in enumerate(board_state[0]) if cell == 0]

    def __init__(self, env, replay_memory, net=None):
        """Create the agent.

        Args:
            env: environment instance; `opponent_action_set` is called on it.
                Note that `_reset()` replaces `self.env` with a fresh
                environment right afterwards.
            replay_memory: object exposing `push(state, action, next_state, reward)`.
            net: optional network used by `get_action`. `play_step` receives
                its network as an argument and does not need this.
        """
        self.net = net
        self.env = env
        # NOTE(review): the notebook's recorded run failed with
        # "AttributeError: 'ConnectFourEnv' object has no attribute
        # 'opponent_action_set'" -- confirm the installed gym_joinemio
        # version provides this method.
        self.env.opponent_action_set(RandomPlayer.get_action)
        self.replay_memory = replay_memory
        self._reset()

    def _reset(self):
        """Start a new game on a freshly created environment and zero the running reward."""
        self.env = gym.make('joinemio-v0')
        self.state = self.env.reset()
        self.env.opponent_action_set(RandomPlayer.get_action)
        # NOTE(review): the game object is replaced after reset(); this
        # presumably yields an empty board matching self.state -- confirm.
        self.env.game = gym_joinemio.envs.board.Game()
        self.total_reward = 0.0

    def get_action(self, board_state):
        """Return the legal column with the highest network score.

        Ties are resolved in favour of the lowest column index. When no
        column is legal, 0 is returned (as the previous implementation did).

        Raises:
            AttributeError: if the agent was created without a network.
        """
        if getattr(self, "net", None) is None:
            raise AttributeError(
                "AIPlayer.get_action requires a network; pass net=... to AIPlayer()"
            )
        state_v = torch.as_tensor(np.asarray(board_state), dtype=torch.float32).flatten()
        with torch.no_grad():  # inference only, no graph needed
            weights = self.net(state_v)
        legal_cols = self.possible_moves(board_state)
        if not legal_cols:
            return 0
        # Only legal columns compete, so a full column can never be selected
        # (the old loop started from column 0 even when it was full).
        best_col = max(legal_cols, key=lambda col: weights[int(col)].item())
        return int(best_col)

    def play_step(self, net, epsilon, device):
        """Play one environment step using an epsilon-greedy policy.

        With probability `epsilon` the move comes from `RandomPlayer.get_action`;
        otherwise it is the legal column with the highest score from `net`.

        Args:
            net: callable mapping a flattened float state tensor to per-column scores.
            epsilon: probability of taking a random move.
            device: torch device on which the state tensor is created.

        Returns:
            `(total_reward, last_reward)` when the game finished on this step
            (the agent is reset afterwards), otherwise `(None, None)`.
        """
        if np.random.random() < epsilon:
            print("*RANDOM PLAYER PLAYED*")
            grid = self.env.game.board.grid
            action = RandomPlayer.get_action(grid)
        else:
            print("*AI PLAYER PLAYED*")
            # Convert straight to float32: the former uint8 cast with
            # copy=False raises on NumPy 2 when a conversion is required and
            # would wrap any negative cell value.
            state_v = torch.as_tensor(
                np.asarray(self.state), dtype=torch.float32, device=device
            ).flatten()
            with torch.no_grad():  # action selection must not build a graph
                q_vals_v = net(state_v)
            legal_cols = self.possible_moves(self.env.game.board.grid)
            blocked_cols = [col for col in range(Board.columns) if col not in legal_cols]
            if blocked_cols:
                # -inf (not -1) so a blocked column can never beat a legal one,
                # whatever the range of the predicted values.
                q_vals_v[blocked_cols] = float("-inf")
            action = int(torch.argmax(q_vals_v, dim=0).item())

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward.value

        # NOTE(review): the reward object itself is stored here, while the
        # running total uses reward.value -- confirm which form the future
        # loss computation expects.
        self.replay_memory.push(self.state, action, new_state, reward)
        self.state = new_state

        if is_done == GameState.finished:
            done_reward = self.total_reward
            print(f"WINNER: player {self.env.game.winner} | Final reward reading: {reward}")
            print(self.env.game.board.grid)
            self._reset()
            return (done_reward, reward)
        return (None, None)

    def train(self, memory_buffer, batch_size):
        """Placeholder: training is not implemented yet; always returns 0."""
        return 0


## Training loop:

In [10]:
def our_main(batch_size=32, replay_start_size=None):
    """Run the epsilon-greedy training loop until the stop criterion is met.

    Each iteration plays one step with `AIPlayer.play_step`, logs finished
    games, saves the network whenever the mean reward of the last 100 games
    improves, and stops once that mean exceeds MEAN_REWARD_BOUND after more
    than 10000 frames.

    Args:
        batch_size: number of transitions sampled from the replay buffer per
            optimisation step.
        replay_start_size: number of stored transitions required before
            optimisation starts. Defaults to REPLAY_START_SIZE. The effective
            threshold is capped at BUFFER_SIZE (the buffer cannot hold more)
            and is never below `batch_size` (sampling needs that many items).

    Raises:
        ValueError: if `batch_size` exceeds BUFFER_SIZE, because such a batch
            could never be sampled.
    """
    if batch_size > BUFFER_SIZE:
        raise ValueError(
            f"batch_size ({batch_size}) cannot exceed BUFFER_SIZE ({BUFFER_SIZE})"
        )
    if replay_start_size is None:
        replay_start_size = REPLAY_START_SIZE
    warmup_size = max(batch_size, min(replay_start_size, BUFFER_SIZE))
    min_frames_before_solved = 10000

    env = gym.make('joinemio-v0')
    net = NeuralNetwork().to(device)
    tgt_net = NeuralNetwork().to(device)
    # Start the target network as an exact copy of the online network.
    tgt_net.load_state_dict(net.state_dict())
    print(net)
    buffer = ReplayMemory(BUFFER_SIZE)
    agent = AIPlayer(env, buffer) # TODO: params? Whatever agent is...
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    frame_idx = 0
    best_mean_reward = None
    last_total_frames = 0
    while True:
        frame_idx += 1
        # Linear decay from EPS_START down to the EPS_END floor.
        epsilon = max(EPS_END, EPS_START - frame_idx / EPS_DECAY_LAST_FRAME)
        reward, final_reward = agent.play_step(net, epsilon, device=device)
        if final_reward is not None:
            frames_this_game = frame_idx - last_total_frames
            last_total_frames = frame_idx
            print("Reward is not None (Game Finished)")
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])
            print(f'Total Steps Played: {frame_idx}, frames this game {frames_this_game}, Games done: {len(total_rewards)}, mean reward: {mean_reward}, eps: {epsilon}, final reward: {final_reward}')
            print("")          # TODO: # Neptun logging (write epsilon, speed, reward_100, reward)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), "joinemio-best.dat") # TODO: Extract
                if best_mean_reward is not None:
                    print(f"Best mean reward updated {best_mean_reward}->{mean_reward}; model saved")
                best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND and frame_idx > min_frames_before_solved:
                print(f"Solved in {frame_idx} frames")
                break
        else:
            print("REWARD NONE")

        # Warm-up: wait until enough transitions have been collected.
        if len(buffer) < warmup_size:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(batch_size)
        # loss_t = calc_loss(batch, net, tgt_net, device=device) # TODO: calc_loss method
        # NOTE(review): no loss is back-propagated yet, so this step has no
        # gradients to apply until calc_loss is implemented.
        optimizer.step()

# Start training; our_main() loops until its stop condition breaks out of the `while True`.
our_main()


NeuralNetwork(
  (main_layers): Sequential(
    (0): Linear(in_features=42, out_features=42, bias=True)
    (1): ReLU()
    (2): Linear(in_features=42, out_features=42, bias=True)
    (3): ReLU()
    (4): Linear(in_features=42, out_features=42, bias=True)
    (5): ReLU()
  )
  (output_layer): Linear(in_features=42, out_features=7, bias=True)
)


AttributeError: 'ConnectFourEnv' object has no attribute 'opponent_action_set'

We have brought the network to the stage where it learns while playing against a player that makes random moves. The learning process initially got stuck in loops, so we used an ε-greedy exploration strategy (borrowed from multi-armed bandit algorithms), which allowed us to break these loops. Initially, the learning player makes random moves in the same way as the opponent, but as training progresses, moves selected by the network are mixed in more and more often. The parameter ε controls how frequently random moves are made. An increasingly well-trained network makes more and more non-random moves, which leads to a much larger number of games won. The network discovered that, when playing against a player making random moves, the best chance of winning comes from placing tokens in one column all the time.


For the learning process, we used a neural network consisting of 4 linear layers: three hidden layers with 42 neurons each, followed by an output layer with 7 neurons, each corresponding to the action of dropping a chip into one of the 7 columns of the board. The input is the flattened board (42 elements describing the game board); the output is a profitability weight for dropping a token into each column.

In a neural network, the activation function is responsible for transforming the summed weighted input from the node into the activation of the node or output for that input. The rectified linear activation function or ReLU for short is a piecewise linear function that will output the input directly if it is positive, otherwise, it will output zero. It has become the default activation function for many types of neural networks because a model that uses it is easier to train and often achieves better performance.

We used ReLU as the activation function on our hidden layers.
This function returns 0 for any negative input and returns any positive value $x$ unchanged, so it can be written as $f(x)=\max(0,x)$.


The learning process was carried out using deep Q-learning. More specifically, the agent receives information on the current observation (the current state of the board) and then has to take an action (which slot to choose to add a coin). After that, the environment responds with a new state and potentially yields a reward (if the game is won) or a penalty (if the game is lost or if the agent chooses an action that is not valid, such as putting a coin into an already full slot). The goal of each action is to receive the greatest possible reward. After gathering some experience, a neural network is trained to make sense of the relationship between state, action and reward. The target is set such that the network aims at minimizing the loss between the predicted reward of the next state and the realized reward.

Training is nothing more than iteratively playing against the trainer, memorizing what happened, and updating the neural network weights after each iteration.

So it looks like Deep-Q-Learning was the right choice: just by playing against a random agent, the neural network was trained to win the game - even without knowing the rules first!
