### Create Q-learning table, reward table, and state transition table needed for reinforcement learning

### Generator for transition table

In [1]:
import numpy as np
from math import sqrt

def transition_table_generator(states):
    """Build the state-transition table for a square grid world.

    States are numbered row by row, from 0 to ``states - 1``. Row ``i`` of
    the table holds the state reached from state ``i`` by each of the five
    actions, in column order: 0 = up, 1 = down, 2 = left, 3 = right,
    4 = no move. A move that would leave the grid is stored as -1.

    Parameters
    ----------
    states : int
        Total number of grid cells. Must be a perfect square.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(states, 5)``.

    Raises
    ------
    ValueError
        If ``states`` is not a perfect square (the grid side length would
        not be an integer and the table would be meaningless).
    """
    # Work with an integer side length; a bare float sqrt would leak
    # fractional offsets into the table for non-square state counts.
    n = int(round(sqrt(states)))
    if n * n != states:
        raise ValueError(
            "states must be a perfect square, got %r" % (states,)
        )

    transition_table = np.zeros((states, 5), dtype=int)

    # Each row is filled exactly once; there is no per-action inner loop
    # because all five columns are computed directly from the cell position.
    for i in range(states):
        row, column = divmod(i, n)

        # top row: 'up' action invalid
        transition_table[i][0] = i - n if row > 0 else -1

        # bottom row: 'down' action invalid
        transition_table[i][1] = i + n if row < n - 1 else -1

        # leftmost column: 'left' action invalid
        transition_table[i][2] = i - 1 if column > 0 else -1

        # rightmost column: 'right' action invalid
        transition_table[i][3] = i + 1 if column < n - 1 else -1

        # 'no move' action
        transition_table[i][4] = i

    return transition_table

### Generator for reward table

In [2]:
def reward_table_generator(board, transition_table):
    """Build the reward table that matches a transition table.

    The reward for entry ``[i][j]`` is determined by the cell that action
    ``j`` leads to from state ``i``:

    * -10 if the destination cell is a snake ('S'), quicksand ('Q') or pit ('P');
    * +10 if the destination cell is the treasure ('T');
    * -1 if the agent stays where it is and the cell is none of the above;
    * 0 otherwise.

    Invalid transitions (destination -1) keep a reward of 0. They must not
    be decoded into board coordinates: -1 would otherwise map to the
    top-right cell and inherit that cell's reward.

    Parameters
    ----------
    board : sequence of sequences of str
        Square grid of cell markers, indexed ``board[row][column]``.
    transition_table : array-like of int
        Table of destination states, one row per state; -1 marks an
        invalid move.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``transition_table``.
    """
    table = np.asarray(transition_table)
    reward_table = np.zeros(table.shape)

    # Side length of the grid; invariant, so computed once outside the loops.
    n = int(sqrt(len(table)))

    for i, destinations in enumerate(table):
        for j, state in enumerate(destinations):

            # invalid move: there is no destination cell to score
            if state < 0:
                continue

            # if no move, then reward is -1 (may be overridden below)
            if state == i:
                reward_table[i][j] = -1

            row, column = divmod(int(state), n)
            cell = board[row][column]

            # snake, quicksand, or pit: reward is -10
            if cell in ('S', 'Q', 'P'):
                reward_table[i][j] = -10

            # treasure: reward is 10
            elif cell == 'T':
                reward_table[i][j] = 10

    return reward_table

### Create environment for reinforcement learning agent

In [3]:
def environment(board, states):
    """Bundle a board with its transition and reward tables.

    Returns a dict with the keys 'board', 'transition table' and
    'reward table'; the reward table is derived from the board and the
    freshly generated transition table for ``states`` states.
    """
    transitions = transition_table_generator(states)
    return {
        'board': board,
        'transition table': transitions,
        'reward table': reward_table_generator(board, transitions),
    }

### Create module to display agent and environment

In [4]:
def draw_board(board):
    """Print the board to stdout, one grid row per printed line.

    Cells are separated by single spaces. The board is treated as square:
    the number of rows is also used as the number of columns printed.
    """
    side = len(board)
    for row_index in range(side):
        cells = board[row_index]
        for column_index in range(side):
            print(cells[column_index], end=' ')
        # print('\n') ends the current line and leaves one blank line after it
        print('\n')

### Create learning module for agents

In [5]:
import random

def _find_state(board, marker, default=0):
    """Return the row-major state index of the first cell equal to ``marker``.

    Falls back to ``default`` when the marker is not on the board.
    """
    n = len(board)
    for row_index, row in enumerate(board):
        for column_index, cell in enumerate(row):
            if cell == marker:
                return row_index * n + column_index
    return default


def _move_agent_marker(display, n, old_state, new_state):
    """Move the 'A' marker on a display board from one state to another."""
    old_row, old_column = divmod(old_state, n)
    new_row, new_column = divmod(new_state, n)

    # Leaving a cell: a bare 'A' becomes empty, otherwise strip the suffix
    # that was appended when the agent stepped onto a non-empty cell.
    if display[old_row][old_column] == 'A':
        display[old_row][old_column] = '-'
    else:
        display[old_row][old_column] = display[old_row][old_column].replace('A', '')

    # Entering a cell: replace an empty cell, otherwise append to its marker.
    if display[new_row][new_column] == '-':
        display[new_row][new_column] = 'A'
    else:
        display[new_row][new_column] += 'A'


def agent_learning(agent, environment, sessions, verbose=True):
    """Train an agent with tabular Q-learning under a random exploration policy.

    In every session the agent starts on the 'A' cell and takes uniformly
    random valid actions until it reaches the 'T' cell. After each step the
    Q-value of the action that was actually taken is updated with

        Q(s, a) <- (1 - alpha) * Q(s, a)
                   + alpha * (R(s, a) + gamma * max_a' Q(s', a'))

    where the maximum ranges over the valid actions of the next state.

    Parameters
    ----------
    agent : dict
        Must contain 'alpha' (learning rate), 'gamma' (discount factor) and
        'q table' (numpy array indexed ``[state, action]``).
    environment : dict
        Must contain 'board', 'transition table' and 'reward table'.
    sessions : int
        Number of training sessions to run.
    verbose : bool, optional
        When True (the default) the board is printed after every step.
        Set to False to train silently.

    Returns
    -------
    dict
        The same ``agent`` dict, with 'q table' replaced by the trained
        copy. The array originally stored under 'q table' is not modified.

    Notes
    -----
    If the board has no 'A' or no 'T' cell, the corresponding state
    defaults to 0. The caller's board is never modified; the agent marker
    is drawn on a per-session copy.
    """
    alpha = agent['alpha']
    gamma = agent['gamma']
    q_table = agent['q table'].copy()
    board = environment['board']
    rewards = environment['reward table']
    transition_table = np.asarray(environment['transition table']).tolist()
    n = len(board)

    # States are numbered row by row: state = row * n + column.
    start_state = _find_state(board, 'A')
    goal_state = _find_state(board, 'T')

    # Valid action indices per state. Kept as a plain list of lists because
    # the rows have different lengths (edge cells have fewer valid moves).
    actions = [
        [index for index, destination in enumerate(destinations) if destination != -1]
        for destinations in transition_table
    ]

    # train agent for an arbitrary number of sessions
    for session in range(sessions):
        steps = 0
        current_state = start_state

        # Fresh copy each session so the agent marker starts on the start
        # cell and the caller's board stays untouched.
        display = [list(row) for row in board]

        if verbose:
            print('Begin session %d' % (session + 1))
            print("Board after %d steps" % steps)
            print("-----------------------------------------------------------")
            draw_board(display)

        while current_state != goal_state:
            steps += 1

            # select random action for agent
            action = random.choice(actions[current_state])
            next_state = transition_table[current_state][action]

            # Best Q-value reachable from the next state. A separate loop
            # variable is used so the chosen `action` is not overwritten.
            best_future = max(
                (q_table[next_state, candidate] for candidate in actions[next_state]),
                default=0.0,
            )

            # update agent's q table for the action that was actually taken
            q_old = q_table[current_state, action]
            q_table[current_state, action] = (
                (1 - alpha) * q_old
                + alpha * (rewards[current_state][action] + gamma * best_future)
            )

            if verbose:
                # update and display board
                _move_agent_marker(display, n, current_state, next_state)
                print("Board after %d steps" % steps)
                print("--------------------------------------")
                draw_board(display)

            current_state = next_state
            if verbose:
                print('Current state is now: %d' % current_state)

        # goal state reached, session finished
        if verbose and steps:
            print("Goal reached after %d steps" % steps)

    agent['q table'] = q_table
    return agent

### Create agent to learn environment

In [6]:
def rl_agent(environment, alpha, gamma, sessions):
    """Create a Q-learning agent and train it on the given environment.

    The agent is a dict holding 'alpha', 'gamma' and a zero-initialised
    'q table' with the same dimensions as the environment's transition
    table. It is handed to ``agent_learning`` for ``sessions`` sessions and
    the result of that call is returned.
    """
    n_states = len(environment['transition table'])
    n_actions = len(environment['transition table'][0])

    untrained = {
        'alpha': alpha,
        'gamma': gamma,
        'q table': np.zeros((n_states, n_actions)),
    }

    # module for agent learning
    return agent_learning(untrained, environment, sessions)

### Build a sample board and train an agent on it

In [7]:
# Board describing the environment. Legend: 'A' = agent start,
# 'T' = treasure (goal), 'S' = snake, 'Q' = quicksand, 'P' = pit,
# '-' = empty cell.
board = [['A','S','P','-','S'],
         ['-','-','Q','S','-'],
         ['-','S','P','-','Q'],
         ['-','Q','-','T','P'],
         ['-','-','-','Q','S']]

# Smaller alternative board for quick experiments:
# board = [['A','S'],
#          ['-','T']]

# Derive the number of states from the board itself so that swapping in a
# different square board cannot leave the environment sized for the wrong grid.
env = environment(board, len(board) ** 2)

agent_1 = rl_agent(environment=env, gamma=0.8, alpha=0.5, sessions=5)

Begin session 1
Board after 0 steps
-----------------------------------------------------------
A S P - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Board after 1 steps
--------------------------------------
- SA P - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 1
Board after 2 steps
--------------------------------------
- S PA - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 2
Board after 3 steps
--------------------------------------
- S P A S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 3
Board after 4 steps
--------------------------------------
- S P A S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 3
Board after 5 steps
--------------------------------------
- S P A S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 3
Board after 6 steps
--------------------------------------
- S PA - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current

- S P - S 

- - Q SA - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 8
Board after 92 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P A Q 

- Q - T P 

- - - Q S 

Current state is now: 13
Board after 93 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P A Q 

- Q - T P 

- - - Q S 

Current state is now: 13
Board after 94 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P A Q 

- Q - T P 

- - - Q S 

Current state is now: 13
Board after 95 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P - QA 

- Q - T P 

- - - Q S 

Current state is now: 14
Board after 96 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P A Q 

- Q - T P 

- - - Q S 

Current state is now: 13
Board after 97 steps
--------------------------------------
- S P - S 

- - Q S - 

- S PA - Q 

- Q - T P 

- - - Q S 

Current state is now: 12
Board after 98 steps
----------------------

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 5
Board after 49 steps
--------------------------------------
- S P - S 

- A Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 6
Board after 50 steps
--------------------------------------
- S P - S 

- - QA S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 7
Board after 51 steps
--------------------------------------
- S P - S 

- - Q S - 

- S PA - Q 

- Q - T P 

- - - Q S 

Current state is now: 12
Board after 52 steps
--------------------------------------
- S P - S 

- - Q S - 

- SA P - Q 

- Q - T P 

- - - Q S 

Current state is now: 11
Board after 53 steps
--------------------------------------
- S P - S 

- A Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 6
Board after 54 steps
--------------------------------------
- S P - S 

- - Q S - 

- SA P - Q 

- Q - T P 

- - - Q S 

Current state is now: 11
Board after 55 steps
--------------------------------------
- S P - S

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 2
Board after 135 steps
--------------------------------------
- SA P - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 1
Board after 136 steps
--------------------------------------
A S P - S 

- - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 0
Board after 137 steps
--------------------------------------
- S P - S 

A - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 5
Board after 138 steps
--------------------------------------
- S P - S 

A - Q S - 

- S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 5
Board after 139 steps
--------------------------------------
- S P - S 

- - Q S - 

A S P - Q 

- Q - T P 

- - - Q S 

Current state is now: 10
Board after 140 steps
--------------------------------------
- S P - S 

- - Q S - 

- S P - Q 

A Q - T P 

- - - Q S 

Current state is now: 15
Board after 141 steps
---------------------------------