### Basic imports

In [1]:
import pandas as pd
import random
import numpy as np

### Initializing the 25 x 5 transitions table with possible moves.

### There are 25 possible positions on the game board, 
### we are going to define a possible position to move for each of the positions in each direction.
### For each position we have 4 possible moves (UP, DOWN, LEFT, RIGHT), so from each position on the game board the player has 4 possible ways to move; the fifth column of the table stores the current position itself (no move).
### every invalid move has a -1.
### we account for invalid move when the move is not possible, 
### for example if we are on first row, going up is an invalid move.


In [2]:
import numpy as np

def intializeTransitions():
    """Build the 25 x 5 transition table for the 5 x 5 game board.

    Positions are numbered row by row: 0-4 is the top row and 20-24 is the
    bottom row. Row ``i`` of the table describes position ``i``:

    * column 0 - position reached by moving UP    (``i - 5``)
    * column 1 - position reached by moving DOWN  (``i + 5``)
    * column 2 - position reached by moving LEFT  (``i - 1``)
    * column 3 - position reached by moving RIGHT (``i + 1``)
    * column 4 - the current position itself (no move)

    A move that would leave the board is stored as -1.

    Returns:
        numpy.ndarray: integer array of shape (25, 5).
    """
    transitions = np.zeros((25, 5))

    # Each row depends only on the position index, so a single pass over the
    # 25 positions is enough.
    for i in range(25):
        # UP: positions 0-4 form the top row, so moving up from them is invalid.
        transitions[i][0] = -1 if i < 5 else i - 5

        # DOWN: positions 20-24 form the bottom row, so moving down is invalid.
        transitions[i][1] = -1 if i >= 20 else i + 5

        # LEFT: positions divisible by 5 are in the first column.
        transitions[i][2] = -1 if i % 5 == 0 else i - 1

        # RIGHT: positions where (i + 1) is divisible by 5 are in the last column.
        transitions[i][3] = -1 if (i + 1) % 5 == 0 else i + 1

        # Store the current position as well (the "no move" column).
        transitions[i][4] = i

    transitions = transitions.astype(int)
    return transitions

## Rewards Policy

### Initializing the 25 x 5 rewards table with possible transitions from transitions table and the game board.

### The Policy : 
### There are 25 possible position on the game board, 
### we are going to define a rewards for each possible state in the grid.
### Defining the rewards policy..
### Whenever the player encounters a Monster, the reward is -10
### Whenever the player encounters a Blocker, the reward is -5
### Whenever the player encounters the Treasure, the reward is +10
### Whenever the Player encounters a * , the reward is +1 so that the Player can move. 
### Invalid moves from the transitions table, and the "no move" action, get a 0 reward.

In [3]:
def initializeRewards(gameBoard, transitions):
    """Build the 25 x 5 rewards table that mirrors the transitions table.

    Entry ``[i][j]`` is the reward for taking action ``j`` from position ``i``,
    decided by the symbol on the board cell that the action leads to:

    * 'M' (Monster)  -> -10
    * 'B' (Blocker)  -> -5
    * 'T' (Treasure) -> +10
    * '*'            -> +1

    Invalid moves (-1 in the transitions table), moves that stay on the same
    position, and cells with any other symbol keep a reward of 0.

    Args:
        gameBoard: 5 x 5 nested sequence of single-character cell symbols.
        transitions: 25 x 5 table of target positions (-1 marks an invalid move).

    Returns:
        numpy.ndarray: float array of shape (25, 5).
    """
    # Reward earned for stepping onto each kind of cell.
    cellReward = {'M': -10, 'B': -5, 'T': 10, '*': 1}

    rewards = np.zeros((25, 5))

    for position in range(25):
        for move in range(5):
            target = transitions[position][move]

            # Off-board moves and neutral moves (target is the position
            # itself) both earn nothing.
            if target == -1 or target == position:
                rewards[position][move] = 0
                continue

            # Translate the flat position index back into board coordinates.
            boardRow, boardColumn = divmod(int(target), 5)
            symbol = gameBoard[boardRow][boardColumn]
            if symbol in cellReward:
                rewards[position][move] = cellReward[symbol]

    return rewards

### initializing an array of possible actions for all possible positions on the given game board.
#### Hardcoding these values as they are constant, we can programmatically write this as well.
0 - UP
1 - DOWN
2 - LEFT
3 - RIGHT
4 - NO MOVES
For example: the first entry is 1, 3, 4, indicating that the player at that position can move DOWN, move RIGHT, or stay in place (NO MOVES).

In [4]:
def initializePossibleActions():
    """Return the actions available from each of the 25 board positions.

    Action codes: 0 - UP, 1 - DOWN, 2 - LEFT, 3 - RIGHT, 4 - NO MOVES (stay).

    The lists are derived from the 5 x 5 board geometry: an action is
    available only when it keeps the player on the board, and action 4 is
    always available. For example, position 0 (top-left corner) gets
    ``[1, 3, 4]``.

    Returns:
        numpy.ndarray: 1-D object array of length 25; element ``i`` is the
        ascending list of action codes valid at position ``i``.
    """
    boardSize = 5
    positionCount = boardSize * boardSize

    # The per-position lists have different lengths, so they must live in an
    # object array. Passing ragged nested lists straight to np.array(...)
    # raises ValueError on NumPy >= 1.24.
    possibleActions = np.empty(positionCount, dtype=object)

    for position in range(positionCount):
        row, column = divmod(position, boardSize)
        actions = []
        if row > 0:
            actions.append(0)  # UP
        if row < boardSize - 1:
            actions.append(1)  # DOWN
        if column > 0:
            actions.append(2)  # LEFT
        if column < boardSize - 1:
            actions.append(3)  # RIGHT
        actions.append(4)      # NO MOVES is always allowed
        possibleActions[position] = actions

    return possibleActions

### Set up the game board with all required data.

#### Note that our game board is 5 x 5, so in total we will have 25 possible positions on the board.


In [5]:
def setupGame(game):
    """Prepare the lookup tables needed to play on the given board.

    Args:
        game: the game board passed through to ``initializeRewards``.

    Returns:
        tuple: ``(rewards, transitions)`` - the rewards table built for this
        board and the transition table it was built from.
    """
    moveTable = intializeTransitions()
    rewardTable = initializeRewards(game, moveTable)
    return rewardTable, moveTable

## Reinforcement Learning model (Q-learning)

Alpha is the learning rate (fine tuned from multiple runs..)

Gamma is the discount factor. It quantifies how much importance we give for future rewards. 
It’s also handy to approximate the noise in future rewards. Gamma varies from 0 to 1. 
If Gamma is closer to zero, the agent will tend to consider only immediate rewards. 
If Gamma is closer to one, the agent will consider future rewards with greater weight,
willing to delay the reward.
In my implementation I'm keeping it close to 1 (0.8) so that future rewards are considered as well.

** src : https://towardsdatascience.com/practical-reinforcement-learning-02-getting-started-with-q-learning-582f63e4acd9

In [7]:
import random

def reinforcedLearningModel(gameBoard, transitions, rewards, learningRate, discountFactor, epochs, seed=None):
    """Train a 25 x 5 Q-table by random exploration of the board.

    Every epoch starts at position 0 and takes uniformly random available
    actions until position 4 (the terminal state) is reached. After each
    step the Q-value of the (state, action) pair is updated with

        Q(s, a) = (1 - alpha) * Q(s, a) + alpha * (R(s, a) + gamma * max Q(s', a'))

    where the maximum is taken over the actions available in the next state.

    Args:
        gameBoard: the game board; accepted for interface compatibility but
            not read by this function.
        transitions: 25 x 5 table giving the next position for each
            (position, action) pair.
        rewards: 25 x 5 table giving the reward for each (position, action) pair.
        learningRate: alpha, the weight given to the newly observed value.
        discountFactor: gamma, the weight given to future rewards.
        epochs: number of start-to-terminal episodes to run.
        seed: optional seed for the action sampling. When given, a private
            ``random.Random(seed)`` generator is used so that runs are
            reproducible; when ``None`` (default) the module-level ``random``
            state is used.

    Returns:
        numpy.ndarray: the learned Q-table, float array of shape (25, 5).
    """
    # A private generator keeps seeded runs reproducible without disturbing
    # the global random state used elsewhere.
    rng = random if seed is None else random.Random(seed)

    actionsAvailable = initializePossibleActions()
    qLearnings = np.zeros((25, 5))

    initialState = 0
    terminalState = 4

    for _ in range(epochs):
        currentState = initialState

        # Keep moving until the goal state is reached.
        while currentState != terminalState:
            # Explore: pick any available action uniformly at random.
            action = rng.choice(actionsAvailable[currentState])

            # Move to the next state according to the transitions table.
            nextState = transitions[currentState][action]

            # Best Q-value reachable from the next state.
            bestFutureValue = max(
                qLearnings[nextState][nextPossibleAction]
                for nextPossibleAction in actionsAvailable[nextState]
            )

            # Blend the previous estimate with the newly observed value.
            qPrevious = qLearnings[currentState][action]
            qLearnings[currentState][action] = (
                (1 - learningRate) * qPrevious
                + learningRate * (rewards[currentState][action] + discountFactor * bestFutureValue)
            )

            # Go to the next state.
            currentState = nextState

    return qLearnings

### Initial design for the map to Treasure.

In [8]:
# Board legend:
#   P - Player
#   M - Monster
#   B - Blocker
#   T - Treasure
# Goal: help the Player to reach the Treasure.
mapToTreasure = [
    ['P', '*', '*', 'M', 'T'],
    ['*', 'B', 'M', '*', '*'],
    ['*', '*', '*', '*', 'M'],
    ['B', '*', 'B', '*', 'M'],
    ['*', '*', '*', '*', '*'],
]

# Build the lookup tables for this board, then train the Q-table on them.
rewards, transitions = setupGame(mapToTreasure)
play = reinforcedLearningModel(
    gameBoard=mapToTreasure,
    transitions=transitions,
    rewards=rewards,
    learningRate=0.5,
    discountFactor=0.8,
    epochs=200,
)

In [9]:
# Wrap the learned Q-table in a DataFrame so rows can be inspected by position.
play = pd.DataFrame(play)

# Show the learned move scores, followed by the transitions table they refer to.
for heading, table in (("Play move scores..", play), ("Transitions..", transitions)):
    print(heading)
    print(table)

Play move scores..
            0         1         2          3         4
0    0.000000  6.048576  0.000000   5.000000  4.838861
1    0.000000  0.310720  4.838861   5.000000  4.000000
2    0.000000 -3.440000  5.000000  -2.000000  4.000000
3    0.000000  8.200000  5.000000  10.000000  8.000000
4    0.000000  0.000000  0.000000   0.000000  0.000000
5    4.838861  6.310720  0.000000   0.310720  5.048576
6    5.000000  6.638400  6.048576  -3.440000  5.310720
7    5.000000  7.048000  0.310720   8.200000  6.560000
8   -2.000000  7.560000 -3.440000   9.000000  7.200000
9   10.000000 -2.800000  8.200000   0.000000  8.000000
10   6.048576  0.048576  0.000000   6.638400  5.310720
11   0.310720  6.310720  6.310720   7.048000  5.638400
12  -3.440000  0.638400  6.638400   7.560000  6.048000
13   8.200000  7.048000  7.048000  -2.800000  6.560000
14   9.000000 -4.361600  7.560000   0.000000  7.200000
15   6.310720  5.838861  0.000000   6.310720  5.048576
16   6.638400  6.048576  0.048576   0.638400  

### Get the route from the run.

### If the best move leads to a position that has already been visited, pick the next-best move instead.

In [10]:
# Follow the learned Q-table greedily from the start position (0) to the
# treasure (4), recording every position stepped onto.
optimalrouteToTreasure = []
state = 0
visitedStates = {state}

while state != 4: #(terminal state)
    print(state, "-->", end=" ")
    qValues = play.iloc[state].tolist()

    # Rank the actions from highest to lowest Q-value. sorted() is stable,
    # so tied actions resolve to the lowest action index first.
    rankedActions = sorted(range(len(qValues)), key=lambda a: qValues[a], reverse=True)

    # Take the best-ranked action that leads somewhere new. This skips
    # off-board moves (-1 in the transitions table) and any position that
    # has already been visited, which includes staying in place.
    nextState = None
    for action in rankedActions:
        candidate = int(transitions[state][action])
        if candidate == -1 or candidate in visitedStates:
            continue
        nextState = candidate
        break

    if nextState is None:
        # Every move is off-board or already visited: stop instead of looping forever.
        raise RuntimeError("No unvisited move available from state %d; the route is stuck." % state)

    state = nextState
    visitedStates.add(state)
    optimalrouteToTreasure.append(state)

0 --> 5 --> 10 --> 11 --> 12 --> 13 --> 8 --> 9 --> 

In [11]:
# Report the sequence of positions visited on the way to the treasure.
routeMessage = "The optimal route for the Player to reach the treasure from source  is "
print(routeMessage, optimalrouteToTreasure)

The optimal route for the Player to reach the treasure from source  is  [5, 10, 11, 12, 13, 8, 9, 4]
