## Reinforcement Learning Naive Approach - Manual

In [13]:
class Field:
    """Square grid world in which a car must fetch an item and deliver it.

    Positions are ``(x, y)`` tuples. Moving down increases ``y`` and moving
    right increases ``x``; ``make_action`` refuses to step across the edges
    ``0`` and ``size - 1``.
    """

    def __init__(self, size, item_pickup, item_dropoff, start_position):
        """Store the layout and place the (empty) car on ``start_position``."""
        self.size = size
        self.item_pickup = item_pickup
        self.item_dropoff = item_dropoff
        self.position = start_position
        self.item_in_car = False

    def _move(self, blocked, destination):
        """Apply one movement: -10 and stay put when blocked, else -1 and relocate."""
        if blocked:
            return -10, False
        self.position = destination
        return -1, False

    def make_action(self, action):
        """Execute one action and return ``(reward, done)``.

        Actions: 0 down, 1 up, 2 left, 3 right, 4 pickup, 5 dropoff.
        Rewards: -1 for a legal move, -10 for any illegal action, +20 for a
        successful pickup or a dropoff on ``item_dropoff``. ``done`` is True
        only for that final dropoff. Any other action value falls through and
        returns None.
        """
        col, row = self.position
        here = (col, row)

        if action == 0:  # down
            return self._move(row == self.size - 1, (col, row + 1))
        if action == 1:  # up
            return self._move(row == 0, (col, row - 1))
        if action == 2:  # left
            return self._move(col == 0, (col - 1, row))
        if action == 3:  # right
            return self._move(col == self.size - 1, (col + 1, row))

        if action == 4:  # pickup
            # Only valid with an empty car standing on the item.
            if self.item_in_car or self.item_pickup != here:
                return -10, False
            self.item_in_car = True
            return 20, False

        if action == 5:  # dropoff
            if not self.item_in_car:
                return -10, False
            # The item leaves the car wherever it is dropped.
            self.item_in_car = False
            if self.item_dropoff != here:
                # Wrong cell: the item now lies here and must be picked up again.
                self.item_pickup = here
                return -10, False
            return 20, True

        return None

In [14]:
# Demo scenario on a 10x10 grid: item at (0, 0), drop-off at (9, 9), car starts at (9, 0).
size = 10
item_pickup, item_dropoff, start_position = (0, 0), (9, 9), (9, 0)

field = Field(
    size=size,
    item_pickup=item_pickup,
    item_dropoff=item_dropoff,
    start_position=start_position,
)

In [15]:
field.position

(9, 0)

In [16]:
field.make_action(2)

(-1, False)

In [17]:
# Eight more steps to the left; the bare call on the last line keeps its
# (reward, done) result as the cell output.
for _ in range(7):
    field.make_action(2)
field.make_action(2)

(-1, False)

In [18]:
field.position

(0, 0)

In [19]:
field.make_action(4)

(20, False)

In [20]:
# Nine steps down, then nine steps right; the bare call on the last line
# keeps its (reward, done) result as the cell output.
for _ in range(9):
    field.make_action(0)

for _ in range(8):
    field.make_action(3)
field.make_action(3)

(-1, False)

In [21]:
field.position

(9, 9)

In [22]:
field.make_action(5)

(20, True)

In [24]:
field.item_in_car # dropped off at the desired location

False

### Implementing Random Solution

In [25]:
import random

In [28]:
def random_solution():
    """Play one episode with uniformly random actions and return its length.

    The scenario is fixed: a 10x10 grid, item at (0, 0), drop-off at (9, 9),
    car starting at (9, 0). The reward returned by ``make_action`` is ignored;
    the loop only watches the ``done`` flag.
    """
    grid = Field(10, (0, 0), (9, 9), (9, 0))

    step = 0
    finished = False
    while not finished:
        # One uniformly random action out of the six available per step.
        _, finished = grid.make_action(random.randint(0, 5))
        step += 1

    return step
    

In [31]:
random_solution()

100828

In [34]:
run = [random_solution() for _ in range(100)]

In [35]:
sum(run)/len(run)

148124.98

In [37]:
# Average number of steps the random policy needs to finish an episode, measured over 100 runs

In [38]:
# Also note that the random policy ignores the reward returned by make_action

### Q-Learning

#### The agent gathers experience (data), stores action values in a Q-table, and learns from rewards and punishments while balancing exploration and exploitation

In [39]:
# The Q-learning table is a cross tab of states (rows) vs. actions (columns) holding estimated action values.
# For our use case there are 6 actions (down, up, left, right, pickup and dropoff).
# The number of states is (10*10)*(10*10)*2: 10*10 possible car positions,
# times 10*10 possible item positions (the item moves if it is dropped at the wrong place),
# times 2 for whether the item is currently in the car or not.

In [40]:
# Initially all Q-values are 0; they are updated as the agent acts, and reward information
# propagates backwards from rewarding states through the max over next-state values in the update

In [41]:
# Q-Learning Algorithm

In [44]:
class Field:
    """Square grid world in which a car must fetch an item and deliver it.

    Positions are ``(x, y)`` tuples; moving down increases ``y`` and moving
    right increases ``x``. The class also maps its full configuration to a
    single integer so it can index the rows of a Q-table.

    NOTE(review): a pickup pays +20 while a dropoff on the wrong cell costs
    only -10 and leaves the item under the car, so repeating
    pickup -> dropoff nets +10 per cycle without ever finishing. Keep this in
    mind when changing the learning rule or the discount factor.
    """

    # (dx, dy) of the four movement actions, indexed by action id:
    # 0 down, 1 up, 2 left, 3 right.
    _MOVES = ((0, 1), (0, -1), (-1, 0), (1, 0))

    def __init__(self, size, item_pickup, item_dropoff, start_position):
        """Store the layout and place the (empty) car on ``start_position``."""
        self.size = size
        self.item_pickup = item_pickup
        self.item_dropoff = item_dropoff
        self.position = start_position
        self.item_in_car = False

    def get_number_of_states(self):
        """Return ``size**4 * 2``: car cells x item cells x item-in-car flag."""
        return self.size * self.size * self.size * self.size * 2

    def get_state(self):
        """Encode the current configuration as a Q-table row index.

        The index is a mixed-radix number with digits car x, car y, item x,
        item y (each base ``size``) and a final binary digit that is 1 while
        the item is in the car. ``item_dropoff`` is not part of the encoding.
        """
        n = self.size
        car_x, car_y = self.position[0], self.position[1]
        item_x, item_y = self.item_pickup[0], self.item_pickup[1]

        index = ((car_x * n + car_y) * n + item_x) * n + item_y
        return index * 2 + (1 if self.item_in_car else 0)

    def make_action(self, action):
        """Execute one action and return ``(reward, done)``.

        Actions: 0 down, 1 up, 2 left, 3 right, 4 pickup, 5 dropoff.
        Rewards: -1 for a legal move, -10 for any illegal action, +20 for a
        successful pickup or a dropoff on ``item_dropoff``. ``done`` is True
        only for that final dropoff.

        Raises:
            ValueError: if ``action`` is not one of the six known actions
                (previously this silently returned None).
        """
        (x, y) = self.position

        if action in (0, 1, 2, 3):
            dx, dy = self._MOVES[action]
            new_x, new_y = x + dx, y + dy
            # Leaving the grid is penalised and the car stays where it is.
            if not (0 <= new_x < self.size and 0 <= new_y < self.size):
                return -10, False
            self.position = (new_x, new_y)
            return -1, False

        if action == 4:  # pickup
            # Only valid with an empty car standing on the item.
            if self.item_in_car or self.item_pickup != (x, y):
                return -10, False
            self.item_in_car = True
            return 20, False

        if action == 5:  # dropoff
            if not self.item_in_car:
                return -10, False
            # The item leaves the car wherever it is dropped.
            self.item_in_car = False
            if self.item_dropoff != (x, y):
                # Wrong cell: the item now lies here and must be picked up again.
                self.item_pickup = (x, y)
                return -10, False
            return 20, True

        raise ValueError(f"unknown action {action!r}; expected an integer from 0 to 5")

In [43]:
import numpy as np

In [51]:
# Scenario shared by this cell and the cells below: a 10x10 grid, the item at (0,0),
# the drop-off at (9,9) and the car starting at (9,0).
size = 10
item_pickup = (0,0)
item_dropoff = (9,9)
start_position = (9,0)

field = Field(size, item_pickup, item_dropoff, start_position)

number_of_states = field.get_number_of_states()
number_of_actions = 6

# One row per encoded state, one column per action, all starting at 0.
q_table = np.zeros((number_of_states, number_of_actions)) # np.zeros takes the shape as a single tuple

epsilon = 0.1 # probability of taking a random action instead of the greedy one
alpha = 0.1 # step size used in the Q-table update
gamma = 0.6 # discount applied to the best next-state value

# Train for 10000 episodes; each episode starts from a fresh Field and runs until
# make_action reports done (a dropoff on item_dropoff).
for _ in range(10000):
    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False
    
    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) # Exploration
        else:
            action = np.argmax(q_table[state]) # Exploitation
        
        reward, done = field.make_action(action) 
        new_state = field.get_state()
        # The max is taken even when done is True, i.e. the final step also bootstraps
        # from the row of the state the field is left in.
        max_new_state = np.max(q_table[new_state])
        
        # NOTE(review): this line mixes the two usual spellings of the Q-learning update,
        # so alpha*q_table[state, action] is subtracted twice. Algebraically it equals
        #     (1-2*alpha)*Q + alpha*(reward + gamma*max_new_state)
        # whose fixed point is HALF of the usual target (same as halving rewards and gamma).
        # Do not "correct" it in isolation: with the textbook update, gamma = 0.6 and the
        # Field rewards (+20 pickup, -10 for a misplaced dropoff that leaves the item under
        # the car), dropping and re-picking the item would likely be valued above delivering
        # it, and this while-loop could then fail to terminate. Confirm and fix the reward
        # scheme together with the update rule.
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*max_new_state - q_table[state, action])
        
        

In [52]:
q_table

array([[ 0.23071429, -2.06428571, -2.06428571,  0.23071429,  9.78571429,
        -2.06428571],
       [-0.71428571, -5.21428571, -5.21428571, -0.71428571, -5.21428571,
        -2.06428571],
       [ 1.90409499, -1.        , -1.        , -0.1       , -1.        ,
        -1.        ],
       ...,
       [-1.57820338,  0.07278278, -0.30422201, -1.6462095 , -1.35163212,
        10.38193357],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

### Wrapping into a function

In [55]:
def reinforcement_solution(epsilon=0.1, alpha=0.1, gamma=0.6):
    """Run one epsilon-greedy episode on the shared ``q_table`` and return its length.

    The episode uses the notebook-level ``size``, ``item_pickup``,
    ``item_dropoff`` and ``start_position`` and keeps updating the global
    ``q_table`` in place, so repeated calls continue the training.

    Args:
        epsilon: probability of taking a random action instead of the greedy
            one; pass 0 to follow the learned table without exploration.
        alpha: step size of the Q-table update.
        gamma: discount applied to the best next-state value.

    Returns:
        Number of actions taken until ``make_action`` reported ``done``.
    """
    field = Field(size, item_pickup, item_dropoff, start_position)
    done = False
    steps = 0

    while not done:
        state = field.get_state()
        if random.uniform(0,1) < epsilon:
            action = random.randint(0,5) # Exploration
        else:
            action = np.argmax(q_table[state]) # Exploitation

        reward, done = field.make_action(action)
        new_state = field.get_state()
        max_new_state = np.max(q_table[new_state])

        # NOTE(review): intentionally the same expression as the training loop. It
        # subtracts alpha*Q twice, i.e. it equals (1-2*alpha)*Q + alpha*target and
        # converges to half of the usual target. Changing it alone is not safe: with
        # the textbook rule and gamma = 0.6 the Field rewards would likely make
        # "drop and re-pick" more valuable than delivering, so the episode might
        # never end. Review the reward scheme before touching this line.
        q_table[state, action] = (1-alpha)*q_table[state, action]+alpha*(reward+gamma*max_new_state - q_table[state, action])

        steps = steps+1

    return steps

In [56]:
reinforcement_solution()

31

In [59]:
run_new = [reinforcement_solution() for _ in range(100)]

In [60]:
sum(run_new)/len(run_new)

50.4

In [57]:
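# Now we can see that, after training the q_table, a single run finished in 31 steps and 100 runs
# averaged about 50 steps (exploration is still active), versus roughly 148,000 steps for the random policy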

In [58]:
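# Note: The best solution is 29 steps (9 moves left to the item, 1 pickup, 9 moves down and 9 moves right
# to the drop-off, 1 dropoff); we cannot do it in less. Therefore a result in the low 30s is good.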