# SAKI Homework 3 - warehouse MDP

In [1]:
import mdptoolbox
import pandas as pd
import itertools as it
import numpy as np
import pickle
import time
from scipy import sparse

## Prepare MDP

### Import data

In [2]:
#import training set (2x3)
training_data = pd.read_csv('./data/Exercise 3 - Reinforcement Learning - warehousetraining.txt', header=None)
training_data.columns = ["training"]
#import test set (2x3)
test_data = pd.read_csv('./data/Exercise 3 - Reinforcement Learning - warehouseorder.txt', sep='\t', header=None)
test_data.columns = ["action", "color"]

### Probability from training set

In [3]:
#calculate probability from training set
move_probs = (training_data.training.value_counts() / training_data.shape[0])
move_probs = move_probs.round(3)
if (sum(move_probs) == 1.):
    print("successfull")
else:
    print("Error: sum not one")
print(move_probs.sum())
print("")
print(move_probs)

successfull
1.0

restore\tred      0.247
store\tred        0.247
store\twhite      0.128
restore\twhite    0.128
restore\tblue     0.125
store\tblue       0.125
Name: training, dtype: float64


### MDP Settings

In [4]:
############################################
num_fields = 6 # Warehouse places 2x2 = 4 ..
############################################

num_color = 4 # empty, red, white, blue
num_moves = 6 # store red, store white, store blue, restore red, ...
num_actions = num_fields
block_size = num_color ** num_fields

num_state = num_color ** num_fields * num_moves
warehouse_description=[0,1,2,3] #possible colors

### Transition Probability Matrix

In [0]:
#Init TPM
P = []

for action in range(num_actions):
    current_index = 0
    P.append(np.zeros((num_state, num_state),dtype=np.float16))
    
    for instr in range(num_moves):
        for w_state in it.product(warehouse_description, repeat=num_fields):
            #Iter through all 6 column blocks (=possible instructions) (store red, store blue, ...., restore red, ...)
            for move in range(num_moves):

                ##For field one (Action)
                #FOR STORE
                if(instr in range(3)):

                    #1. Empty? if (field1 == 0)
                    if(w_state[action] != 0):
                        P[action][current_index][(current_index % block_size) + (block_size * move)] = move_probs[move]
                    else:
                        #FOR STORE
                        #if(red) index+64 (i**numFields)
                        # else if(white) index+128
                        # else if(blue) index+192

                        #red = 0
                        if(instr == 0):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 1)) + (block_size * move)] = move_probs[move]
                        #white = 1
                        elif(instr == 1):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 2)) + (block_size * move)] = move_probs[move]
                        #blue = 2
                        elif(instr == 2):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 3)) + (block_size * move)] = move_probs[move]


                #FOR RESTORE
                else:
                    #possible? if (field 1 != 0)
                    if(w_state[action] == 0):
                        P[action][current_index][(current_index % block_size) + (block_size * move)] = move_probs[move]
                    else:
                        #FOR STORE
                        #if(red) index-64 (i**numFields)
                        # else if(white) index-128
                        # else if(blue) index-192

                        #red = 3
                        if(instr == 3):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 1)) + (block_size * move)] = move_probs[move]
                        #white = 4
                        elif(instr == 4):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 2)) + (block_size * move)] = move_probs[move]
                        #blue = 5
                        elif(instr == 5):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 3)) + (block_size * move)] = move_probs[move]                          
            
            #START DEBUGGING
            if(sum(P[action][current_index]) != 1.0):
                print(current_index,sum(P[action][current_index]))
            #END DEBUGGING
            current_index += 1
            
    P[action] = sparse.csr_matrix(P[action])
    print("finished P"+str(action))   
    
print("---")
print("successfull")

# Save the matrices into a pickle file.
pickle.dump( P, open( "./obj_data/P.pickle", "wb" ) )

finished P0
finished P1
finished P2
finished P3
finished P4
finished P5
---
successfull


In [0]:
#Load the matrices from pickle file.
P = pickle.load( open( "./obj_data/P.pickle", "rb" ))

### Create Warehouse states

In [7]:
warehouse = []
for instr in range(num_moves):
    for w_state in it.product(warehouse_description, repeat=num_fields):
        tmp = []
        tmp_str = []
        for i in range(num_fields):
            tmp.append(w_state[i])
            tmp_str.append('state'+str(i))
        tmp.append(instr)
        tmp_str.append('NextMove')
        warehouse.append(tmp)
    
warehouse = pd.DataFrame(warehouse, columns=tmp_str)
print(warehouse.head())

   state0  state1  state2  state3  state4  state5  NextMove
0       0       0       0       0       0       0         0
1       0       0       0       0       0       1         0
2       0       0       0       0       0       2         0
3       0       0       0       0       0       3         0
4       0       0       0       0       1       0         0


### Reward

In [0]:
R = []

for action in range(num_actions):
    R.append(np.zeros((num_state, )))
  
    for index, ws in warehouse.iterrows():
        try:
            #Reward for correct move
            if((ws.NextMove in range(3) and ws[action] == 0) or
            (ws.NextMove in range(3, 6) and (ws[action] == (ws.NextMove - 2)))):
               
                if  (action == 0): reward = 80**2  #8**3.5;
                elif(action == 1): reward = 60**2  #6**3.5;
                elif(action == 2): reward = 60**2  #6**3.5;
                elif(action == 3): reward = 40**2  #4**3.5;
                elif(action == 4): reward = 40**2  #4**3.5;
                elif(action == 5): reward = 40**2  #2**3.5;
        
                #Extra reward if restore is possible
                if ws.NextMove in range(3, 6) and (ws[action] == (ws.NextMove - 2)):
                    reward *= 100  #+=100
      
            #Reward for Failed moves
            else:
                #store not possible
                if ws.NextMove in range(3):
                    reward = -20000  #5
                #restore not possible
                else:
                    reward = -1000000  #-10

            R[-1][index] = reward
      
        except:
            print("An exception occurred") 
            print(ws.NextMove)
            print(ws[action])

R = np.asarray(R)
R = R.transpose()

# Save the matrices into a pickle file.
pickle.dump( R, open( "./obj_data/R.pickle", "wb" ) )

In [0]:
#Load the matrices from pickle file.
R = pickle.load( open( "./obj_data/R.pickle", "rb" ))

## Run MDP

In [0]:
#print(len(P))
#print(P[0].shape)
mdptoolbox.util.check(P,R)

In [0]:
start_time = time.time()
mdpresultPolicy = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, max_iter=500)
mdpresultValue = mdptoolbox.mdp.ValueIteration(P, R, 0.9, max_iter=500)
print("--- %s seconds ---" % (time.time() - start_time))
pickle.dump( mdpresultValue, open( "./obj_data/mdpresult_val.pickle", "wb" ) )
pickle.dump( mdpresultPolicy, open( "./obj_data/mdpresult_pol.pickle", "wb" ) )

In [0]:
#load from pickle
mdpresultValue = pickle.load( open( "./obj_data/mdpresult_val.pickle", "rb" ))
mdpresultPolicy = pickle.load( open( "./obj_data/mdpresult_pol.pickle", "rb" ))

In [None]:
start_time = time.time()
#mdpresultValue.setVerbose()
#mdpresultPolicy.setVerbose()
mdpresultValue.run()
mdpresultPolicy.run()
print("--- %s seconds ---" % (time.time() - start_time))

pickle.dump( mdpresultValue, open( "./obj_data/mdpresult_run_val.pickle", "wb" ) )
pickle.dump( mdpresultValue, open( "./obj_data/mdpresult_run_pol.pickle", "wb" ) )

In [None]:
mdpresultValue = pickle.load( open( "./obj_data/mdpresult_run_val.pickle", "rb" ))
mdpresultPolicy = pickle.load( open( "./obj_data/mdpresult_run_pol.pickle", "rb" ))
print('PolicyIteration:')
print(mdpresultPolicy.policy)
print(mdpresultPolicy.V)
print(mdpresultPolicy.iter)
print()
print('ValueIteration:')
print(mdpresultValue.policy)
print(mdpresultValue.V)
print(mdpresultValue.iter)

## Evaluate MDP

### Helper functions

In [0]:
def getInstrNr(instrString):
    if(instrString == 'store red'):
        ret = 0
    elif(instrString == 'store white'):
        ret = 1
    elif(instrString == 'store blue'):
        ret = 2
    elif(instrString == 'restore red'):
        ret = 3
    elif(instrString == 'restore white'):
        ret = 4
    elif(instrString == 'restore blue'):
        ret = 5  
    return ret


def get_distance(i):
    if(i == 0):
        dist = 2
    elif(i == 1):
        dist = 4
    elif(i == 2):
        dist = 4
    elif(i == 3):
        dist = 6
    elif(i == 4):
        dist = 6
    elif(i == 5):
        dist = 8
    return dist

import enum 
class Color(enum.Enum): 
    red   = 1
    white = 2
    blue  = 3

def getIndexOfWarehouse(playground, instr, warehouse):
    index = warehouse[(warehouse['state0'] == playground[0]) &
                      (warehouse['state1'] == playground[1]) &
                      (warehouse['state2'] == playground[2]) &
                      (warehouse['state3'] == playground[3]) &
                      (warehouse['state4'] == playground[4]) &
                      (warehouse['state5'] == playground[5]) &
                      (warehouse['NextMove'] == getInstrNr(instr))].index[0]
    return index

  

### Use Greedy Algorithm and Policies on Testdata

In [39]:
outputeval = open('./eval_result/eval.txt', 'w') 

def store_restore_greedy(local_warehouse, data):
    print(str(counter) + " GREEDY: \t" + str(warehouse_greedy) + str(data.action) + " " + str(data.color), file = outputeval)
    
    for i in range(len(local_warehouse)):
        #Check if position is free
        if(data.action == 'store'):
            if(local_warehouse[i] == 0):
                local_warehouse[i] = Color[data.color].value
                break
        #Check if the desired color is stored
        elif(data.action == 'restore'):
            if(local_warehouse[i] == Color[data.color].value):
                local_warehouse[i] = 0
                break
        #move is not possible
        else:
            print("iteration", counter ,"move not possible")
            return 0
    return get_distance(i)
                

def store_restore(policy, local_warehouse, data):
    
    index = getIndexOfWarehouse(local_warehouse, (data.action + ' ' + data.color), warehouse)
    print(str(counter) + " VAL ITER: \t" + str(warehouse_valueiter) + (data.action + ' ' + data.color), file = outputeval)
    if(data.action == 'store'):
        if(local_warehouse[policy.policy[index]]!=0):
            print("iteration",counter,"did not store")
            return 0;
        local_warehouse[policy.policy[index]] = Color[data.color].value
    elif(data.action == 'restore'):
        if(local_warehouse[policy.policy[index]]!=Color[data.color].value):
            print("iteration",counter,"did not restore")
            return 0;
        local_warehouse[policy.policy[index]] = 0
        
    return get_distance(policy.policy[index])

warehouse_greedy = np.zeros(num_fields)
warehouse_policyiter = np.zeros(num_fields)
warehouse_valueiter = np.zeros(num_fields)

distance_greedy = 0
distance_mdp_value = 0
distance_mdp_policy = 0

counter = 0
for data in test_data.itertuples():
    distance_greedy += store_restore_greedy(warehouse_greedy, data)
    distance_mdp_value += store_restore(mdpresultValue, warehouse_valueiter, data)
    distance_mdp_policy += store_restore(mdpresultPolicy, warehouse_policyiter, data)
    print('', file = outputeval)
    counter += 1
    

print("Greedy:" + str(distance_greedy))
print("MDP value iter:" + str(distance_mdp_value))
print("MDP policy iter:" + str(distance_mdp_policy))
outputeval.close()



iteration 14 did not restore
iteration 21 did not store
iteration 22 did not restore
iteration 25 did not restore
iteration 34 did not store
iteration 35 did not store
iteration 53 did not restore
Greedy:248
MDP value iter:206
MDP policy iter:0
