In [62]:
from scipy.sparse import csr_matrix
import itertools
import numpy as np
import mdptoolbox.mdp as mdp
import mdptoolbox.util as util
import mdptoolbox.example as example



In [63]:
#model MDP flexibly, depending on size of warehouse, number of items, probabilities for items 
#model needs:
    #set of states
    #set of actions
    #transition probabilities
    #reward matrix

In [64]:
#input changes, layout of warehouse, nr of items and probability distribution of items
warehouse = np.zeros(shape=(1,2)) #shape larger than 2x3 will take 30 plus mins to calculate transition matrix and policies
items = [(1, 0.9), (2, 0.1)]


items_only = []
for item in items:
    items_only.append(item[0])
probabilities_only = []
for item in items:
    probabilities_only.append(item[1])
#check sum of probabilities
if np.sum(probabilities_only) != 1:
    raise ValueError("The sum of item probabilities is not 1!")

In [83]:
# states:
boxstates = items_only.copy()
boxstates.insert(0,0)
tasks = ["store", "unstore"]
nr_boxes = warehouse.shape[0]*warehouse.shape[1]
nr_states = len(boxstates)**nr_boxes * len(tasks) * len(items)
iterables = []

#same set for every box
for i in range(nr_boxes):
    iterables.append(boxstates)

#add iterables for tasks
iterables.append(tasks)
iterables.append(items_only)

states = []
counter = 0
for state in itertools.product(*iterables):
    counter += 1
    states.append(list(state))

#actions:
#action is selecting the position in the warehouse for the given task
rows = list(np.arange(warehouse.shape[0]))
columns = list(np.arange(warehouse.shape[1]))
nr_cells = warehouse.shape[0]*warehouse.shape[1]
actions = []
for action in itertools.product(*[rows, columns]):
    actions.append(list(action))

nr_actions = len(actions)

print(nr_states,"states")
print(nr_actions, "actions")

108 states
3 actions


In [66]:
%%time

#transitions and rewards:
#this could be refactored and made pretty for sure!
#matrix AxSxS', every state has probability to get to state s' by each action
#reward depends on whether task was possible before or s==s' and distance that is determined by action

transitions = []
rewards = np.zeros(shape=(len(states), len(actions)))
it_prob = 0

for it_action in range(nr_actions):
    print("action:", it_action)
    #position in warehouse
    y,x = actions[it_action]
    # for CSR matrix
    data = []
    indptr = [0]
    indices = []
    for it_state in range(nr_states):
        #check for every state, if action is doable    
        state = states[it_state].copy()
        if state[-2] == "store":
            if state[y*warehouse.shape[0]+x] == 0:
                #item can be stored at position
                result = state.copy()
                result[y*warehouse.shape[0]+x] = state[-1]                
                #reward = 1/distance, so everyhing is positive valued and linear
                #distance = row + column + 1 (minimal distance = 1)
                distance = y+x+1
                rewards[it_state, it_action] = 1/distance
                #identify correct state in list and set probability according to item
                for next_state in range(nr_states):
                    #iterate twice (store and unstore) over item probabilities and set transition probability accordingly
                    prob = probabilities_only[it_prob]/2
                    if (result[0:-2] == states[next_state][0:-2]):           
                        it_prob += 1
                        if it_prob == len(probabilities_only):
                            it_prob = 0
                        data.append(prob)
                        indices.append(next_state)
                indptr.append(len(indices))

            else:
                #reward[it_action, it_state]
                result = state.copy()
                #identify correct state in list and set probability according to item
                for next_state in range(nr_states):
                    if (result == states[next_state]):           
                        data.append(1)
                        indices.append(next_state)    
                indptr.append(len(indices))
            
        else:
            if state[y*warehouse.shape[0]+x] == state[-1]:
                #can be unstored
                result = state.copy()
                result[y*warehouse.shape[0]+x] = 0
                distance = y+x+1
                rewards[it_state, it_action] = 1/distance
                #identify correct state in list and set probability according to item
                for next_state in range(nr_states):
                    #iterate twice (store and unstore) over item probabilities and set transition probability accordingly
                    prob = probabilities_only[it_prob]/2
                    if (result[0:-2] == states[next_state][0:-2]):           
                        it_prob += 1
                        if it_prob == len(probabilities_only):
                            it_prob = 0
                        data.append(prob)
                        indices.append(next_state)
                indptr.append(len(indices))

            else:
                #reward[it_action, it_state]
                result = state.copy()
                #identify correct state in list and set probability according to item
                for next_state in range(nr_states):
                    if (result == states[next_state]):           
                        data.append(1)
                        indices.append(next_state)
                indptr.append(len(indices))


        

    transition_sparse = csr_matrix((data, indices, indptr), shape=(nr_states ,nr_states), dtype=float)
    transitions.append(transition_sparse)


# check if transitions and rewards form a valid MDP (sizewise at least)
if util.check(transitions, rewards) is not None or len(transitions) != nr_actions or not all( i for i in [transitions[i].shape[0]==transitions[i].shape[1]==nr_states for i in range(len(transitions))]):
    raise ValueError("Size of Transition or reward matrix not correct!")

action: 0
action: 1
action: 2
Wall time: 15 ms


In [84]:
a = transitions[0].toarray()

In [68]:
# try different MDP algorithms
    #FiniteHorizon
    #PolicyIteraiton
    #PolicyIterationModified
    #QLearning
    #RelativeValueIteration
    #ValueIteration
    #ValueIteartionGS
    

In [69]:
algorithms = ["FiniteHorizon", "PolicyIteraiton", "PolicyIterationModified", "QLearning", "RelativeValueIteration", "ValueIteration", "ValueIteartionGS"]
policies = [None,None,None,None,None,None,None]
discountFactor = 0.9

In [70]:
#FiniteHorizon
fh = mdp.FiniteHorizon(transitions, rewards, discountFactor, N=10000)
fh.run()
print("fh duration iterations:", fh.time)
#use iteration of policy
policy = []
policy_iterations = fh.policy
for state in policy_iterations:
    policy.append(state[0])
policies[0] = (tuple(policy))

fh duration iterations: 51.11390829086304


In [71]:
#PolicyIteraiton
pi = mdp.PolicyIteration(transitions, rewards, discountFactor, max_iter=10000000)
pi.run()
policies[1] = pi.policy
print("pi duration:", pi.time)

pi duration: 0.0040130615234375


In [72]:
#PolicyIterationModified
pim = mdp.PolicyIterationModified(transitions, rewards, discountFactor, max_iter=1000000)
pim.run()
policies[2] = pim.policy
print("pim duration:", pim.time)

pim duration: 0.0040132999420166016


In [73]:
#QLearning
ql = mdp.QLearning(transitions, rewards, discountFactor, n_iter = 50000)
ql.run()
policies[3] = ql.policy
print("ql duration:", ql.time)

ql duration: 78.76735591888428


In [74]:
#RelativeValueIteration
rvi = mdp.RelativeValueIteration(transitions, rewards, max_iter=200000)
rvi.run()
policies[4] = rvi.policy
print("rvi duration:", rvi.time)

rvi duration: 0.0010030269622802734


In [75]:
#ValueIteration
vi = mdp.ValueIteration(transitions, rewards, discountFactor, max_iter=2000000)
vi.run()
policies[5] = vi.policy
print("vi duration:", vi.time)

vi duration: 0.0010023117065429688


In [76]:
#ValueIteartionGS
vigs = mdp.ValueIterationGS(transitions, rewards, discountFactor, max_iter=100000)
vigs.run()
policies[6] = vigs.policy
print("vigs duration:", vigs.time)

vigs duration: 0.5116922855377197


In [77]:
def get_action(policy, state):
    if len(policy)!=len(states):
        return "Policy does not match number of states!"
    #give state and get action(position based on chosen policy)
    for it_state in range(len(states)):
        #find position of state to search policy at this posiiton 
        if states[it_state]==state:
            return actions[policy[it_state]]
    return "Input state is invalid, check shape and items!"

In [82]:
#to display options to test out states
print("Warehouse input length:", warehouse.shape[0]*warehouse.shape[1])
print("Possible tasks:", tasks)
print("Possible items:", items_only)


Warehouse input length: 3
Possible tasks: ['store', 'unstore']
Possible items: [1, 2]


In [81]:
#experiemnt herer with this state and policy
test_state = [0, 0, 0, 'store', 2]

print("Possible actions:", actions)
for i in range(len(policies)):
    print("For state", test_state,"policy", algorithms[i], get_action(policies[i], test_state))

Possible actions: [[0, 0], [0, 1], [0, 2]]
For state [0, 0, 0, 'store', 2] policy FiniteHorizon [0, 1]
For state [0, 0, 0, 'store', 2] policy PolicyIteraiton [0, 1]
For state [0, 0, 0, 'store', 2] policy PolicyIterationModified [0, 1]
For state [0, 0, 0, 'store', 2] policy QLearning [0, 0]
For state [0, 0, 0, 'store', 2] policy RelativeValueIteration [0, 1]
For state [0, 0, 0, 'store', 2] policy ValueIteration [0, 1]
For state [0, 0, 0, 'store', 2] policy ValueIteartionGS [0, 1]


In [70]:
#the end