# Start
We use the dining philosophers model in this demo.

In [2]:
import maude

# Path to the Maude source file of the dining-philosophers model used in this demo.
model = './benchmarks/dining-philosophers/hs-dp.maude'
# The bindings are initialized first, then the model file is loaded.
maude.init()
maude.load(model)
# `m` is the module that is current after the load; later cells pass it to the environment.
m = maude.getCurrentModule()
print('Using', m, 'module')

Using HEURISTIC-SEARCH-DP module
reduce in HEURISTIC-SEARCH-DP : abst(p(0, single) || p(1, single) || p(2, hungry)) .
rewrites: 5 in 0ms cpu (0ms real) (~ rewrites/second)
result AState: p(--, hungry) || p(--, single)


# Environment
The Maude model of the dining philosophers is simulated by a `MaudeEnv` object.
To create a `MaudeEnv` for the DP Maude model, you need to provide:
* the Maude module
* an initializer function that returns a term string for the initial state
* the goal proposition

In [3]:
import random

def dp_sampler(N):
    """Build a random dining-philosophers state term for N philosophers.

    Every chopstick k is independently left on the table, handed to
    philosopher (k-1) % N, or handed to philosopher k.  A philosopher who
    received no chopstick is randomly `think` or `hungry`; one chopstick
    means `single` and two mean `eat`.

    Returns the term as a string: `p(i,<status>)` for each philosopher,
    followed by `c(i)` when chopstick i is still on the table, all joined
    with ' || '.
    """
    held = [0 for _ in range(N)]          # chopsticks received per philosopher
    on_table = [True for _ in range(N)]   # chopstick k not handed to anybody

    for k in range(N):
        pick = random.randrange(3)
        if pick == 1:
            receiver = (k - 1) % N        # handed to the left
        elif pick == 2:
            receiver = k                  # handed to the right
        else:
            continue                      # chopstick k stays on the table
        held[receiver] += 1
        on_table[k] = False

    # Status codes: 0 think, 1 hungry, 2 single, 3 eat.
    codes = []
    for count in held:
        if count:
            codes.append(count + 1)
        else:
            codes.append(random.randrange(2))

    terms = []
    for i, code in enumerate(codes):
        rendered = {
            0: f'p({i},think)',
            1: f'p({i},hungry)',
            2: f'p({i},single)',
            3: f'p({i},eat)',
        }
        if code in rendered:
            terms.append(rendered[code])
        if on_table[i]:
            terms.append(f'c({i})')
    return ' || '.join(terms)

# Draw one random 5-philosopher state term to inspect what the sampler produces.
dp_sampler(5)

'p(0,single) || p(1,think) || c(1) || p(2,eat) || p(3,hungry) || p(4,hungry) || c(4)'

In [5]:
from AGCEL.MaudeEnv import MaudeEnv

# Arguments: the loaded module `m`, the goal proposition 'deadlock', and an
# initializer that returns a fresh random 5-philosopher state term on each call.
env = MaudeEnv(m,'deadlock',lambda : dp_sampler(5))
print(env.get_obs())

{'state': c(0) || c(3) || c(4) || p(0, hungry) || p(1, single) || p(2, single) || p(3, think) || p(4, hungry), 'astate': p(--, think) || p(--, hungry) || p(--, single), 'actions': [<label: th, asubs: {I: --}>, <label: hs, asubs: {I: --, J:Nat: --}>, <label: hs, asubs: {I: --, J:Nat: --}>, <label: hs, asubs: {I: --, J:Nat: --}>, <label: se, asubs: {I: --, J:Nat: --}>]}


# Training
Train using the `train()` method of the `QLearner` class.

In [6]:
from AGCEL.QLearning import QLearner

learner = QLearner()
# NOTE(review): the second argument (100) is presumably the number of training
# episodes — confirm against AGCEL.QLearning.QLearner.train.
stat = learner.train(env, 100)
# Show the size reported by the learner, then the learned table itself.
print(learner.get_size())
print(learner.q_dict)

training done!
28
{p(--, hungry) || p(--, single): {<label: hs, asubs: {I: --, J:Nat: --}>: 0.9729310918170474, <label: se, asubs: {I: --, J:Nat: --}>: 0.7860750035712095}, p(--, think) || p(--, single): {<label: th, asubs: {I: --}>: 0.8587932748646855, <label: se, asubs: {I: --, J:Nat: --}>: 0.7753329365589599}, p(--, think) || p(--, hungry) || p(--, single): {<label: hs, asubs: {I: --, J:Nat: --}>: 0.7972491866423093, <label: th, asubs: {I: --}>: 0.8798508900446251, <label: se, asubs: {I: --, J:Nat: --}>: 0.7736724988644834}, p(--, think) || p(--, single) || p(--, eat): {<label: et, asubs: {I: --}>: 0.8238105311243668, <label: th, asubs: {I: --}>: 0.7828822743570798, <label: se, asubs: {I: --, J:Nat: --}>: 0.7712825643146353}, p(--, think) || p(--, hungry) || p(--, single) || p(--, eat): {<label: et, asubs: {I: --}>: 0.8254719747057041, <label: hs, asubs: {I: --, J:Nat: --}>: 0.8090148698323536, <label: th, asubs: {I: --}>: 0.7931921199971858, <label: se, asubs: {I: --, J:Nat: --}>: 

# Evaluation
Compare the number of states explored by QHS and by BFS.

In [7]:
from AGCEL.HeuristicSearch import HeuristicSearch

# Five trials: each builds a searcher from a fresh random initial state, runs the
# 'qhs' search, resets to the same initial state, and runs the 'bfs' search.
for _ in range(5):
    env = HeuristicSearch(m,'deadlock',lambda : dp_sampler(5), learner)
    qhs_result = env.search(mode='qhs')
    # Restart from the initial state recorded by the searcher so both modes begin at the same state.
    env.reset(env.last_init)
    bfs_result = env.search(mode='bfs')
    # Printed pair: qhs result first, bfs result second.
    print(qhs_result, bfs_result)

goal reached!
goal reached!
26 232
goal reached!
goal reached!
13 254
goal reached!
goal reached!
11 114
goal reached!
goal reached!
40 951
goal reached!
goal reached!
12 128


### ------------- IGNORE BELOW ------------

In [14]:
import heapq

class TermWrapper:
    """Holder that lets an arbitrary term sit inside a heapq priority tuple.

    heapq compares (priority, item) tuples element by element, so when two
    priorities tie it falls back to comparing the items.  Wrapping the term
    gives that fallback a defined answer without requiring the term itself
    to support ordering.
    """

    def __init__(self, t):
        # The wrapped term; read back by callers through `.t`.
        self.t = t

    def __lt__(self, other):
        """Never order one wrapper before another.

        Returns a real bool (False) so priority ties are left in whatever
        order the heap already holds them.
        """
        return False

class HeuristicSearch(MaudeEnv):
    """Priority-queue exploration of the model's state space on top of MaudeEnv.

    `search` supports two frontier orderings: 'qhs' ranks successors by the
    Q-value that the table `qt` assigns to the expanded state and the action
    leading to the successor; 'bfs' ranks them by the step at which they
    were generated.
    """

    def __init__(self, m, goal, initializer, qt):
        """Create the environment and remember the Q-table and the initial state.

        m, goal, initializer -- forwarded unchanged to MaudeEnv.__init__.
        qt -- object exposing get_q(astate, action); used by `score`.
        """
        MaudeEnv.__init__(self, m, goal, initializer)
        self.qt = qt
        # Kept so callers can reset(...) back to the state this searcher started from.
        self.last_init = self.state

    def get_nbrs(self):
        """Return a list of (next_state, action_label) pairs for the current state."""
        # Each search result is unpacked as (term, substitution, path, rewrite count);
        # the label is taken from element 1 of the path.
        # NOTE(review): search-type code 1 with depth=1 is presumably a one-step
        # search — confirm against the maude bindings.
        return [(t, path()[1].getLabel()) for t, subs, path, nrew in self.state.search(1, self.m.parseTerm('X:State'), depth = 1)]

    def score(self, s, a):
        """Return the Q-value of action `a` in the abstraction of concrete state `s`."""
        astate = self.abst(s)
        return self.qt.get_q(astate, a)

    def search(self, mode='qhs', max_step=1000):
        """Explore from the current state until the goal holds or the budget runs out.

        mode     -- 'qhs' (Q-value guided) or 'bfs'.
        max_step -- maximum number of distinct states to expand.

        Returns the number of distinct states expanded.  Prints
        'goal reached!' when a goal state is popped.

        Raises ValueError for an unsupported mode (previously this surfaced
        as an unbound-variable error in the middle of the search).
        """
        if mode not in ('qhs', 'bfs'):
            raise ValueError(f"unknown search mode {mode!r}; expected 'qhs' or 'bfs'")

        visited = set()
        i = 0
        queue = [(i, TermWrapper(self.state))]  # (priority, wrapped concrete state)

        while queue and i < max_step:
            state = heapq.heappop(queue)[1].t
            # Move the environment to the popped state so is_goal()/get_nbrs() refer to it.
            self.reset(state)
            if state in visited:
                continue
            i += 1
            visited.add(state)
            if self.is_goal():
                print('goal reached!')
                break
            if mode == 'bfs':
                # Successors are tagged with the current expansion count.
                q_items = [(i, TermWrapper(next_state)) for (next_state, a) in self.get_nbrs()]
            else:
                # heapq pops the smallest key, so negate the score to expand the best first.
                q_items = [(-self.score(state, a), TermWrapper(next_state)) for (next_state, a) in self.get_nbrs()]
            for item in q_items:
                heapq.heappush(queue, item)
        return i

In [40]:
class QLearner():
    """Tabular Q-learning agent keyed by (abstract state, action).

    The table is sparse: `q_dict[s][a]` exists only for values that differ
    from the default `q_init`; every other pair implicitly has value `q_init`.
    States and actions must be hashable.
    """

    def __init__(self):
        self.q_init = 0.0     # value of every (state, action) pair not stored
        self.q_dict = dict()  # state -> {action -> q}

    def get_q(self, s, a):
        """Return Q(s, a), or `q_init` when the pair is not stored."""
        return self.q_dict.get(s, {}).get(a, self.q_init)

    def set_q(self, s, a, q):
        """Record Q(s, a) = q while keeping the table sparse.

        A value equal to `q_init` is never stored.  If the pair already has
        an entry, that entry is removed, so get_q does not keep returning a
        stale value (the update used to be silently dropped).
        """
        # TODO deepcopy terms
        if q == self.q_init:
            row = self.q_dict.get(s)
            if row is not None and a in row:
                del row[a]
                if not row:
                    # Never leave an empty row behind: max_q assumes rows are non-empty.
                    del self.q_dict[s]
            return
        self.q_dict.setdefault(s, {})[a] = q

    def argmax_q(self, s, actions):
        """Return an action from `actions` with the highest Q(s, action).

        Ties, including the all-default case of a state that has no stored
        entries, are broken uniformly at random.  Returns -1 only when
        `actions` is empty.
        """
        if len(actions) == 0:
            return -1
        scored = [(self.get_q(s, a), a) for a in actions]
        best = max(q for q, _ in scored)
        return random.choice([a for q, a in scored if q == best])

    def max_q(self, s):
        """Return the largest stored Q-value for `s`, or `q_init` if `s` has no entries."""
        row = self.q_dict.get(s)
        if row:
            return max(row.values())
        return self.q_init

    def get_size(self):
        """Return the number of stored (state, action) entries."""
        return sum(len(row) for row in self.q_dict.values())

    def print_v(self):
        """Print a Maude functional module mapping each stored state to its max Q-value."""
        print('fmod SCORE is')
        for t in self.q_dict:
            print(f'  eq score({t}) = {self.max_q(t)} .')
        print(f'  eq score(X) = {self.q_init} [owise] .')
        print('endfm')

    def print_q(self):
        """Print a Maude module with one `score` equation per stored (state, action) entry."""
        print('load dp.maude')
        print('mod SCORE is')
        print('  pr DP5 .')
        print('  pr FLOAT .')
        print('  op score : AConf AConf -> Float .')
        for t1, row in self.q_dict.items():
            for t2, q in row.items():
                print(f'  eq score({t1}, {t2}) = {q} .')
        print(f'  eq score(X:AConf, Y:AConf) = {self.q_init} [owise] .') # TODO: 0 should be printed 0.0
        print('endm')

    def greedy_policy(self, obs):
        """Return the best action for observation `obs`, or -1 if it offers no actions.

        `obs` is read through its "astate" and "actions" keys.
        """
        return self.argmax_q(obs["astate"], obs["actions"])

    def eps_greedy_policy(self, obs, epsilon):
        """Pick a uniformly random action with probability ~epsilon, else the greedy one.

        Returns -1 when the observation offers no actions.
        """
        r = random.uniform(0, 1)
        if r > epsilon: # exploitation
            return self.greedy_policy(obs)
        # exploration
        actions = obs["actions"]
        if len(actions) != 0:
            return random.choice(actions)
        return -1

    def train(self, env, n_training_episodes, min_epsilon, max_epsilon, decay_rate, max_steps,
              learning_rate=None, gamma=None):
        """Run epsilon-greedy Q-learning on `env` and update the table in place.

        env -- must provide reset() -> obs and step(a) -> (obs, reward, done),
               where obs has "astate" and "actions" keys.
        n_training_episodes -- number of episodes to run.
        min_epsilon, max_epsilon, decay_rate -- exploration schedule:
               epsilon decays exponentially from max towards min per episode.
        max_steps -- step limit per episode.
        learning_rate, gamma -- step size and discount factor.  When omitted
               they are taken from the notebook-level variables of the same
               name (the previous behaviour), falling back to 0.7 and 0.95
               (the notebook's parameter-cell values) if those are not defined.

        Relies on `tqdm` and `np` being available in the notebook namespace.
        Returns (q_dict, stat), where stat is the sum of all rewards observed.
        """
        notebook_ns = globals()
        if learning_rate is None:
            learning_rate = notebook_ns.get('learning_rate', 0.7)
        if gamma is None:
            gamma = notebook_ns.get('gamma', 0.95)

        stat = 0
        for episode in tqdm(range(n_training_episodes)):
            # Reduce epsilon (because we need less and less exploration)
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

            obs = env.reset()
            for _ in range(max_steps):
                s = obs["astate"]
                a = self.eps_greedy_policy(obs, epsilon)

                # The policies return the int -1 when no action is available: end the episode.
                if isinstance(a, int):
                    break

                obs, reward, done = env.step(a)
                ns = obs['astate']
                stat += reward

                # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
                old_q = self.get_q(s, a)
                self.set_q(s, a, old_q + learning_rate * (reward + gamma * self.max_q(ns) - old_q))

                # If terminated or truncated finish the episode
                if done:
                    break

        print('training done!')
        return self.q_dict, stat

In [25]:
'''
def greedy_policy(Qt, obs):
    # Exploitation: take the action with the highest state, action value
    astate = obs["astate"]
    actions = obs["actions"]
    return Qt.argmax_q(astate,actions)

def eps_greedy_policy(Qtable, obs, epsilon):
    r = random.uniform(0, 1)
    if r > epsilon: # exploitation
        return greedy_policy(Qtable, obs)
    else: # exploration
        actions = obs["actions"]
        if len(actions) != 0:
            return random.choice(actions)
        else:
            return -1
'''
        
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, qt,
          learning_rate=None, gamma=None):
    """Run epsilon-greedy Q-learning on `env`, updating the table `qt` in place.

    n_training_episodes -- number of episodes to run.
    min_epsilon, max_epsilon, decay_rate -- exploration schedule: epsilon
        decays exponentially from max towards min with the episode index.
    env -- must provide reset() -> obs and step(a) -> (obs, reward, done),
        where obs has an "astate" key.
    max_steps -- step limit per episode.
    qt -- table object providing eps_greedy_policy, get_q, set_q and max_q.
    learning_rate, gamma -- step size and discount factor.  When omitted
        they are taken from the notebook-level variables of the same name
        (the previous behaviour), falling back to 0.7 and 0.95 (the
        notebook's parameter-cell values) if those are not defined.

    Relies on `tqdm` and `np` being available in the notebook namespace.
    Returns (qt, stat), where stat is the sum of all rewards observed.
    """
    notebook_ns = globals()
    if learning_rate is None:
        learning_rate = notebook_ns.get('learning_rate', 0.7)
    if gamma is None:
        gamma = notebook_ns.get('gamma', 0.95)

    stat = 0
    for episode in tqdm(range(n_training_episodes)):
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        # Reset the environment
        obs = env.reset()

        for _ in range(max_steps):
            # Choose the action using the epsilon-greedy policy
            s = obs["astate"]
            a = qt.eps_greedy_policy(obs, epsilon)

            # The policy returns the int -1 when no action is available: end the episode.
            if isinstance(a, int):
                break

            # Take the action (a) and observe the outcome state (s') and reward (r)
            obs, reward, done = env.step(a)
            ns = obs['astate']
            stat += reward

            # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
            old_q = qt.get_q(s, a)
            qt.set_q(s, a, old_q + learning_rate * (reward + gamma * qt.max_q(ns) - old_q))

            # If terminated or truncated finish the episode
            if done:
                break

    print('training done!')
    return qt, stat

In [26]:
# Training parameters
n_training_episodes = 100  # Total training episodes
learning_rate = 0.7  # Learning rate; read as a global by the train() code in this notebook

# Evaluation parameters
n_eval_episodes = 100  # Total number of test episodes (not referenced by any visible cell)

# Environment parameters
#env_id = "FrozenLake-v1"  # Name of the environment
max_steps = 300  # Max steps per episode
gamma = 0.95  # Discounting rate; read as a global by the train() code in this notebook
eval_seed = []  # The evaluation seed of the environment (not referenced by any visible cell)

# Exploration parameters: epsilon = min + (max - min) * exp(-decay_rate * episode)
max_epsilon = 1.0  # Exploration probability at start
min_epsilon = 0.05  # Minimum exploration probability
decay_rate = 0.0005  # Exponential decay rate for exploration prob

In [27]:
# train Qtable
#env = MaudeEnv(dp_generator)
# NOTE(review): `dp_generator` is not defined in any visible cell — confirm where it
# comes from (the `dp_sampler` defined earlier requires an N argument).
env = MaudeEnv(m,'deadlock',dp_generator)
Qtable = QTable()
# train() returns a (table, stat) tuple, so after this line `Qtable` names that
# tuple rather than the QTable instance created above.
Qtable = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable)

  0%|          | 0/100 [00:00<?, ?it/s]

training done!


In [28]:
# `Qtable` holds the (table, stat) tuple returned by train(); keep the table as `qt`.
qt=Qtable[0]

In [39]:
# Display the learned table: abstract state -> {action: Q-value}.
qt.q_dict

{p(--, hungry) || p(--, single): {'hs': 0.986145285034768,
  'se': 0.7278900926619839},
 p(--, think) || p(--, single): {'th': 0.8075372104341592,
  'se': 0.7562435959691568},
 p(--, think) || p(--, single) || p(--, eat): {'et': 0.7965121647914764,
  'th': 0.7148195844801836,
  'se': 0.7363522087237413},
 p(--, think) || p(--, hungry) || p(--, single) || p(--, eat): {'hs': 0.7487671188435407,
  'et': 0.7501174529798867,
  'th': 0.7257537928562119,
  'se': 0.6708908792568634},
 p(--, think) || p(--, hungry) || p(--, single): {'se': 0.7050434538998628,
  'th': 0.7843602857822237,
  'hs': 0.7759559170731525},
 p(--, think) || p(--, hungry) || p(--, eat): {'hs': 0.7111647379228591,
  'th': 0.6962760899866769,
  'et': 0.6918448527266022},
 p(--, think) || p(--, hungry): {'hs': 0.7806181721044272,
  'th': 0.7147510240924144},
 p(--, hungry) || p(--, eat): {'et': 0.6831780357946137,
  'hs': 0.7952513472362125},
 p(--, hungry) || p(--, single) || p(--, eat): {'et': 0.7457844198602055,
  'se': 

In [24]:
class QTable():
    """Sparse Q-table keyed by (abstract state, action) with epsilon-greedy policies.

    `q_dict[s][a]` exists only for values that differ from the default
    `q_init`; every other pair implicitly has value `q_init`.  States and
    actions must be hashable.
    """

    def __init__(self):
        self.q_init = 0.0     # value of every (state, action) pair not stored
        self.q_dict = dict()  # state -> {action -> q}

    def get_q(self, s, a):
        """Return Q(s, a), or `q_init` when the pair is not stored."""
        return self.q_dict.get(s, {}).get(a, self.q_init)

    def set_q(self, s, a, q):
        """Record Q(s, a) = q while keeping the table sparse.

        A value equal to `q_init` is never stored.  If the pair already has
        an entry, that entry is removed, so get_q does not keep returning a
        stale value (the update used to be silently dropped).
        """
        # TODO deepcopy terms
        if q == self.q_init:
            row = self.q_dict.get(s)
            if row is not None and a in row:
                del row[a]
                if not row:
                    # Never leave an empty row behind: max_q assumes rows are non-empty.
                    del self.q_dict[s]
            return
        self.q_dict.setdefault(s, {})[a] = q

    def argmax_q(self, s, actions):
        """Return an action from `actions` with the highest Q(s, action).

        Ties, including the all-default case of a state that has no stored
        entries, are broken uniformly at random.  Returns -1 only when
        `actions` is empty.
        """
        if len(actions) == 0:
            return -1
        scored = [(self.get_q(s, a), a) for a in actions]
        best = max(q for q, _ in scored)
        return random.choice([a for q, a in scored if q == best])

    def max_q(self, s):
        """Return the largest stored Q-value for `s`, or `q_init` if `s` has no entries."""
        row = self.q_dict.get(s)
        if row:
            return max(row.values())
        return self.q_init

    def get_size(self):
        """Return the number of stored (state, action) entries."""
        return sum(len(row) for row in self.q_dict.values())

    def print_v(self):
        """Print a Maude functional module mapping each stored state to its max Q-value."""
        print('fmod SCORE is')
        for t in self.q_dict:
            print(f'  eq score({t}) = {self.max_q(t)} .')
        print(f'  eq score(X) = {self.q_init} [owise] .')
        print('endfm')

    def print_q(self):
        """Print a Maude module with one `score` equation per stored (state, action) entry."""
        print('load dp.maude')
        print('mod SCORE is')
        print('  pr DP5 .')
        print('  pr FLOAT .')
        print('  op score : AConf AConf -> Float .')
        for t1, row in self.q_dict.items():
            for t2, q in row.items():
                print(f'  eq score({t1}, {t2}) = {q} .')
        print(f'  eq score(X:AConf, Y:AConf) = {self.q_init} [owise] .') # TODO: 0 should be printed 0.0
        print('endm')

    def greedy_policy(self, obs):
        """Return the best action for observation `obs`, or -1 if it offers no actions.

        `obs` is read through its "astate" and "actions" keys.
        """
        return self.argmax_q(obs["astate"], obs["actions"])

    def eps_greedy_policy(self, obs, epsilon):
        """Pick a uniformly random action with probability ~epsilon, else the greedy one.

        Returns -1 when the observation offers no actions.
        """
        r = random.uniform(0, 1)
        if r > epsilon: # exploitation
            # Use this instance's own policy; the previous code called a
            # module-level greedy_policy(Qtable, obs) that is not defined.
            return self.greedy_policy(obs)
        # exploration
        actions = obs["actions"]
        if len(actions) != 0:
            return random.choice(actions)
        return -1