# -*- coding: utf-8 -*-
"""
Copyright 2018 Alexey Melnikov and Katja Ried.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Please acknowledge the authors when re-using this code and maintain this notice intact.

Code written by Alexey Melnikov and Katja Ried, implementing ideas from

'Projective simulation for artificial intelligence'
Hans J. Briegel & Gemma De las Cuevas
Scientific Reports 2, Article number: 400 (2012) doi:10.1038/srep00400

and

'Projective Simulation for Classical Learning Agents: A Comprehensive Investigation'
Julian Mautner, Adi Makmal, Daniel Manzano, Markus Tiersch & Hans J. Briegel
New Generation Computing, Volume 33, Issue 1, pp 69-114 (2015) doi:10.1007/s00354-015-0102-0
"""
import __future__
import numpy as np
class BasicPSAgent(object):
    """Projective Simulation agent with two-layered network.
    Features: forgetting, glow, reflection, optional softmax rule."""

    def __init__(self, num_actions, num_percepts_list, gamma_damping, eta_glow_damping, policy_type, beta_softmax, num_reflections):
        """Initialize the basic PS agent. Arguments:
        - num_actions: integer >= 1
        - num_percepts_list: list of integers >= 1, not nested, representing the cardinality of each category/feature of percept space
        - gamma_damping: float between 0 and 1, controls forgetting/damping of h-values
        - eta_glow_damping: float between 0 and 1, controls the damping of glow; setting this to 1 effectively switches off glow
        - policy_type: string, 'standard' or 'softmax'; toggles the rule used to compute probabilities from h-values
        - beta_softmax: float >= 0, probabilities are proportional to exp(beta*h_value); if policy_type != 'softmax', this is irrelevant
        - num_reflections: integer >= 0, setting how many times the agent reflects, i.e. potentially goes back to the percept; setting this to zero effectively deactivates reflection
        """
        self.num_actions = num_actions
        self.num_percepts_list = num_percepts_list
        self.gamma_damping = gamma_damping
        self.eta_glow_damping = eta_glow_damping
        self.policy_type = policy_type
        self.beta_softmax = beta_softmax
        self.num_reflections = num_reflections

        self.num_percepts = int(np.prod(np.array(self.num_percepts_list, dtype=np.int64)))  # total number of possible percepts
        self.h_matrix = np.ones((self.num_actions, self.num_percepts), dtype=np.float64)  # Note: the first index specifies the action, the second index specifies the percept.
        self.g_matrix = np.zeros((self.num_actions, self.num_percepts), dtype=np.float64)  # glow matrix, for processing delayed rewards
        if num_reflections > 0:
            self.last_percept_action = None  # stores the last realized percept-action pair for use with reflection. If reflection is deactivated, all necessary information is encoded in g_matrix.
            self.e_matrix = np.ones((self.num_actions, self.num_percepts), dtype=np.bool_)  # emoticons
            # emoticons are initialized to True (happy, good choice) and set to False (sad, reflect again) only if the percept-action pair is used and does not yield a reward.
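    # Illustration of the network size (example values, not from the original code): for
    # num_actions = 2 and num_percepts_list = [2, 3] there are num_percepts = 6 possible
    # percepts, so h_matrix, g_matrix and e_matrix each have shape (2, 6), one entry per
    # edge between a percept clip and an action clip in the two-layered network.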
    def percept_preprocess(self, observation):  # preparing for creating a percept
        """Takes a multi-feature percept and reduces it to a single integer index.
        Input: list of integers >= 0, of the same length as self.num_percepts_list,
        respecting the cardinality specified by num_percepts_list: observation[i] < num_percepts_list[i] (strictly).
        Output: single integer."""
        percept = 0
        for which_feature in range(len(observation)):
            percept += int(observation[which_feature] * np.prod(self.num_percepts_list[:which_feature]))
        return percept
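    # Worked example for illustration (not part of the original code): with
    # num_percepts_list = [3, 4], the observation [2, 1] maps to 2 * 1 + 1 * 3 = 5,
    # i.e. the features act as digits of a mixed-radix number, first feature varying fastest.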
    def deliberate_and_learn(self, observation, reward):
        """Given an observation and a reward (from the previous interaction), this method
        updates the h_matrix, chooses the next action and records that choice in the g_matrix and last_percept_action.
        Arguments:
        - observation: list of integers, as specified for percept_preprocess
        - reward: float
        Output: action, represented by a single integer index."""
        self.h_matrix = self.h_matrix - self.gamma_damping * (self.h_matrix - 1.) + self.g_matrix * reward  # learning and forgetting
        if (self.num_reflections > 0) and (self.last_percept_action is not None) and (reward <= 0):  # reflection update
            self.e_matrix[self.last_percept_action] = 0
        percept = self.percept_preprocess(observation)
        action = np.random.choice(self.num_actions, p=self.probability_distr(percept))  # deliberate once
        for i_counter in range(self.num_reflections):  # if num_reflections >= 1, repeat deliberation if indicated
            if self.e_matrix[action, percept]:
                break  # the current choice is not flagged as unrewarded; stop reflecting
            action = np.random.choice(self.num_actions, p=self.probability_distr(percept))
        self.g_matrix = (1 - self.eta_glow_damping) * self.g_matrix  # damp the glow of all previously used percept-action pairs
        self.g_matrix[action, percept] = 1  # record latest decision in g_matrix
        if self.num_reflections > 0:
            self.last_percept_action = action, percept  # record latest decision in last_percept_action
        return action
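    # Numerical illustration (values assumed, not from the original code): with
    # gamma_damping = 0.1, a previous h-value of 3.0, glow g = 0.5 and reward = 2.0,
    # the h-update above gives 3.0 - 0.1 * (3.0 - 1.0) + 0.5 * 2.0 = 3.8. Without rewards,
    # h-values decay towards 1, while the glow matrix spreads a delayed reward over the
    # percept-action pairs used in recent time steps.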
    def probability_distr(self, percept):
        """Given a percept index, this method returns a probability distribution over actions
        (an array of length num_actions normalized to unit sum) computed according to policy_type."""
        if self.policy_type == 'standard':
            h_vector = self.h_matrix[:, percept]
            probability_distr = h_vector / np.sum(h_vector)
        elif self.policy_type == 'softmax':
            h_vector = self.beta_softmax * self.h_matrix[:, percept]
            h_vector_mod = h_vector - np.max(h_vector)  # subtract the maximum for numerical stability; the softmax result is unchanged
            probability_distr = np.exp(h_vector_mod) / np.sum(np.exp(h_vector_mod))
        return probability_distr
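

# Minimal usage sketch, not part of the original file: the environment below (random
# two-feature observations with a made-up reward rule) and all parameter values are
# assumptions, only meant to show how BasicPSAgent is typically driven in an interaction loop.
if __name__ == '__main__':
    agent = BasicPSAgent(num_actions=2, num_percepts_list=[2, 3],
                         gamma_damping=0.01, eta_glow_damping=0.1,
                         policy_type='softmax', beta_softmax=1.0, num_reflections=1)
    reward = 0.0
    for trial in range(100):
        observation = [np.random.randint(2), np.random.randint(3)]  # one value per percept feature
        action = agent.deliberate_and_learn(observation, reward)
        reward = 1.0 if action == observation[0] else 0.0  # toy reward: action matches the first feature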