# -*- coding: utf-8 -*-
"""
Copyright 2018 Alexey Melnikov and Katja Ried.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
Please acknowledge the authors when re-using this code and maintain this notice intact.
Code written by Alexey Melnikov, implementing ideas from

'Projective simulation with generalization'
Alexey A. Melnikov, Adi Makmal, Vedran Dunjko & Hans J. Briegel
Scientific Reports 7, Article number: 14430 (2017) doi:10.1038/s41598-017-14740-y

'Projective simulation for artificial intelligence'
Hans J. Briegel & Gemma De las Cuevas
Scientific Reports 2, Article number: 400 (2012) doi:10.1038/srep00400
"""
import __future__
import numpy as np
from scipy.sparse import lil_matrix
class BasicPSAgent(object):
    """Projective Simulation agent with two-layered network.

    Features: forgetting, glow, reflection, optional softmax rule.
    """

    def __init__(self, num_actions, num_percepts_list, gamma_damping,
                 eta_glow_damping, policy_type, beta_softmax, num_reflections):
        """Initialize the basic PS agent.

        Arguments:
        - num_actions: integer >=1,
        - num_percepts_list: list of integers >=1, not nested, representing
          the cardinality of each category/feature of percept space.
        - gamma_damping: float between 0 and 1, controls forgetting/damping
          of h-values
        - eta_glow_damping: float between 0 and 1, controls the damping of
          glow; setting this to 1 effectively switches off glow
        - policy_type: string, 'standard' or 'softmax'; toggles the rule used
          to compute probabilities from h-values
        - beta_softmax: float >=0, probabilities are proportional to
          exp(beta*h_value). If policy_type != 'softmax', then this is
          irrelevant.
        - num_reflections: integer >=0 setting how many times the agent
          reflects, ie potentially goes back to the percept. Setting this to
          zero effectively deactivates reflection.
        """
        self.num_actions = num_actions
        self.num_percepts_list = num_percepts_list
        self.gamma_damping = gamma_damping
        self.eta_glow_damping = eta_glow_damping
        self.policy_type = policy_type
        self.beta_softmax = beta_softmax
        self.num_reflections = num_reflections

        # Total number of possible percepts: the product of the cardinalities
        # of all features (integer arithmetic keeps the count exact).
        self.num_percepts = int(
            np.prod(np.asarray(self.num_percepts_list, dtype=np.int64))
        )

        # Both matrices are indexed as [action, percept]. An all-zero column
        # of h_matrix marks a percept that has not been encountered yet.
        shape = (self.num_actions, self.num_percepts)
        self.h_matrix = lil_matrix(shape, dtype=np.float32)
        self.g_matrix = lil_matrix(shape, dtype=np.float32)

        # Stores the last realized (action, percept) pair for use with
        # reflection. If reflection is deactivated, all necessary information
        # is encoded in g_matrix.
        self.last_percept_action = None
        if num_reflections > 0:
            # Emoticons: set to True (happy, good choice) when a percept is
            # created and to False (sad, reflect again) only if the
            # percept-action pair is used and does not yield a reward.
            self.e_matrix = lil_matrix(shape, dtype=np.bool_)

    def percept_preprocess(self, observation):
        """Take a multi-feature percept and reduce it to a single integer index.

        Input: list of integers >=0, of the same length as
        self.num_percepts_list, respecting the cardinality specified there:
        observation[i] < num_percepts_list[i] (strictly).
        Output: single integer.
        """
        percept = 0
        stride = 1  # product of the cardinalities of all preceding features
        for value, cardinality in zip(observation, self.num_percepts_list):
            percept += int(value * stride)
            stride *= int(cardinality)
        return percept

    def deliberate_and_learn(self, observation, reward):
        """Update the h-values, choose the next action and record the choice.

        Given an observation and a reward (from the previous interaction),
        this method updates the h_matrix, chooses the next action and records
        that choice in the g_matrix and last_percept_action.

        Arguments:
        - observation: list of integers, as specified for percept_preprocess,
        - reward: float
        Output: action, represented by a single integer index.
        """
        if self.gamma_damping != 0:
            # Forgetting: pull every stored h-value towards 1. Only the
            # explicitly stored entries are touched, so columns of percepts
            # that were never encountered stay empty.
            damped = self.h_matrix.tocsc()
            damped.data = (1 - self.gamma_damping) * damped.data + self.gamma_damping
            self.h_matrix = damped.tolil()

        # Learning: reinforce edges in proportion to their glow. The sum is
        # converted back to LIL explicitly so that the column assignments
        # below stay cheap whatever format the addition returns.
        self.h_matrix = (self.h_matrix + self.g_matrix * reward).tolil()

        # Reflection update: an unrewarded choice gets a sad emoticon.
        if (self.num_reflections > 0
                and self.last_percept_action is not None
                and reward <= 0):
            self.e_matrix[self.last_percept_action] = 0

        percept = self.percept_preprocess(observation)
        if self.h_matrix[:, percept].sum() == 0:
            # The percept is new: create it with uniform h-values, no glow
            # and (if reflection is active) happy emoticons.
            self.h_matrix[:, percept] = 1
            self.g_matrix[:, percept] = 0
            if self.num_reflections > 0:
                self.e_matrix[:, percept] = 1

        # Deliberate once ...
        action = np.random.choice(self.num_actions, p=self.probability_distr(percept))
        # ... and, if num_reflections >= 1, repeat deliberation while the
        # chosen percept-action pair carries a sad emoticon.
        for _ in range(self.num_reflections):
            if self.e_matrix[action, percept]:
                break  # happy emoticon: keep this action
            action = np.random.choice(self.num_actions, p=self.probability_distr(percept))

        # Damp the glow of all edges, then record the latest decision.
        self.g_matrix = ((1 - self.eta_glow_damping) * self.g_matrix).tolil()
        self.g_matrix[action, percept] = 1
        if self.num_reflections > 0:
            self.last_percept_action = action, percept

        return action

    def probability_distr(self, percept):
        """Return the probability distribution over actions for a percept.

        Given a percept index, this method returns an array of length
        num_actions normalized to unit sum, computed according to policy_type.

        Raises ValueError if policy_type is neither 'standard' nor 'softmax'.
        """
        if self.policy_type not in ('standard', 'softmax'):
            raise ValueError(
                "policy_type must be 'standard' or 'softmax', got %r"
                % (self.policy_type,)
            )

        # Work in float64 so that the normalized probabilities sum to one
        # within the tolerance np.random.choice expects.
        h_vector = self.h_matrix[:, percept].toarray().ravel().astype(np.float64)

        if self.policy_type == 'standard':
            return h_vector / np.sum(h_vector)

        # Softmax: subtract the maximum before exponentiating so that large
        # beta * h values cannot overflow.
        scaled = self.beta_softmax * h_vector
        weights = np.exp(scaled - np.max(scaled))
        return weights / np.sum(weights)