# -*- coding: utf-8 -*-
"""
Copyright 2018 Alexey Melnikov and Katja Ried.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
Please acknowledge the authors when re-using this code and maintain this notice intact.
Code written by Alexey Melnikov and Katja Ried, implementing ideas from
'Projective simulation for artificial intelligence'
Hans J. Briegel & Gemma De las Cuevas
Scientific Reports 2, Article number: 400 (2012) doi:10.1038/srep00400
and
'Projective Simulation for Classical Learning Agents: A Comprehensive Investigation'
Julian Mautner, Adi Makmal, Daniel Manzano, Markus Tiersch & Hans J. Briegel
New Generation Computing, Volume 33, Issue 1, pp 69-114 (2015) doi:10.1007/s00354-015-0102-0
"""
import __future__
import numpy as np

class BasicPSAgent(object):
    """Projective Simulation agent with two-layered network.
    Features: forgetting, glow, reflection, optional softmax rule."""

    def __init__(self, num_actions, num_percepts_list, gamma_damping, eta_glow_damping, policy_type, beta_softmax, num_reflections):
        """Initialize the basic PS agent. Arguments:
        - num_actions: integer >= 1
        - num_percepts_list: list of integers >= 1, not nested, representing the cardinality of each category/feature of percept space
        - gamma_damping: float between 0 and 1, controls forgetting/damping of h-values
        - eta_glow_damping: float between 0 and 1, controls the damping of glow; setting this to 1 effectively switches off glow
        - policy_type: string, 'standard' or 'softmax'; toggles the rule used to compute probabilities from h-values
        - beta_softmax: float >= 0, probabilities are proportional to exp(beta*h_value); if policy_type != 'softmax', this is irrelevant
        - num_reflections: integer >= 0, sets how many times the agent reflects, i.e. potentially goes back to the percept; setting this to zero effectively deactivates reflection
        """
        self.num_actions = num_actions
        self.num_percepts_list = num_percepts_list
        self.gamma_damping = gamma_damping
        self.eta_glow_damping = eta_glow_damping
        self.policy_type = policy_type
        self.beta_softmax = beta_softmax
        self.num_reflections = num_reflections

        self.num_percepts = int(np.prod(np.array(self.num_percepts_list).astype(np.float64)))  # total number of possible percepts
        # Note: the first index specifies the action, the second index specifies the percept.
        self.h_matrix = np.ones((self.num_actions, self.num_percepts), dtype=np.float64)
        self.g_matrix = np.zeros((self.num_actions, self.num_percepts), dtype=np.float64)  # glow matrix, for processing delayed rewards
        if num_reflections > 0:
            # Stores the last realized percept-action pair for use with reflection.
            # If reflection is deactivated, all necessary information is encoded in g_matrix.
            self.last_percept_action = None
            # Emoticons are initialized to True (happy, good choice) and set to False (sad, reflect again)
            # only if the percept-action pair is used and does not yield a reward.
            self.e_matrix = np.ones((self.num_actions, self.num_percepts), dtype=np.bool_)  # emoticons

    def percept_preprocess(self, observation):
        """Takes a multi-feature percept and reduces it to a single integer index.
        Input: list of integers >= 0, of the same length as self.num_percepts_list,
        respecting the cardinality specified by num_percepts_list: observation[i] < num_percepts_list[i] (strictly).
        Output: single integer."""
        percept = 0
        for which_feature in range(len(observation)):
            percept += int(observation[which_feature] * np.prod(self.num_percepts_list[:which_feature]))
        return percept
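
    # Worked example for the encoding above (the values are illustrative, not taken from the papers):
    # with num_percepts_list = [2, 3], the observation [1, 2] maps to
    # 1 * prod([]) + 2 * prod([2]) = 1 + 4 = 5, i.e. the first feature varies fastest
    # and the percept indices run from 0 to num_percepts - 1 = 5.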

    def deliberate_and_learn(self, observation, reward):
        """Given an observation and a reward (from the previous interaction), this method
        updates the h_matrix, chooses the next action, and records that choice in the g_matrix and last_percept_action.
        Arguments:
        - observation: list of integers, as specified for percept_preprocess
        - reward: float
        Output: action, represented by a single integer index."""
        # Learning and forgetting: damp the h-values towards 1, then add the glow-weighted reward.
        self.h_matrix = self.h_matrix - self.gamma_damping * (self.h_matrix - 1.) + self.g_matrix * reward
        if (self.num_reflections > 0) and (self.last_percept_action is not None) and (reward <= 0):  # reflection update
            self.e_matrix[self.last_percept_action] = 0
        percept = self.percept_preprocess(observation)
        action = np.random.choice(self.num_actions, p=self.probability_distr(percept))  # deliberate once
        for i_counter in range(self.num_reflections):  # if num_reflections >= 1, repeat deliberation if indicated
            if self.e_matrix[action, percept]:
                break
            action = np.random.choice(self.num_actions, p=self.probability_distr(percept))
        self.g_matrix = (1 - self.eta_glow_damping) * self.g_matrix  # damp the glow of previous decisions
        self.g_matrix[action, percept] = 1  # record the latest decision in g_matrix
        if self.num_reflections > 0:
            self.last_percept_action = action, percept  # record the latest decision in last_percept_action
        return action

    def probability_distr(self, percept):
        """Given a percept index, this method returns a probability distribution over actions
        (an array of length num_actions, normalized to unit sum), computed according to policy_type."""
        if self.policy_type == 'standard':
            h_vector = self.h_matrix[:, percept]
            probability_distr = h_vector / np.sum(h_vector)
        elif self.policy_type == 'softmax':
            h_vector = self.beta_softmax * self.h_matrix[:, percept]
            h_vector_mod = h_vector - np.max(h_vector)  # subtract the maximum for numerical stability
            probability_distr = np.exp(h_vector_mod) / np.sum(np.exp(h_vector_mod))
        else:
            raise ValueError("policy_type must be 'standard' or 'softmax', got %r" % self.policy_type)
        return probability_distr
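

# The block below is a minimal usage sketch, not part of the original module: it wires the agent
# into a toy two-feature environment with an arbitrary reward rule, purely to illustrate the
# deliberate_and_learn interaction loop. All parameter values and the reward rule are
# illustrative assumptions.
if __name__ == "__main__":
    agent = BasicPSAgent(num_actions=2, num_percepts_list=[2, 3],
                         gamma_damping=0.0, eta_glow_damping=1.0,
                         policy_type='softmax', beta_softmax=1.0, num_reflections=0)
    reward = 0.0
    for trial in range(100):
        # Draw a random percept from the toy percept space.
        observation = [np.random.randint(2), np.random.randint(3)]
        action = agent.deliberate_and_learn(observation, reward)
        # Toy reward rule (an assumption for demonstration only): reward the agent whenever
        # the chosen action matches the first percept feature.
        reward = 1.0 if action == observation[0] else 0.0
    print("learned h_matrix:")
    print(agent.h_matrix)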