# -*- coding: utf-8 -*-
"""
Copyright 2018 Alexey Melnikov and Katja Ried.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
Please acknowledge the authors when re-using this code and maintain this notice intact.

Code written by Alexey Melnikov, implementing ideas from

'Projective simulation with generalization'
Alexey A. Melnikov, Adi Makmal, Vedran Dunjko & Hans J. Briegel
Scientific Reports 7, Article number: 14430 (2017) doi:10.1038/s41598-017-14740-y

and

'Projective simulation for artificial intelligence'
Hans J. Briegel & Gemma De las Cuevas
Scientific Reports 2, Article number: 400 (2012) doi:10.1038/srep00400
"""
import __future__
import numpy as np
from scipy.sparse import lil_matrix

class BasicPSAgent(object):
    """Projective Simulation agent with a two-layered network.
    Features: forgetting, glow, reflection, optional softmax rule."""

    def __init__(self, num_actions, num_percepts_list, gamma_damping, eta_glow_damping, policy_type, beta_softmax, num_reflections):
        """Initialize the basic PS agent. Arguments:
        - num_actions: integer >= 1,
        - num_percepts_list: list of integers >= 1, not nested, representing the cardinality of each category/feature of percept space.
        - gamma_damping: float between 0 and 1, controls forgetting/damping of h-values
        - eta_glow_damping: float between 0 and 1, controls the damping of glow; setting this to 1 effectively switches off glow
        - policy_type: string, 'standard' or 'softmax'; toggles the rule used to compute probabilities from h-values
        - beta_softmax: float >= 0, probabilities are proportional to exp(beta*h_value). If policy_type != 'softmax', this is irrelevant.
        - num_reflections: integer >= 0, sets how many times the agent reflects, i.e. potentially goes back to the percept. Setting this to zero deactivates reflection.
        """
        self.num_actions = num_actions
        self.num_percepts_list = num_percepts_list
        self.gamma_damping = gamma_damping
        self.eta_glow_damping = eta_glow_damping
        self.policy_type = policy_type
        self.beta_softmax = beta_softmax
        self.num_reflections = num_reflections

        self.num_percepts = int(np.prod(np.array(self.num_percepts_list).astype(np.float32)))  # total number of possible percepts
        self.h_matrix = lil_matrix((self.num_actions, self.num_percepts), dtype=np.float32)  # h-values (edge strengths between percepts and actions)
        self.g_matrix = lil_matrix((self.num_actions, self.num_percepts), dtype=np.float32)  # glow values, tagging recently used edges
        self.last_percept_action = None  # stores the last realized percept-action pair for use with reflection. If reflection is deactivated, all necessary information is encoded in g_matrix.
        if num_reflections > 0:
            self.e_matrix = lil_matrix((self.num_actions, self.num_percepts), dtype=np.bool_)  # emoticons
            # emoticons are initialized to True (happy, good choice) and set to False (sad, reflect again)
            # only if the percept-action pair is used and does not yield a reward.

    def percept_preprocess(self, observation):  # preparing for creating a percept
        """Takes a multi-feature percept and reduces it to a single integer index.
        Input: list of integers >= 0, of the same length as self.num_percepts_list;
        respecting the cardinality specified by num_percepts_list: observation[i] < num_percepts_list[i] (strictly).
        Output: single integer."""
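        # Worked example (illustrative values only): with num_percepts_list = [2, 3],
        # observation [1, 2] maps to 1 * 1 + 2 * 2 = 5, i.e. a mixed-radix encoding
        # where feature i is weighted by the product of all preceding cardinalities.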
        percept = 0
        for which_feature in range(len(observation)):
            percept += int(observation[which_feature] * np.prod(self.num_percepts_list[:which_feature]))
        return percept

    def deliberate_and_learn(self, observation, reward):
        """Given an observation and a reward (from the previous interaction), this method
        updates the h_matrix, chooses the next action and records that choice in the g_matrix and last_percept_action.
        Arguments:
        - observation: list of integers, as specified for percept_preprocess,
        - reward: float
        Output: action, represented by a single integer index."""
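        # The h-matrix update implemented below acts entry-wise on the nonzero h-values:
        #     h <- (1 - gamma_damping) * h + gamma_damping   (forgetting: damping towards 1)
        #     h <- h + reward * g                            (glow-weighted learning)
        # After the new action is chosen, the glow matrix is damped,
        #     g <- (1 - eta_glow_damping) * g,
        # and the newly used edge gets g[action, percept] = 1.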
        if self.gamma_damping != 0:
            self.h_matrix = self.h_matrix.tocsc()
            self.h_matrix.data = (1 - self.gamma_damping) * self.h_matrix.data + self.gamma_damping  # forgetting
            self.h_matrix = self.h_matrix.tolil()
        self.h_matrix += self.g_matrix * reward  # learning
        if (self.num_reflections > 0) and (self.last_percept_action is not None) and (reward <= 0):  # reflection update
            self.e_matrix[self.last_percept_action] = 0
        percept = self.percept_preprocess(observation)
        if np.sum(self.h_matrix[:, percept]) == 0:  # if this percept is new, create it
            self.h_matrix[:, percept] = 1
            self.g_matrix[:, percept] = 0
            if self.num_reflections > 0:
                self.e_matrix[:, percept] = 1
        action = np.random.choice(self.num_actions, p=self.probability_distr(percept))  # deliberate once
        for i_counter in range(self.num_reflections):  # if num_reflections >= 1, repeat deliberation if indicated
            if self.e_matrix[action, percept]:
                break
            action = np.random.choice(self.num_actions, p=self.probability_distr(percept))
        self.g_matrix = (1 - self.eta_glow_damping) * self.g_matrix
        self.g_matrix[action, percept] = 1  # record latest decision in g_matrix
        if self.num_reflections > 0:
            self.last_percept_action = action, percept  # record latest decision in last_percept_action
        return action

    def probability_distr(self, percept):
        """Given a percept index, this method returns a probability distribution over actions
        (an array of length num_actions normalized to unit sum) computed according to policy_type."""
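        # 'standard': p(a) = h(a, percept) / sum over a' of h(a', percept)
        # 'softmax':  p(a) proportional to exp(beta_softmax * h(a, percept)),
        #             computed with the maximum subtracted for numerical stability.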
        if self.policy_type == 'standard':
            h_vector = (self.h_matrix[:, percept]).toarray().flatten()
            probability_distr = h_vector / np.sum(h_vector)
        elif self.policy_type == 'softmax':
            h_vector = (self.beta_softmax * self.h_matrix[:, percept]).toarray().flatten()
            h_vector_mod = h_vector - np.max(h_vector)  # shift by the maximum to avoid overflow in exp
            probability_distr = np.exp(h_vector_mod) / np.sum(np.exp(h_vector_mod))
        return probability_distr
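

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original agent). The percept sizes,
# parameter values, random observations and reward rule below are illustrative
# assumptions only, meant to show the interaction loop: observe, call
# deliberate_and_learn, then feed back the reward on the next call.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(0)
    agent = BasicPSAgent(num_actions=2, num_percepts_list=[2, 3],
                         gamma_damping=0.0, eta_glow_damping=0.1,
                         policy_type='standard', beta_softmax=1.0,
                         num_reflections=0)
    reward = 0.0  # no reward before the first interaction
    for trial in range(20):
        observation = [np.random.randint(2), np.random.randint(3)]  # toy two-feature percept
        action = agent.deliberate_and_learn(observation, reward)
        reward = 1.0 if action == observation[0] else 0.0  # toy reward: match the first feature
    print('h-matrix after 20 interactions:')
    print(agent.h_matrix.toarray())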