In [1]:
%%html
<style>.text_cell .rendered_html * {direction: ltr; text-align: left;}</style>

# Q1

### 1)

Seasons can be defined as states, and each episode is contained of 4 states from starting point to the end or goal state.  
The policy is the amount of seed that we want to implant.  
The situation of weather and dust must be involved in the rewards.  
Rewards also depend on the amount of product we have from previous years.  

### 2)

states: number of guests who register every day.  
actions: the number of guests the manager wants to register to the hotel.  
rewards: negative comments and the number of rejected quests will appear as punishments and will affect the rewards.  
We can start from random policy and improve it in time.

### 3)

states: each device can be defined as one state.  
actions: repair, analyze, change,    
rewards: if the device is completely out of service and needs, then it is a punishment that will appear in the reward function, and it can get -inf as a reward. For repairing and changing the product, we have punishment as well.   
Because each device is different, we should calculate the expected values separately. The discount factor that we use in q-value calculation should be used for each device independently. 

In [2]:
from amalearn.reward import RewardBase
from amalearn.agent import AgentBase

In [3]:
from amalearn.environment import EnvironmentBase
import gym
import numpy as np


There are 9 actions that we can choose:

In [4]:
ACTIONS = np.array([(0,-1), (-1,0), (0,1), (1,0), (-1,-1), (1,1), (0,0), (-1,1), (1,-1)])

In [5]:
class Environment(EnvironmentBase):
    def __init__(self, map_size=17, obstacle=[],goal= np.array([(1,1)]) ,id=0, action_count=9, actionPrice=-1, goalReward=100, punish=-10, j_limit=10, i_limit=10, p=0.8):
        """
        initialize your variables
        """
        self.state = None
        self.map_size = map_size
        self.action_count = action_count
        self.action_price = actionPrice
        self.goal_reward = goalReward
        self.punishment = punish
        self.j_limit = j_limit 
        self.i_limit = i_limit 
        self.p = p
        self.obstacle = obstacle
        self.actions = np.array([(0,-1), (-1,0), (0,1), (1,0), (-1,-1), (1,1), (0,0), (-1,1), (1,-1)])
        self.goal = goal
        
    def isStatePossible(self, state):
        """if given state is possible (not out of the grid and not obstacle) return ture"""
        valid = False
        if (self.map_size*state[0] + state[1]) not in self.obstacle:
            valid = True
        if state[0]<0 or state[1]<0 or state[0]>16 or state[1]>16:
            valid = False
        return valid
    
    def isAccessible(self, state, state_p):
        """if given state is Accesible (we can reach state_p by doing an action from state) return true"""
        if not self.isStatePossible(state_p):
            return False
        next_states = self.available_states(state)
        if state_p in next_states:
            return True
        return False
            
    def getTransitionStatesAndProbs(self, state, action):
        """return probability of transition or T(sp,a,s)"""
        transition_probs= []
        transition_states = self.available_states(state)
        next_state = action + state
        for s in transition_states:
            if next_state[0] == s[0] and next_state[1] == s[1]:
                transition_probs.append(self.p)
            else:
                transition_probs.append((1-self.p)/(len(transition_states)-1+0.0000001)) 
        return transition_states,transition_probs
        
    
    def getReward(self, state, action):
        """return reward of transition"""
        rewards = []
        available_states = self.available_states(state)
        next_state = state + action
        for s in available_states:
            r = 0
            if not self.isStatePossible(next_state):
                r = -1
            if next_state[0] != s[0] and next_state[1] != s[1]:
                r -= 0.01
            if self.goal[0] == s[0] and self.goal[1] == s[1]:
                r += 5
            rewards.append(r)
        return rewards
        
    def sample_all_rewards(self):
        return 
    
    def calculate_reward(self, action):
        return 

    def terminated(self):
        return 

    def observe(self):
        return 

    def available_actions(self, current_state):
        available_actions = []
        for a in self.actions:
            next_state = current_state + a
            if self.isStatePossible(next_state):
                available_actions.append(a)
        return available_actions
    
    def available_states(self, current_state):
        next_states = []
        available_actions = self.available_actions(current_state)
        for a in available_actions:            
            next_states.append(a+current_state)
        return next_states
            
    def next_state(self, action):
        return self.state + action

    def reset(self):
        self.state = np.array([(map_size, map_size)])
        print("current state: ", self.state)
        return 

    def render(self, mode='human'):
        #print('{}:\taction={}'.format(self.state['length'], self.state['last_action']))
        return 

    def close(self):
        return

In [6]:

class Agent(AgentBase):
    def __init__(self, id, environment, discount, theta):
        self.environment = environment
        self.V = np.zeros((17, 17))
        self.policy = np.zeros((17, 17))
        super(Agent, self).__init__(id, environment)
        self.discount = discount
        self.theta = theta
    
    def sum_Q_value(self, s0, s1, a):
        sum_q_value = 0
        transition_states,transition_probs = self.environment.getTransitionStatesAndProbs((s0,s1),a)
        rewards = self.environment.getReward((s0,s1),a)
        for i,s in enumerate(transition_states):
            sum_q_value +=  transition_probs[i]*(rewards[i] + self.discount*(self.V[s[0]][s[1]]))
        return sum_q_value
    
    def policy_evaluation(self):
        while True:
            delta = 0
            for s0 in range(17):
                for s1 in range(17):
                    if not self.environment.isStatePossible((s0,s1)):
                        continue
                    a = self.policy[s0][s1]
                    v = self.V[s0][s1]
                    self.V[s0][s1] = self.sum_Q_value(s0, s1, ACTIONS[int(a)])
                    delta = max(delta, abs(v-self.V[s0][s1]))
            if delta < self.theta:
                break

            
    def policy_improvement(self):
        print("policy imp")
        
        policy_stable = True
        for s0 in range(17):
            for s1 in range(17):
                if not self.environment.isStatePossible((s0,s1)):
                    continue
                max_q = 0
                for index_a,a in enumerate(ACTIONS):
                    transition_states,transition_probs = self.environment.getTransitionStatesAndProbs((s0,s1),a)
                    rewards = self.environment.getReward((s0,s1),a)
                    q_value = 0
                    for i,s in enumerate(transition_states):
                        q_value +=  transition_probs[i]*(rewards[i] + self.discount*(self.V[s[0]][s[1]]))
                    if q_value > max_q:
                        max_q = q_value
                        max_a = index_a
                self.policy[s0][s1] = max_a
                if self.policy[s0][s1] != max_a:
                    policy_stable = False
        print(self.policy)
        return policy_stable
    
    def policy_iteration(self):
        policy_stable = False
        iter=0
        while not policy_stable:
            self.policy_evaluation()
            policy_stable = self.policy_improvement()
    
    def take_action(self) -> (object, float, bool, object):
        pass

In [7]:
def get_obstacles(map):
    n = len(map)
    obstacles= []
    for i in range(n):
        for j in range(n):
            if map[i][j] == "1":
                obstacles.append(n*i+j)
    return obstacles

def get_goal(map):
    n = len(map)
    goals= []
    for i in range(n):
        for j in range(n):
            if map[i][j] == "G":
                goals.append(np.array([i,j]))
    return goals

In [8]:
n = 17
map =  [
        "11111111111111111",
        "1G000001100000001",
        "10000001100000001",
        "10000001100000001",
        "10000001100000001",
        "10000000000000001",
        "10000000000000001",
        "10000000000000001",
        "10000000000001111",
        "10000000000001111",
        "10000000000000001",
        "10000000000000001",
        "10000011000000001",
        "10000011000000001",
        "10000011000000001",
        "100000110000000S1",
        "11111111111111111"]

obstacles = get_obstacles(map)
goal = np.array([1,1])

In [9]:
state = (1,1)
if (17*state[0] + state[1]) not in obstacles:
    print("available")
else:
    print("not available")

available


In [10]:
import numpy as np
state1 = np.array([1,1])
state2 = np.array([1,1])
if state1 is state2:
    print("yes")

In [11]:
state3 = np.array([(0,0), (1,1)])
state3[1][1]

1

In [12]:
env = Environment(17, obstacles,goal=np.array([1,1]),id = "0", action_count=9, actionPrice=-0.01, goalReward=100, punish=-1, j_limit=10, i_limit=10, p=0.8)

In [13]:
agent = Agent('1', env, discount=0.9, theta=0.5)

In [14]:
agent.policy_iteration()

policy imp
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 6. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 4. 4. 4. 4. 4. 0. 0. 4. 4. 4. 4. 4. 4. 4. 0.]
 [0. 7. 7. 7. 7. 1. 4. 0. 0. 4. 4. 4. 4. 4. 4. 4. 0.]
 [0. 4. 4. 4. 4. 7. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
