In [1]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import gym
import gym_minigrid
import matplotlib.pyplot as plt
%matplotlib inline




In [2]:
class QLearning:
    def __init__(self, actions, agent_indicator=10):
        self.actions = actions
        self.agent_indicator = agent_indicator
        self.alpha = 0.01
        self.gamma = 0.9
        self.epsilon = 0.2
        self.q_values = defaultdict(lambda: [0.0] * actions)
        
    def _convert_state(self, s):
        return np.where(s == self.agent_indicator)[0][0]
        
    def update(self, state, action, reward, next_state, next_action):
        state = self._convert_state(state)
        next_state = self._convert_state(next_state)
        
        q_value = self.q_values[state][action]
        next_q_value = max(self.q_values[next_state])
        
        td_error = reward + self.gamma * next_q_value - q_value
        self.q_values[state][action] = q_value + self.alpha * td_error
    
    def act(self, state):
        if np.random.rand() < self.epsilon:
            action = np.random.choice(self.actions)
        else:
            state = self._convert_state(state)
            q_values = self.q_values[state]
            action = np.argmax(q_values)
        return action

In [3]:
from utils import gen_wrapped_env, show_video

In [8]:
env = gen_wrapped_env('MiniGrid-Empty-8x8-v0')
obs = env.reset()

agent_position = obs[0]

agent = QLearning(3, agent_position)

In [None]:
rewards = []
for ep in range(100000):
    done = False
    obs = env.reset()
    action = agent.act(obs)
    
    ep_rewards = 0
    while not done:
        next_obs, reward, done, info = env.step(action)

        next_action = agent.act(next_obs)

        agent.update(obs, action, reward, next_obs, next_action)
        
        ep_rewards += reward
        obs = next_obs
        action = next_action
    rewards.append(ep_rewards)
    if (ep+1) % 20 == 0:
        print("episode: {}, rewards: {}".format(ep+1, ep_rewards))
env.close()

episode: 20, rewards: 0
episode: 40, rewards: 0
episode: 60, rewards: 0
episode: 80, rewards: 0
episode: 100, rewards: 0
episode: 120, rewards: 0
episode: 140, rewards: 0
episode: 160, rewards: 0
episode: 180, rewards: 0
episode: 200, rewards: 0
episode: 220, rewards: 0
episode: 240, rewards: 0
episode: 260, rewards: 0
episode: 280, rewards: 0
episode: 300, rewards: 0
episode: 320, rewards: 0
episode: 340, rewards: 0
episode: 360, rewards: 0
episode: 380, rewards: 0
episode: 400, rewards: 0
episode: 420, rewards: 0
episode: 440, rewards: 0
episode: 460, rewards: 0
episode: 480, rewards: 0
episode: 500, rewards: 0
episode: 520, rewards: 0
episode: 540, rewards: 0
episode: 560, rewards: 0
episode: 580, rewards: 0
episode: 600, rewards: 0
episode: 620, rewards: 0
episode: 640, rewards: 0
episode: 660, rewards: 0
episode: 680, rewards: 0
episode: 700, rewards: 0
episode: 720, rewards: 0
episode: 740, rewards: 0
episode: 760, rewards: 0
episode: 780, rewards: 0
episode: 800, rewards: 0
epis

episode: 6380, rewards: 0
episode: 6400, rewards: 0
episode: 6420, rewards: 0
episode: 6440, rewards: 0
episode: 6460, rewards: 0
episode: 6480, rewards: 0
episode: 6500, rewards: 0
episode: 6520, rewards: 0
episode: 6540, rewards: 0
episode: 6560, rewards: 0
episode: 6580, rewards: 0
episode: 6600, rewards: 0
episode: 6620, rewards: 0
episode: 6640, rewards: 0
episode: 6660, rewards: 0
episode: 6680, rewards: 0
episode: 6700, rewards: 0
episode: 6720, rewards: 0
episode: 6740, rewards: 0
episode: 6760, rewards: 0
episode: 6780, rewards: 0
episode: 6800, rewards: 0
episode: 6820, rewards: 0
episode: 6840, rewards: 0
episode: 6860, rewards: 0
episode: 6880, rewards: 0
episode: 6900, rewards: 0
episode: 6920, rewards: 0
episode: 6940, rewards: 0
episode: 6960, rewards: 0
episode: 6980, rewards: 0
episode: 7000, rewards: 0
episode: 7020, rewards: 0
episode: 7040, rewards: 0
episode: 7060, rewards: 0
episode: 7080, rewards: 0
episode: 7100, rewards: 0
episode: 7120, rewards: 0
episode: 714

In [None]:
{s:np.round(q, 5).tolist() for s, q in agent.q_values.items()}

In [None]:
show_video()

In [12]:
pd.Series(rewards).to_csv('./logs/rewards_qlearning.csv')

FileNotFoundError: [Errno 2] No such file or directory: './logs/rewards_qlearning.csv'

In [None]:
sarsa_logs = pd.read_csv('./logs/rewards_sarsa.csv', index_col=False).iloc[:, 1]
q_logs = pd.read_csv('./logs/rewards_qlearning.csv', index_col=False).iloc[:, 1]

In [None]:
plt.figure(figsize=(16, 8))
plt.plot(q_logs.cumsum() / (pd.Series(np.arange(q_logs.shape[0]))+1), label="QLearning")
# plt.plot(sarsa_logs.cumsum() / (pd.Series(np.arange(sarsa_logs.shape[0]))+1), label="SARSA")
plt.legend()