# Q-Learning

In [3]:
# Importing libraries
import gym
import numpy as np

## Loading Frozen Lake Environment

In [4]:
ag_env=gym.make('FrozenLake-v0')

## Testing the environment

In [23]:
# Taking a look at the maze
ag_env.reset()
ag_env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


## Using DP for optimal Q table

In [21]:
# Creating a 2D array of size (obs_space)X(action_space)
Q=np.zeros([ag_env.observation_space.n,ag_env.action_space.n])
print(Q.shape)

(16, 4)


In [26]:
# Setting the learning parameters
l_rate=0.75      # Learning rate
d_rate=0.9       # Discount rate
n_episodes=200   # Number of runs
t=100            # Time steps in a run
r_list=[]

In [28]:
# Algorithm
for i in range(n_episodes):
    s=ag_env.reset()
    r_run=0
    d=False
    for j in range(t):
        j=j+1;
        # Selecting the best action with some noise added randomly
        action=np.argmax(Q[s,:]+np.random.randn(1,ag_env.action_space.n)*(1./(j+1)))
        # Performing the action
        obs,rew,d,_=ag_env.step(action)
        # Setting the value of previous state
        Q[s,action]=Q[s,action]+l_rate*(rew+d_rate*np.max(Q[obs,:])-Q[s,action])
        r_run=r_run+rew
        s=obs
        if d==True:
            break
    r_list.append(r_run)

In [29]:
# Mean score over n_episodes
print('Mean score over episodes: ',np.mean(r_list))

Mean score over episodes:  0.03


In [30]:
# Final Q-table values
print(Q)

[[0.06594153 0.05567085 0.05397259 0.0410465 ]
 [0.00903424 0.00964748 0.009128   0.0527256 ]
 [0.03847698 0.13002324 0.05376814 0.03577335]
 [0.03477459 0.03874566 0.0029385  0.03680888]
 [0.06507651 0.02367948 0.0732326  0.06997074]
 [0.         0.         0.         0.        ]
 [0.08087509 0.0944525  0.16900509 0.04331782]
 [0.         0.         0.         0.        ]
 [0.01275841 0.06633905 0.01649777 0.06983633]
 [0.07596068 0.09634497 0.00942278 0.05749597]
 [0.13992963 0.0147752  0.         0.03714471]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.17948912 0.        ]
 [0.         0.         0.         0.4152715 ]
 [0.         0.         0.         0.        ]]


## Q-Learning using Neural Networks

In [32]:
import random
import tensorflow as tf
import matplotlib.pyplot as plt

In [33]:
tf.reset_default_graph()

In [59]:
# Creating placeholer and initial weights

inputs=tf.placeholder(shape=[1,16],dtype=tf.float32)
W=tf.Variable(tf.random_uniform([16,4],0,0.01))
Q_out=tf.matmul(inputs,W)
predict=tf.argmax(Q_out,1)

In [58]:
# Backpropogation and loss minimization

Q_next=tf.placeholder(shape=[1,4],dtype=tf.float32)
loss=tf.reduce_sum(tf.square(Q_next-Q_out))
trainer=tf.train.GradientDescentOptimizer(learning_rate=0.01)
update_Model=trainer.minimize(loss)

In [62]:
# Training the network
init=tf.global_variables_initializer()
r_list=[]

# Learning parameters
y=0.99
e=0.1

with tf.Session() as sess:
    sess.run(init)
    for i in range(n_episodes):
        s=ag_env.reset()
        r_run=0
        d=False
        for j in range(t):
            j=j+1
            
            # Selecting an action greedily
            action,Q_all=sess.run([predict,Q_out],feed_dict={inputs:np.identity(16)[s:s+1]})
            
            # Using epsilon for random action
            if np.random.rand(1)<e:
                action[0]=ag_env.action_space.sample()
            
            # New state
            s1,rew,d,_=ag_env.step(action[0])
            
            # Obtaining Q values for the new state
            Q1=sess.run(Q_out,feed_dict={inputs:np.identity(16)[s1:s1+1]})
            
            # Obtaining the max Q from the future state
            max_Q1=np.max(Q1)
            targetQ=Q_all
            targetQ[0,action[0]]=rew+y*max_Q1
            
            # Backpropogation to alter the weights (i.e.) old values of Q
            _,W1=sess.run([update_Model,W],feed_dict={inputs:np.identity(16)[s:s+1],Q_next:targetQ})
            r_run+=rew
            s=s1
            
            if d==True:
                e=1./((i/50)+10)
                break
        r_list.append(r_run)           

InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder_4' with dtype float and shape [1,16]
	 [[node Placeholder_4 (defined at <ipython-input-40-04170f870b98>:3) ]]

Original stack trace for 'Placeholder_4':
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\asyncio\base_events.py", line 539, in run_forever
    self._run_once()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\asyncio\base_events.py", line 1775, in _run_once
    handle._run()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\gen.py", line 781, in inner
    self.run()
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\gen.py", line 742, in run
    yielded = self.gen.send(value)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3214, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-40-04170f870b98>", line 3, in <module>
    inputs=tf.placeholder(shape=[1,16],dtype=tf.float32)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\array_ops.py", line 2143, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 7400, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\framework\ops.py", line 3616, in create_op
    op_def=op_def)
  File "c:\users\akhil reddy\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\framework\ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()
