# Prabin Lamsal

## Computer Engineering
## Assignment 7


### Creating an environment

In [None]:
import argparse
import gym

def build_arg_parser():
    """Create the command-line parser used to pick an environment.

    Returns:
        An ``argparse.ArgumentParser`` with a single required option,
        ``--input-env`` (stored as ``input_env``), restricted to the
        supported short environment names.
    """
    supported_envs = ['cartpole', 'mountaincar', 'pendulum', 'taxi', 'lake']

    arg_parser = argparse.ArgumentParser(description='Run an environment')
    arg_parser.add_argument(
        '--input-env',
        dest='input_env',
        required=True,
        choices=supported_envs,
        help='Specify the name of the environment',
    )
    return arg_parser

def run(input_env):
    """Render 1000 random-action steps of the selected Gym environment.

    Args:
        input_env: Short environment name; must be a key of ``name_map``
            below ('cartpole', 'mountaincar', 'pendulum', 'taxi', 'lake').

    Raises:
        KeyError: If ``input_env`` is not one of the known short names.
    """
    name_map = {'cartpole': 'CartPole-v1', 
                'mountaincar': 'MountainCar-v0',
                'pendulum': 'Pendulum-v0',
                'taxi': 'Taxi-v1',
                'lake': 'FrozenLake-v0'}

    env = gym.make(name_map[input_env])
    try:
        env.reset()

        for _ in range(1000):
            env.render()

            outcome = env.step(env.action_space.sample())

            # Index 2 is `done` in the 4-tuple step API and `terminated` in
            # the 5-tuple API, where index 3 is `truncated`.
            episode_over = outcome[2] or (len(outcome) == 5 and outcome[3])
            if episode_over:
                # Stepping a finished episode is not meaningful; start a new
                # one so all 1000 rendered steps come from live episodes.
                env.reset()
    finally:
        # Release the render window / simulator even if the loop is
        # interrupted or raises.
        env.close()

In [None]:
# Watch a random agent act in the CartPole environment.
run("cartpole")

In [None]:
# Watch a random agent act in the MountainCar environment.
run("mountaincar")

### Building a learning agent

In [None]:
def build_arg_parser():
    """Create the command-line parser used to pick an environment.

    Returns:
        An ``argparse.ArgumentParser`` with a single required option,
        ``--input-env`` (stored as ``input_env``), restricted to
        'cartpole', 'mountaincar' and 'pendulum'.
    """
    supported_envs = ['cartpole', 'mountaincar', 'pendulum']

    arg_parser = argparse.ArgumentParser(description='Run an environment')
    arg_parser.add_argument(
        '--input-env',
        dest='input_env',
        required=True,
        choices=supported_envs,
        help='Specify the name of the environment',
    )
    return arg_parser

def run(input_env):
    """Run 20 random-action episodes, printing every observation.

    Each episode is rendered and stops when the environment reports ``done``
    or after 100 steps, whichever comes first. The step result is unpacked
    as ``(observation, reward, done, info)``.

    Args:
        input_env: Short environment name; must be a key of ``name_map``
            below ('cartpole', 'mountaincar', 'pendulum').

    Raises:
        KeyError: If ``input_env`` is not one of the known short names.
    """

    name_map = {'cartpole': 'CartPole-v1', 
                'mountaincar': 'MountainCar-v0',
                'pendulum': 'Pendulum-v0'}

    env = gym.make(name_map[input_env])

    try:
        for _ in range(20):
            # Every episode starts from a fresh initial observation.
            observation = env.reset()

            for i in range(100):
                env.render()

                print(observation)

                action = env.action_space.sample()
                observation, reward, done, info = env.step(action)
                if done:
                    print('Episode finished after {} timesteps'.format(i+1))
                    break
    finally:
        # Release the render window / simulator even if an episode is
        # interrupted or raises.
        env.close()

In [None]:
# Run the random agent on CartPole; each observation is printed as it is seen.
run("cartpole")

[ 0.02919127  0.03328397 -0.02613534 -0.03849287]
[ 0.02985695 -0.16145363 -0.0269052   0.24583088]
[ 0.02662788 -0.35618117 -0.02198858  0.5299073 ]
[ 0.01950425 -0.16075693 -0.01139044  0.23037766]
[ 0.01628911 -0.35571426 -0.00678288  0.519446  ]
[ 0.00917483 -0.16049749  0.00360604  0.22463341]
[ 0.00596488  0.03457274  0.0080987  -0.06690986]
[ 0.00665634 -0.16066438  0.00676051  0.22831722]
[ 0.00344305 -0.3558823   0.01132685  0.52312493]
[-0.0036746  -0.16092157  0.02178935  0.23403266]
[-0.00689303 -0.35634798  0.02647     0.53350824]
[-0.01401999 -0.551832    0.03714017  0.8344128 ]
[-0.02505663 -0.3572366   0.05382842  0.55363774]
[-0.03220136 -0.16291021  0.06490118  0.27838823]
[-0.03545957  0.03122872  0.07046894  0.00686049]
[-0.03483499 -0.16482939  0.07060615  0.32091847]
[-0.03813158 -0.3608821   0.07702452  0.6350058 ]
[-0.04534922 -0.5569891   0.08972464  0.95091695]
[-0.056489   -0.36318183  0.10874297  0.6877191 ]
[-0.06375264 -0.55963165  0.12249736  1.0125607 ]


### Q-learning

In [None]:
import numpy as np
from random import randint
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

# Reward matrix: R[s, a] is the reward for moving from state s to state a.
# -1 marks a move that is not available, 0 an available move, and 100 a move
# into state 5 (the goal).
R = np.matrix([[-1, -1, -1, -1,  0,  -1],
              [-1, -1, -1,  0, -1, 100],
              [-1, -1, -1,  0, -1,  -1],
              [-1,  0,  0, -1,  0,  -1],
              [ 0, -1, -1,  0, -1, 100],
              [-1,  0, -1, -1,  0, 100]])

# Q must be floating point. zeros_like(R) alone would inherit R's integer
# dtype, and every discounted value written into Q would be truncated.
Q = np.zeros_like(R, dtype=float)

In [None]:
# Discount factor: weight given to the best Q-value of the next state.
gamma = 0.8

# Random starting state in 0..4 (random.randint includes both endpoints).
initial_state = randint(0,4)

def available_actions(state):
    """Return the actions that may be taken from ``state``.

    An action is available when its entry in row ``state`` of the reward
    matrix ``R`` is non-negative.

    Args:
        state: Row index into ``R``.

    Returns:
        Array of the column indices whose reward is >= 0.
    """
    reward_row = R[state,]
    # The row is 2-D (1 x n), so np.where yields (row_idx, col_idx); only the
    # column indices identify actions.
    _, allowed_columns = np.where(reward_row >= 0)
    return allowed_columns


def sample_next_action(available_actions_range):
    """Pick one action at random from ``available_actions_range``.

    Args:
        available_actions_range: Non-empty 1-D sequence or array of
            candidate action indices.

    Returns:
        The chosen action as a plain Python ``int``.
    """
    # Draw from the argument itself rather than from a module-level variable,
    # so the result always reflects what the caller passed in. Requesting a
    # single scalar (no ``size``) avoids converting a 1-element array to int.
    return int(np.random.choice(available_actions_range))

def update(current_state, action, gamma):
    """Apply one Q-learning update for taking ``action`` in ``current_state``.

    Sets ``Q[current_state, action]`` to
    ``R[current_state, action] + gamma * max(Q[action, :])``.
    The action index is also used as the index of the state it leads to,
    which is why row ``action`` of ``Q`` is scanned for the best value.

    Args:
        current_state: Row index of the state the action is taken from.
        action: Column index of the action (and row index of the next state).
        gamma: Discount factor applied to the best next-state value.

    Returns:
        None. ``Q`` is modified in place.
    """
    # Every index that ties for the maximum holds the same value, so the
    # maximum is read directly; no index selection or array-to-int
    # conversion is needed.
    best_next_value = np.max(Q[action,])

    Q[current_state, action] = R[current_state, action] + gamma * best_next_value

# Try the helpers once: the moves available from the random starting state,
# and one move sampled from them.
available_act = available_actions(initial_state) 

action = sample_next_action(available_act) 

In [None]:
# Training: 100 updates, each from a randomly chosen state and a randomly
# chosen move available in that state.
n_states = Q.shape[0]
for _ in range(100):
    current_state = np.random.randint(n_states)
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)

# Show Q rescaled so that its largest entry reads as 100.
print("Trained Q matrix: \n", Q / np.max(Q) * 100)

Trained Q matrix: 
 [[  0.           0.           0.           0.          79.79539642
    0.        ]
 [  0.           0.           0.          53.96419437   0.
  100.        ]
 [  0.           0.           0.          53.96419437   0.
    0.        ]
 [  0.          59.07928389  42.96675192   0.          67.51918159
    0.        ]
 [ 53.96419437   0.           0.          53.96419437   0.
  100.        ]
 [  0.          79.79539642   0.           0.          79.79539642
   93.09462916]]


In [None]:
# Follow the greedy policy (always take the highest-valued move) from state 2
# until state 5 is reached.
current_state = 2
steps = [current_state]

# A greedy walk over an under-trained Q can bounce between states forever,
# so the number of moves is capped.
max_moves = 10 * Q.shape[0]

while current_state != 5 and len(steps) <= max_moves:
    # Columns holding the largest Q-value in the current state's row.
    next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
    # Ties are broken at random. Drawing a single scalar also avoids calling
    # int() on a 1-D array, which NumPy has deprecated.
    next_step_index = int(np.random.choice(next_step_index))
    steps.append(next_step_index)
    current_state = next_step_index

if current_state == 5:
    print("Best sequence path: ", steps)
else:
    print("No path to state 5 found within", max_moves, "moves; partial path: ", steps)

Best sequence path:  [2, 3, 4, 5]
