In [11]:
! pip install gym



In [12]:
import gym 
import random
import numpy as np

# initiating environment in Ai gym

In [13]:
# for discrete action space game 
env_name="CartPole-v1"
# creating environment
env=gym.make(env_name)

In [14]:
print(env.observation_space)  # states in the environment used 

# in this case there are 4 
# cart position , cart velocity , pole angle , pole velocity at tip

print(env.action_space)   # action that can be taken by AI

# in this case there are 2 actions that can be taken 
# right and left

Box(4,)
Discrete(2)


In [15]:
# basic code for environment access / use the gym
# resting variable
state = env.reset()
for _ in range(200):
    #action = random.choice(range(action_space var))  # this line and line below it are same
    action = env.action_space.sample()   # sample works as random and chooses from action_space randomly
    state, reward, done, info = env.step(action) #action determined used to take a step in the game 
    env.render()    #render means to display 

env.close()      # to automatically close

In [18]:
# making action little bit smarter using tilt value of the pole
state = env.reset()
for _ in range(200):
    pole_angle=state[2]    # using pole_angle in state variable  # known as policy
    action = 0 if pole_angle<0 else 1         #known as policy
    state,reward,done,info = env.step(action)
    env.render()
env.close()
    

In [21]:
# for continuous action space 
env_name="MountainCarContinuous-v0"
env=gym.make(env_name)

In [27]:
print(env.observation_space)
# car position and velocity
print(env.action_space)
# push car left or to right range(-1 to+1)
print(env.action_space.high) # to see range 
print(env.action_space.low)

Box(2,)
Box(1,)
[1.]
[-1.]


In [28]:
state = env.reset()
for _ in range(200):
    action=env.action_space.sample()
    state,reward,done,info = env.step(action)
    env.render()
env.close()

# Reinforcement Learning (Q-Learning)


# Basic Algo
   
   # Q(st,at) = rt+1 +G.Qmax(st+1)

In [30]:
# working on toy text game of frozen lake provided by ai gym 
'''
 game board :
 
 SFFF        S = starting point
 FHFH        F = forzen surface,Safe
 FFFH        H = hole , fall not safe
 HFFG        G = goal, finish point
 
 actions :
       
       left =0
       right=2
       up   =3
       down =1
'''

'\n game board :\n \n SFFF       // S = starting point\n FHFH       // F = forzen surface,Safe\n FFFH       // H = hole , fall not safe\n HFFG       // G = goal, finish point\n \n actions :\n       \n       left =0\n       right=2\n       up   =3\n       down =1\n'

In [33]:
import gym 
import random
import numpy as np
import time
from gym.envs.registration import register  # to use register function
from IPython.display import clear_output  # feature of jupyter notebook

In [34]:
# register is in the init function of the source code of the open ai
try:     #used as if we register one game again we get error therfore to avoid we use exception handling
    register(
        id='FrozenLakenoslip-v0',     # we can change this name if we want before (-v0)
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4','is_slippery':False},  # making game non slippery for our benifit
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except:
    pass
env_name = "FrozenLakenoslip-v0"
env = gym.make(env_name)

In [38]:
print(env.observation_space)    # 4X4 matrix
print(env.action_space)     # 4 movement
type(env.action_space)

Discrete(16)
Discrete(4)


gym.spaces.discrete.Discrete

In [39]:
# using object orientation
class Agent():                     # player class
    def __init__(self, env):      # constructor  # self is Agent
        
# checking if the env we are working with is discrete or continuous 
        self.is_discrete = type(env.action_space)==gym.spaces.discrete.Discrete   # if true: is_discrete= True, else:False
        
        if self.is_discrete:                          # if true then size is below
            self.action_size = env.action_space.n
        else:                                        # else store max and min value
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            
# to figure out which action to use next all logic will be here 
    def get_action(self,state):
        # action = env.action_space.sample()     #different approch than line written beneath this
        if self.is_discrete:
            action = random.choice(range(self.action_size)) # just for testing
        else:
            action = np.random.uniform(self.action_low,   #just for testing
                                       self.action_high,
                                       self.action_shape)
        return action

In [40]:
agent = Agent(env)  # class object

In [51]:
for episode in range(2):  # for 2 episodes
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        env.render()
        time.sleep(0.5)          #time wait before redering different o/p and clean pre o/p
        clear_output(wait=True)  # to clear previous o/p
        #no close as will render on screen only

  (Up)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [62]:
# creating smart / Qlearning based agent using about agent 
# concept of inheritance

class QAgent(Agent):       # inheritance
    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
        
        super().__init__(env)  #getting init of Agent class inheritance concept
        
        self.state_size = env.observation_space.n
        
        self.exploration_rate = 1.0  # to not be stuck with same path and try and find different path (adds randomness)
        # exploration for 1st episode is max and then after every episode exploration keeps changing
        # represented by epsilon
        
        self.discount_rate=discount_rate  # represented by gamma
        self.learning_rate=learning_rate  # represented by alpha
        
        self.q_table = 0.0001 * np.random.random([self.state_size,self.action_size])  
        # matrix formed of row x column states rows and actions column
        #0.0001 can also be written as 1e-4 used to reduce the action values
        
        
    def get_action(self,state):
        q_state = self.q_table[state] # select row from q table based on state from 0 to 15 in this state
        action_greedy = np.argmax(q_state)  #gives index of max value in qtable corresponding to state 
        action_random = super().get_action(state)  # for exploration use random state 
        return action_random if random.random()<self.exploration_rate else action_greedy # to explore randomly and in other cases return true action
    
    def train(self,experience):
        state, action, next_state, reward, done = experience  #extracting values from experience of 1 episode
        
        q_next = self.q_table[next_state]  # getting the next states q value as we need it to calculate new q_value in algo
        q_next = np.zeros([self.action_size]) if done else q_next #if agent reaches goal no next state will be there and if not use above value
        q_target = reward + self.discount_rate * np.max(q_next) # q_next algorithm coded here
        q_update=q_target-self.q_table[state,action] # differnence b/w calculated and previous value in table
        
        self.q_table[state,action]+=self.learning_rate*q_update   #updating the qvalues in small amount therefore mul by learning rate
        
        if done:      # after each episode decrease the exploration rate so that it can follow right path 
            self.exploration_rate*=0.99
            

In [63]:
agent=QAgent(env)  #object 

In [65]:
total_reward=0
for episode in range(200):  # for 2 episodes
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state,action,next_state,reward,done))    # calling train function and passing tuple in the exploration variable
        state=next_state # updating state 
        total_reward+=reward       # checking the reward
        print(f"s{state} a:{action}")   
        print(f"epsiode:{episode} reward:{total_reward} exploration:{agent.exploration_rate}")
        
        env.render()
        print(agent.q_table)
        time.sleep(0.05)          #time wait before redering different o/p and clean pre o/p
        clear_output(wait=True)  # to clear previous o/p
                #no close as will required as displaying in notebook only

s15 a:2
epsiode:199 reward:116.0 exploration:0.04904089407128576
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[5.49447499e-05 4.53613713e-05 1.32360029e-03 5.65187574e-05]
 [4.13352852e-05 2.38359620e-05 7.05226731e-03 4.13905180e-05]
 [1.12356287e-04 3.09460399e-02 5.87019937e-05 2.85607448e-04]
 [1.56814379e-04 3.94884488e-05 8.73487872e-06 8.26218614e-05]
 [4.66217303e-05 3.32680351e-05 1.09687851e-05 4.16313978e-05]
 [5.95914952e-05 8.78806179e-05 5.58948810e-05 1.60469870e-06]
 [6.09885998e-05 1.14251312e-01 6.09929013e-05 1.56005595e-04]
 [2.73354065e-05 2.06371139e-06 8.32926573e-05 6.37699391e-05]
 [7.17111763e-05 6.13522603e-05 1.29028922e-05 5.13282604e-05]
 [5.52703716e-05 4.74807619e-04 5.00006456e-05 2.83630322e-05]
 [2.48989027e-05 3.28638812e-01 3.56934739e-05 9.27739672e-04]
 [7.51192552e-06 6.96172234e-05 1.44946209e-05 1.74544317e-05]
 [1.77753573e-06 4.21372847e-05 4.19616648e-05 4.15631286e-05]
 [7.36181173e-05 9.05051403e-05 2.15465021e-02 3.98559492e-05]
 [2.30058507e-