In [1]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam,SGD
from collections import deque

In [2]:
env = gym.make("MountainCar-v0")

In [3]:
class mountain_agent: 
    
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.gamma = 0.95
        self.lr_rate = 0.01
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_model()
        
    def build_model(self):
        model = Sequential()
        
        model.add(Dense(64,activation='relu',input_dim=self.state_size))
        model.add(Dense(32,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='sigmoid'))
        
        adam = Adam(lr=self.lr_rate)
        sgd = SGD(lr=self.lr_rate)
        model.compile(loss='binary_crossentropy',optimizer=adam)
        
        return model
    
    def rem(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self,state):
        # Exploration vs Exploitation
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        
        act_values = self.model.predict(state) # predict reward value based upon current state
        return np.argmax(act_values[0])
    
    def train(self,batch_size=32):
        minibatch = random.sample(self.memory,batch_size)
        
        for state,action,reward,next_state,done in minibatch:
            if not done:
                target = reward + self.gamma*np.argmax(self.target_model.predict(next_state)[0])
            else:
                target = reward
                
            target_f = self.model.predict(state)
            target_f[0][action] = target
            
            self.model.fit(state,target_f,epochs=1,verbose=0)
            
        if self.epsilon_min < self.epsilon:
            self.epsilon *= self.epsilon_decay
            
    def update_model(self):
        self.target_model.set_weights(self.model.get_weights())

### Train Model

In [4]:
state_size = 2
action_size = 3
done=False
ag = mountain_agent(state_size=state_size,action_size=action_size)

In [5]:
for e in range(200):
    state=env.reset()
    state=np.reshape(state,[1,state_size])
    batch_size=1
    
    for time in range(500):
        env.render()
        action=ag.act(state)
        next_state,reward,done,other_info=env.step(action)
        reward=reward+15 if not done else 1000
        next_state=np.reshape(next_state,[1,state_size])
        ag.rem(state,action,reward,next_state,done)
        state=next_state
        
        if done:
            ag.update_model()
            print(f'Game episode: {e+1}/{100}, High Score: {time}, Exploration rate: {ag.epsilon}')
            print(reward,done,state)
            break
        
    if len(ag.memory)>batch_size:
        ag.train(batch_size)

    
print('Deep Q learner Trained')
env.close()      

Game episode: 1/100, High Score: 199, Exploration rate: 1.0
1000 True [[-0.57488358  0.01403326]]
Game episode: 2/100, High Score: 199, Exploration rate: 0.995
1000 True [[-0.58927839  0.00285253]]
Game episode: 3/100, High Score: 199, Exploration rate: 0.990025
1000 True [[-0.36206108 -0.0155282 ]]
Game episode: 4/100, High Score: 199, Exploration rate: 0.985074875
1000 True [[-0.48609838  0.00489916]]
Game episode: 5/100, High Score: 199, Exploration rate: 0.9801495006250001
1000 True [[-0.53974172 -0.01349094]]
Game episode: 6/100, High Score: 199, Exploration rate: 0.9752487531218751
1000 True [[-0.48917816  0.00467565]]
Game episode: 7/100, High Score: 199, Exploration rate: 0.9703725093562657
1000 True [[-0.28006997  0.01855943]]
Game episode: 8/100, High Score: 199, Exploration rate: 0.9655206468094844
1000 True [[-0.5206837  0.0100022]]
Game episode: 9/100, High Score: 199, Exploration rate: 0.960693043575437
1000 True [[-0.3426145  -0.00118037]]
Game episode: 10/100, High Scor

Game episode: 74/100, High Score: 199, Exploration rate: 0.6935613678313175
1000 True [[-0.49895708  0.01418513]]
Game episode: 75/100, High Score: 199, Exploration rate: 0.6900935609921609
1000 True [[-0.60188058  0.00377283]]
Game episode: 76/100, High Score: 199, Exploration rate: 0.6866430931872001
1000 True [[-0.463705    0.02095831]]
Game episode: 77/100, High Score: 199, Exploration rate: 0.6832098777212641
1000 True [[-6.49051172e-01 -1.48570687e-04]]
Game episode: 78/100, High Score: 199, Exploration rate: 0.6797938283326578
1000 True [[-0.56298293  0.01846195]]
Game episode: 79/100, High Score: 199, Exploration rate: 0.6763948591909945
1000 True [[-0.29652644 -0.01082997]]
Game episode: 80/100, High Score: 199, Exploration rate: 0.6730128848950395
1000 True [[-0.63145377 -0.00611752]]
Game episode: 81/100, High Score: 199, Exploration rate: 0.6696478204705644
1000 True [[-0.56356506  0.0005945 ]]
Game episode: 82/100, High Score: 199, Exploration rate: 0.6662995813682115
1000

Game episode: 146/100, High Score: 199, Exploration rate: 0.483444593917636
1000 True [[-0.55054056 -0.00149497]]
Game episode: 147/100, High Score: 199, Exploration rate: 0.4810273709480478
1000 True [[-0.70208775  0.00245129]]
Game episode: 148/100, High Score: 199, Exploration rate: 0.47862223409330756
1000 True [[-0.44134429 -0.0049253 ]]
Game episode: 149/100, High Score: 199, Exploration rate: 0.47622912292284103
1000 True [[-0.60949103  0.01067138]]
Game episode: 150/100, High Score: 199, Exploration rate: 0.4738479773082268
1000 True [[-0.61141586  0.00868872]]
Game episode: 151/100, High Score: 199, Exploration rate: 0.47147873742168567
1000 True [[-0.73153888  0.00171177]]
Game episode: 152/100, High Score: 199, Exploration rate: 0.46912134373457726
1000 True [[-0.59929977  0.00328399]]
Game episode: 153/100, High Score: 199, Exploration rate: 0.46677573701590436
1000 True [[-0.79912025  0.00687298]]
Game episode: 154/100, High Score: 199, Exploration rate: 0.4644418583308248