In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import time
import random
from collections import deque
import tensorflow as tf
from tqdm import tqdm 
from tensorflow.keras import Sequential 
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop
import math
from random import sample
from tensorflow.keras.optimizers import Adam
from tqdm import trange
import pandas as pd
import gc
import os

In [2]:
# Shared environment instance used by the training loop further down.
env = gym.make("MountainCar-v0")


In [3]:
# Problem dimensions handed to the agent.
# NOTE(review): presumably 2 = (position, velocity) and 3 = discrete actions
# of MountainCar-v0 -- confirm against the environment's spaces.
state_space = 2
obs_space = 2
action_space = 3
intermediate_dim = 16

In [4]:
class QModel:
    """Online ("policy") and target Q-networks with identical architecture.

    Attributes:
        input_dim: size of the state vector fed to the network.
        output_dim: number of Q-values produced (one per action).
        lr: learning rate given to the RMSprop optimizer.
        Qpolicy: network that is trained.
        Qtarget: second network, initialised with a copy of Qpolicy's weights.
    """

    def __init__(self, input_dim, output_dim, lr):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr
        self.Qpolicy = self.create()
        self.Qtarget = self.create()
        # Both networks must start from the same parameters.
        self.Qtarget.set_weights(self.Qpolicy.get_weights())

    def create(self):
        """Build and compile one MLP mapping (batch, 1, input_dim) -> (batch, 1, output_dim).

        Returns:
            A compiled Keras Sequential model (MSE loss, RMSprop optimizer).
        """
        model = Sequential()
        model.add(tf.keras.layers.InputLayer(input_shape=(1, self.input_dim)))
        for units in (512, 256, 128):
            model.add(Dense(units, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.output_dim, activation='linear', kernel_initializer='he_uniform'))
        # No 'accuracy' metric: the network regresses Q-values, so classification
        # accuracy is meaningless here and only adds work to every batch.
        model.compile(
            optimizer=RMSprop(learning_rate=self.lr, rho=0.95, epsilon=1e-7),
            loss="mse",
        )
        return model

In [5]:
# Notebook-level default minibatch size.
BATCH_SIZE = 64


In [6]:
class DQNSolver:
    """DQN agent: epsilon-greedy policy, replay memory and a target network.

    Args:
        state_space: number of components in one state vector.
        action_space: number of discrete actions.
        decay_coe: multiplicative epsilon decay applied by decrement_epsilon().
        memory_size: maximum number of transitions kept in replay memory.
        C: target-network update interval. Stored only; no method of this
            class reads it.
        LEARNING_RATE: learning rate passed to QModel.
        GAMMA: discount factor for bootstrapped targets.
        EPSILON_MAX: initial exploration rate.
        EPSILON_MIN: lower bound for the exploration rate.
        BATCH_SIZE: number of transitions sampled per training step.
    """

    def __init__(self, state_space, action_space, decay_coe=0.99,
                 memory_size=20000, C=5, LEARNING_RATE=1e-4, GAMMA=0.99,
                 EPSILON_MAX=1.0, EPSILON_MIN=0.01, BATCH_SIZE=64):
        self.states = state_space
        self.n_actions = action_space
        self.actions = list(range(self.n_actions))

        self.lr = LEARNING_RATE
        self.gamma = GAMMA
        self.epsilon = EPSILON_MAX
        self.decay_coe = decay_coe
        self.min_eps = EPSILON_MIN
        self.batch_size = BATCH_SIZE
        self.memory = deque(maxlen=memory_size)  # replay memory
        self.C = C

        self.terminal_state = False  # end of the episode
        self.target_counter = 0

        self.model = QModel(self.states, self.n_actions, self.lr)
        self.positive_rewards_list = []

    def find_positive_rewards(self):
        """Record the memory indices of transitions whose reward exceeds 10.

        The indices are a snapshot: they go stale as soon as the deque is
        modified (append on a full deque shifts every index).
        """
        self.positive_rewards_list = [
            index for index, transition in enumerate(self.memory)
            if transition[2] > 10
        ]

    def state_shape(self, states):
        """Return `states` as an array with one extra leading axis of length 1."""
        return np.expand_dims(np.array(states), axis=0)

    def update_target_model(self):
        """Copy the weights of the policy network into the target network."""
        self.model.Qtarget.set_weights(self.model.Qpolicy.get_weights())

    def decrement_epsilon(self):
        """Decay epsilon by `decay_coe`, never letting it drop below `min_eps`."""
        # Clamping in one step avoids the undershoot of the if/else form, where
        # epsilon could sit below min_eps for one episode before being reset.
        self.epsilon = max(self.min_eps, self.epsilon * self.decay_coe)

    def forget(self):
        """Drop every stored transition."""
        self.memory.clear()

    def remember(self, s, a, r, s_, done):
        """Append one transition (state, action, reward, next state, done)."""
        self.memory.append([self.state_shape(s), a, r, self.state_shape(s_), done])

    def act(self, states):
        """Pick an action epsilon-greedily for a single state vector.

        Returns:
            int: index of the chosen action.
        """
        if np.random.random() > (1 - self.epsilon):
            return int(np.random.choice(self.actions))
        # The network is declared with input shape (1, state_dim), i.e. it
        # expects (batch, 1, state_dim). The reshape result must be kept:
        # ndarray.reshape is not in-place.
        net_input = np.asarray(states).reshape(1, 1, self.states)
        q_values = np.array(self.model.Qpolicy.predict_on_batch(net_input))
        return int(np.argmax(q_values))

    def minibatch(self):
        """Sample `batch_size` transitions uniformly without replacement.

        Raises:
            ValueError: if fewer than `batch_size` transitions are stored.
        """
        return random.sample(self.memory, self.batch_size)

    def train(self):
        """Run one gradient step on a sampled minibatch.

        Does nothing until the memory holds at least `batch_size` transitions.

        Returns:
            The value returned by `train_on_batch(..., return_dict=True)`, or
            None when no training step was performed.
        """
        if len(self.memory) < self.batch_size:
            return None

        batch = self.minibatch()
        n = len(batch)

        # Use the instance's own dimensions, not notebook-level globals, so a
        # solver built with a non-default batch/state size still works.
        states = np.array([row[0] for row in batch]).reshape(n, 1, self.states)
        next_states = np.array([row[3] for row in batch]).reshape(n, 1, self.states)
        actions = np.array([int(row[1]) for row in batch])
        rewards = np.array([row[2] for row in batch], dtype=np.float64)
        dones = np.array([bool(row[4]) for row in batch])

        # np.array(...) copies, so the prediction can be edited in place below.
        q_values = np.array(
            self.model.Qpolicy.predict_on_batch(states)
        ).reshape(n, self.n_actions)
        next_q = np.array(
            self.model.Qtarget.predict_on_batch(next_states)
        ).reshape(n, self.n_actions)

        # Terminal transitions get the raw reward; others bootstrap from the
        # target network's best next-state value.
        td_targets = np.where(
            dones, rewards, rewards + self.gamma * next_q.max(axis=1)
        )
        # Only the taken action's Q-value is moved; the other outputs keep the
        # network's own prediction and so contribute zero error.
        q_values[np.arange(n), actions] = td_targets

        targets = q_values.reshape(n, 1, self.n_actions)
        return self.model.Qpolicy.train_on_batch(states, targets, return_dict=True)

                
  

In [7]:
# Agent under training; every other hyperparameter keeps its default.
dqn_solver = DQNSolver(
    state_space=state_space,
    action_space=action_space,
)


In [8]:
# Per-episode logs filled in by training(): total reward per episode and the
# cumulative count of train() calls.
scores, no_train = [], []


In [9]:
# Sanity check: element 0 of the reset() result is the initial observation.
s = env.reset()
s[0]

array([-0.5979995,  0.       ], dtype=float32)

In [10]:
def training(dqn_solver,n_episodes=4000,maxt=200):
    """Train `dqn_solver` on the module-level `env` with Deep Q-Learning.

    Each step the agent acts, the transition is stored, and one training step
    is attempted. After every episode epsilon is decayed; the target network
    is synchronised every 5 episodes and progress is printed every 20.

    Side effects: appends the episode return to the module-level `scores`
    list and the cumulative number of train() calls to `no_train`.

    Params
    ======
        dqn_solver: agent exposing act, remember, train, decrement_epsilon
            and update_target_model
        n_episodes (int): maximum number of training episodes
        maxt (int): maximum number of timesteps per episode
    """
    trains = 0
    for i_episode in range(1, n_episodes + 1):
        # reset() returns a tuple whose first element is the observation
        state = env.reset()[0]
        score = 0

        for _ in tqdm(range(maxt), leave=False, desc=str(i_episode)):
            action = dqn_solver.act(state)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Only a true terminal state disables bootstrapping; a time-limit
            # truncation is stored as non-terminal, exactly as before.
            dqn_solver.remember(state, action, reward, next_state, terminated)

            dqn_solver.train()
            trains += 1
            state = next_state
            score += reward
            # Stop on truncation as well: stepping an environment after its
            # episode has ended is invalid when maxt exceeds the env limit.
            if terminated or truncated:
                break

        dqn_solver.decrement_epsilon()
        scores.append(score)        # save most recent score
        no_train.append(trains)
        if i_episode % 5 == 0:
            dqn_solver.update_target_model()
        if i_episode % 20 == 0:
            # The former end="" was passed to str.format (silently ignored),
            # so each report already ended with a newline; keep that output.
            print('\rEpisode {}\tScore: {:.2f}'.format(i_episode, score))




In [11]:
# Long-running cell: uses the defaults n_episodes=4000 and maxt=200, and
# appends to the module-level `scores` and `no_train` lists as it goes.
training(dqn_solver)

2:   6%|▌         | 12/200 [00:00<00:05, 33.96it/s] 



                                                     

Episode 20	Score: -200.00


                                                     

Episode 40	Score: -200.00


                                                     

Episode 60	Score: -200.00


                                                     

Episode 80	Score: -200.00


                                                      

Episode 100	Score: -200.00


                                                      

Episode 120	Score: -200.00


                                                      

Episode 140	Score: -200.00


                                                      

Episode 160	Score: -200.00


                                                      

Episode 180	Score: -200.00


                                                      

Episode 200	Score: -200.00


                                                      

Episode 220	Score: -200.00


                                                      

Episode 240	Score: -200.00


                                                      

Episode 260	Score: -200.00


                                                      

Episode 280	Score: -200.00


                                                      

Episode 300	Score: -200.00


                                                      

Episode 320	Score: -200.00


                                                      

Episode 340	Score: -200.00


                                                      

Episode 360	Score: -200.00


                                                      

Episode 380	Score: -200.00


                                                      

Episode 400	Score: -200.00


                                                      

Episode 420	Score: -200.00


                                                      

Episode 440	Score: -200.00


                                                      

Episode 460	Score: -200.00


                                                      

Episode 480	Score: -200.00


                                                      

Episode 500	Score: -154.00


                                                      

Episode 520	Score: -200.00


                                                      

Episode 540	Score: -144.00


                                                      

Episode 560	Score: -200.00


                                                      

Episode 580	Score: -200.00


                                                      

Episode 600	Score: -200.00


                                                      

Episode 620	Score: -200.00


                                                      

Episode 640	Score: -200.00


                                                      

Episode 660	Score: -200.00


                                                      

Episode 680	Score: -166.00


                                                      

Episode 700	Score: -200.00


                                                      

Episode 720	Score: -196.00


                                                      

Episode 740	Score: -200.00


                                                      

Episode 760	Score: -200.00


                                                      

Episode 780	Score: -116.00


                                                      

Episode 800	Score: -168.00


                                                      

Episode 820	Score: -200.00


                                                      

Episode 840	Score: -152.00


                                                      

Episode 860	Score: -148.00


                                                      

Episode 880	Score: -173.00


                                                      

Episode 900	Score: -85.00


                                                      

Episode 920	Score: -146.00


                                                      

Episode 940	Score: -148.00


                                                      

Episode 960	Score: -127.00


                                                      

Episode 980	Score: -117.00


                                                       

Episode 1000	Score: -118.00


