<a href="https://colab.research.google.com/github/seifmostafa73/Cartpole-DDQL/blob/main/DDQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pinned library versions. The code below uses the Gym 0.26 API
# (`reset()` returning `(observation, info)` and `step()` returning a 5-tuple).
# NOTE(review): atari-py does not appear to be used by the CartPole code in this notebook — confirm before removing.
%pip install gym==0.26.2 tensorflow==2.11.0 keras==2.11.0 atari-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Operation cancelled by user[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/cli/base_command.py", line 180, in _main
    status = self.run(options, args)
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/cli/req_command.py", line 199, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/commands/install.py", line 385, in run
    conflic

In [None]:
# Mount Google Drive at /content/drive; the training and evaluation cells
# read and write model weights and the reward log under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ***DQL Pseudocode Summary***
1) Initialize replay memory capacity.
2) Initialize the policy network with random weights.
3) Clone the policy network, and call it the target network.
4) For each episode:
5) Initialize the starting state.
6) For each time step:
7) Select an action.
8) Via exploration or exploitation
9) Execute selected action in an emulator.
10) Observe reward and next state.
11) Store experience in replay memory.
12) Sample random batch from replay memory.
13) Preprocess states from batch.
14) Pass batch of preprocessed states to policy network.
15) Calculate loss between output Q-values and target Q-values.
16) Requires a pass to the target network for the next state
17) Gradient descent updates weights in the policy network to minimize loss.
18) After a fixed number of time steps (in this notebook: every `TARGET_UPDATE` episodes), weights in the target network are updated to the weights in the policy network.

In [None]:
#includes
from collections import deque 
import gym
import math
import datetime
import codecs, json 
import time
import os
import random
import numpy as np
import matplotlib
from collections import namedtuple
from itertools import count
from PIL import Image
import tensorflow as tf
import gc
from tensorflow.keras import backend as k
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import Callback

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [None]:
# Hyperparameters

EPISODES = 1001        # number of training episodes the agent plays
MAX_STEPS = 1000           # maximum number of steps per episode; the step loop stops once this many steps have run
BATCH_SIZE = 128         # number of experiences sampled from replay memory for each training update of the policy network
DISCOUNT_RATE  = 0.95   # 𝛾: discount factor applied to the target network's best next-state Q-value
exploration_rate = 1     # ε: initial probability of taking a random action (exploring) instead of the greedy action (exploiting)
MAX_EXPLORATION_RATE = 1  # upper bound of ε used by the exponential decay formula
MIN_EXPLORATION_RATE = 0.01  # lower bound that ε decays towards
EXPLOARTION_DECAY = 0.0001  # exponential decay constant for ε, multiplied by the total number of environment steps taken (name is misspelled but referenced elsewhere)
TARGET_UPDATE = 10      # number of episodes between copies of the policy network weights into the target network
MEMORY_SIZE = 2000    # capacity of the replay memory
LEARNING_RATE  = 0.001    # α: learning rate passed to the Adam optimizer

MODEL_NAME = 'DQL_MODEL'  # NOTE(review): not referenced by the visible code — confirm whether it is still needed

In [None]:
# Experience: immutable record of a single environment transition, as stored in replay memory.
Experience = namedtuple('Experience', 'state action new_state reward done')

In [None]:
#ReplayMemory Class
class ReplayMemory():
    """Fixed-capacity ring buffer of experiences used for experience replay.

    New experiences are appended until the buffer is full; after that the
    oldest entry is overwritten first.
    """

    def __init__(self, mem_capacity):
        """Create an empty buffer.

        Args:
            mem_capacity: maximum number of experiences kept; must be positive.

        Raises:
            ValueError: if ``mem_capacity`` is not positive (a zero capacity
                would otherwise fail later with ZeroDivisionError in ``push``).
        """
        if mem_capacity <= 0:
            raise ValueError("mem_capacity must be a positive integer")
        self.memory_capacity = mem_capacity  # max number of stored experiences
        self.memory = []  # stored experiences, at most memory_capacity of them
        self.count = 0  # total number of experiences ever pushed

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

    def push(self, new_experience):
        """Store an experience, overwriting the oldest one once the buffer is full."""
        if len(self.memory) < self.memory_capacity:
            self.memory.append(new_experience)
        else:
            # count keeps growing, so the modulo walks the slots oldest-first
            self.memory[self.count % self.memory_capacity] = new_experience
        self.count += 1

    def sample(self, batch_size=None):
        """Return a random batch of distinct stored experiences.

        Args:
            batch_size: number of experiences to draw; defaults to the
                module-level ``BATCH_SIZE`` when omitted.

        Raises:
            ValueError: (from ``random.sample``) if fewer than ``batch_size``
                experiences are stored; check ``can_sample`` first.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        return random.sample(self.memory, batch_size)

    def can_sample(self, batch_size):
        """Return True when at least ``batch_size`` experiences are stored."""
        return len(self.memory) >= batch_size

In [None]:
#env manager class
class Environment():
    """Wrapper around a Gym ``CartPole-v0`` environment with epsilon-greedy action selection.

    A single environment state is represented by the latest observation,
    stored in ``self.current_state``.

    Attributes:
        env: the underlying Gym environment.
        current_state: most recent observation returned by the environment.
        done: True if the last step ended the episode.
        current_step: total number of steps taken across all episodes
            (drives the exploration-rate decay; not reset per episode).
        num_actions: size of the environment's discrete action space.
        exploration_rate: current epsilon for epsilon-greedy selection.
    """

    def __init__(self, num_actions=None):
        """Create the environment and reset it.

        Args:
            num_actions: accepted for backward compatibility but ignored; the
                action count is always read from the environment's action space.
        """
        self.env = gym.make('CartPole-v0')
        self.done = False
        self.current_step = 0
        self.reset()
        self.num_actions = self.num_actions_available()
        self.exploration_rate = exploration_rate

    def select_action(self, policy_net):
        """Pick an action epsilon-greedily and decay the exploration rate.

        Args:
            policy_net: object whose ``predict(state)`` returns the Q-values
                for ``self.current_state``.

        Returns:
            The index of the chosen action.
        """
        # Draw with the current epsilon first, then decay it for the next call.
        explore = random.random() <= self.exploration_rate
        self.exploration_rate = MIN_EXPLORATION_RATE + (
            MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE
        ) * np.exp(-EXPLOARTION_DECAY * self.current_step)

        if explore:
            return random.randrange(self.num_actions)
        return int(np.argmax(policy_net.predict(self.current_state)))

    def take_action(self, action):
        """Apply ``action`` to the environment and return the resulting Experience.

        The episode is marked done when the environment reports either
        ``terminated`` or ``truncated``. The -10 penalty is applied only on
        ``terminated``, so a time-limit truncation keeps the environment's reward.
        """
        self.current_step += 1
        old_state = self.current_state
        # Gym 0.26 step API: (observation, reward, terminated, truncated, info)
        new_state, reward, terminated, truncated, _ = self.env.step(action)
        self.done = bool(terminated or truncated)
        if terminated:
            reward = -10
        self.current_state = new_state
        return Experience(old_state, action, new_state, reward, self.done)

    #some wrapper functions
    def reset(self):
        """Start a new episode and store its first observation."""
        # Gym 0.26 reset returns (observation, info); only the observation is kept.
        self.current_state = self.env.reset()[0]
        self.done = False

    def close(self):
        """Print a notice and close the underlying Gym environment."""
        print("Closing")
        self.env.close()

    def render(self):
        """Return the result of ``env.render()`` converted to a NumPy array."""
        return np.array(self.env.render())

    def num_actions_available(self):
        """Return the number of discrete actions in the environment's action space."""
        return self.env.action_space.n

    def just_starting(self):
        """Return True when no ``current_screen`` has been recorded.

        ``current_screen`` is never assigned by this class, so it is read with
        a default to avoid raising AttributeError.
        """
        return getattr(self, 'current_screen', None) is None

In [None]:
#DQN(deep q network) class
class DQN():
    """Small fully connected Q-network built with Keras.

    Maps a state vector to one Q-value per action using two hidden layers of
    24 ReLU units and a linear output layer, trained with Adam on MSE loss.
    """

    def __init__(self, inputshape, outputshape):
        """Build, compile and print a summary of the network.

        Args:
            inputshape: shape tuple of a single state (passed as ``input_shape``).
            outputshape: number of output units, one per action.
        """
        self.input_shape = inputshape
        layers = [
            Dense(24, input_shape=inputshape, activation='relu'),  # hidden layer 1, fed with states
            Dense(24, activation='relu'),  # hidden layer 2
            Dense(outputshape, activation='linear'),  # one linear Q-value per action
        ]
        network = Sequential(layers)
        network.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='mse')
        network.summary()
        self.model = network

    def predict(self, state):
        """Return the model's prediction for a single state, reshaped into a batch of one."""
        batch_of_one = np.reshape(state, (1, self.input_shape[0]))
        return self.model.predict(x=batch_of_one, verbose=0)

class ClearMemory(Callback):
    """Keras callback that runs cleanup at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # Force a garbage-collection pass, then clear the Keras backend session.
        # NOTE(review): presumably a workaround for memory growth caused by
        # calling fit() many times in the training loop — confirm it is still needed.
        gc.collect()
        k.clear_session()

In [None]:
# Module-level replay buffer used by DQL(); created with capacity MEMORY_SIZE.
replay_memory = ReplayMemory(MEMORY_SIZE) 
# Accumulated reward per episode, indexed by episode number (filled in by DQL()).
rewards_buffer = np.zeros(EPISODES)

def _train_policy_on_batch(policy_net, target_net, batch):
    """Run one gradient update of the policy network on a batch of experiences."""
    states = np.array([exp.state for exp in batch])
    next_states = np.array([exp.new_state for exp in batch])
    actions = np.array([exp.action for exp in batch], dtype=np.intp)
    rewards = np.array([exp.reward for exp in batch], dtype=np.float64)
    # 0.0 for experiences that ended the episode, so they get no bootstrapped value
    not_done = 1.0 - np.array([exp.done for exp in batch], dtype=np.float64)

    # Start from the policy network's own predictions so only the taken
    # action's Q-value contributes to the loss.
    targets = np.array(policy_net.model.predict(states, verbose=0))
    next_q = target_net.model.predict(next_states, verbose=0)
    targets[np.arange(len(batch)), actions] = (
        rewards + DISCOUNT_RATE * next_q.max(axis=1) * not_done
    )

    policy_net.model.fit(
        states,
        targets,
        batch_size=BATCH_SIZE,
        callbacks=[ClearMemory()],  # Keras documents `callbacks` as a list
        verbose=0,
        shuffle=False,
    )


def DQL(envir, policy_net, target_net, save_dir='/content/drive/MyDrive/Colab Notebooks/DQL Data'):
    """Train ``policy_net`` with deep Q-learning using a target network.

    Runs ``EPISODES`` episodes of at most ``MAX_STEPS`` steps each. Every step
    is pushed to the module-level ``replay_memory`` and its reward is added to
    ``rewards_buffer[episode]``. After each non-final step, once the memory
    holds at least ``BATCH_SIZE`` experiences, the policy network is fitted on
    a random batch. The target network is synchronised with the policy network
    every ``TARGET_UPDATE`` episodes. Every 50 episodes the policy weights and
    the reward history are written to ``save_dir``.

    Args:
        envir: environment wrapper providing ``reset``, ``select_action``,
            ``take_action`` and ``exploration_rate``.
        policy_net: network being trained (exposes ``.model``).
        target_net: network used for next-state Q-values (exposes ``.model``).
        save_dir: directory for ``POLICY_NET_3.h5`` and ``output.json``.
    """
    target_net.model.set_weights(policy_net.model.get_weights())
    episodes_since_sync = 0
    avg_window = 100  # number of most recent episodes in the reported average
    weights_path = os.path.join(save_dir, 'POLICY_NET_3.h5')
    rewards_path = os.path.join(save_dir, 'output.json')

    for episode in range(EPISODES):
        envir.reset()
        for _ in range(MAX_STEPS):
            action = envir.select_action(policy_net)
            experience = envir.take_action(action)

            replay_memory.push(experience)
            rewards_buffer[episode] += experience.reward

            if experience.done:
                break
            if replay_memory.can_sample(BATCH_SIZE):
                _train_policy_on_batch(policy_net, target_net, replay_memory.sample())

        episodes_since_sync += 1
        if episodes_since_sync >= TARGET_UPDATE:
            episodes_since_sync = 0
            target_net.model.set_weights(policy_net.model.get_weights())

        # Mean over the trailing window; np.mean of a single entry was just that episode's reward.
        window_start = max(0, episode - avg_window + 1)
        avg_reward = np.mean(rewards_buffer[window_start:episode + 1])
        print("Current Episode :",episode," Avg Rew: ",avg_reward)
        print(envir.exploration_rate)

        if episode % 50 == 0:
            policy_net.model.save_weights(weights_path)
            # Context manager guarantees the log file is flushed and closed.
            with open(rewards_path, 'w', encoding='utf-8') as rewards_file:
                json.dump(rewards_buffer.tolist(), rewards_file, separators=(',', ':'), sort_keys=True, indent=4)
            print("Saved Policy model")

In [None]:
# Entry point: train the agent when TRAING is True, otherwise load saved
# weights and play rendered episodes until interrupted.
TRAING = False
envir = Environment(2)
policy_net = DQN(envir.env.observation_space.shape,envir.num_actions)
target_net = DQN(envir.env.observation_space.shape,envir.num_actions)

try:
    if TRAING:
        # To resume from a checkpoint, load it before training, e.g.:
        #   policy_net.model.load_weights('POLICY_NET_2.h5')
        DQL(envir,policy_net,target_net)
    else:
        policy_net.model.load_weights('/content/drive/MyDrive/Colab Notebooks/DQL Data/POLICY_NET_3_100STEPS.h5')
        # With render_mode="human" Gym renders on every reset/step, so no
        # explicit env.render() call is needed.
        env = gym.make('CartPole-v0',render_mode="human")
        try:
            observation = env.reset()[0]
            reward = 0
            step = 0
            # Plays episodes back-to-back; stop by interrupting the cell.
            while True:
                action = np.argmax(policy_net.predict(observation))
                # Gym 0.26 step API: (observation, reward, terminated, truncated, info)
                observation, r, terminated, truncated, _ = env.step(action)
                step += 1
                reward += r
                # An episode ends on failure (terminated) or on the time limit (truncated).
                if terminated or truncated:
                    print('Episode finished after {} timesteps, total rewards {}'.format(step,reward))
                    observation = env.reset()[0]
                    reward = 0
                    step = 0
        except KeyboardInterrupt:
            print('Evaluation interrupted')
        finally:
            # Always release the render window, even when interrupted.
            env.close()
finally:
    envir.close()

In [None]:
# Close the Environment wrapper created above (prints "Closing" and closes its Gym env).
envir.close()