In [None]:
import numpy as np
import torch
import tensorflow as tf
import gym
import sys
from collections import deque
import random

In [6]:
from explorerl.utils import *
from explorerl.agents import *

## DQN Tensorflow

In [8]:
env = gym.make('CartPole-v0')

In [7]:
class DQN(tf.keras.Model):
    def __init__(self,input_shape,output_shape,batch_norm=False):
        super(DQN, self).__init__()
        self.linear1 = create_linear_tf(input_dims=input_shape,output_dims=64,batch_norm=batch_norm)
        self.linear2 = create_linear_tf(input_dims=64,output_dims=32,batch_norm=batch_norm)
        self.linear3 = create_linear_tf(input_dims=32,output_dims=16,batch_norm=batch_norm)
        self.linear4 = create_linear_tf(input_dims=16,output_dims=output_shape,batch_norm=batch_norm)
        
    def call(self,x,training=True):
        x = tf.nn.relu(self.linear1(x))
        x = tf.nn.relu(self.linear2(x))
        x = tf.nn.relu(self.linear3(x))
        x = self.linear4(x)
        x = tf.nn.softmax(x)
        return x

In [9]:
class DQNTF(BaseAgent):
    def __init__(self, observation_space,action_space,epsilon=1.0, decay= 0.98, gamma=1.0, 
                 learning_rate=0.01, featurizer=None,scaler=None,use_bias = False,replay_size=2500,batch_size=32):
        super(DQNTF, self).__init__(observation_space,action_space,epsilon, decay, gamma, 
                 learning_rate, featurizer,scaler,use_bias)
        self.create_model()
        self.replay_memory = deque(maxlen=replay_size)
    
    def create_model(self):
        input_space = self.observation_space  
        if self.featurizer:
            input_space = self.featurizer.transform([np.arange(self.observation_space)]).flatten().shape[0]
        if self.use_bias:
            input_space += 1        
        self.model["output"] = DQN(input_shape=input_space,output_shape=self.action_space)
        
        def mse_loss(model,predictions,targets):
            return tf.reduce_mean(tf.square(tf.subtract(predictions,targets))) + tf.add_n(model.losses)
        
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

        def train_step(model,inputs,targets):
            with tf.GradientTape() as tape:
                predictions = model(inputs)
                total_loss = mse_loss(model,predictions,targets)
            gradients = tape.gradient(total_loss,model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,model.trainable_variables))
                
        self.model["loss"] = mse_loss
        self.model["training_op"] = train_step
        print("Model Created!")
    
    def default_policy(self):
        return self.epsilon_greedy()
    
    def epsilon_greedy(self):
        def act(obs):
            estimator = self.model["output"]
            qvals = estimator(self.featurize_state(obs))
            if np.random.random() < self.epsilon:
                return np.random.choice(self.action_space) , qvals
            return np.argmax(qvals) , qvals
        return act
                
    def greedy(self):
        def act(obs):
            estimator = self.model["output"]
            qvals = estimator(self.featurize_state(obs))
            return np.argmax(qvals) , qvals
        return act
    
    def initialize_replay(self,env,policy,size=500):
        for i in range(size):
            obs = env.observation_space.sample()
            action , qvals = policy(obs)
            next_obs, reward, done, info = env.step(action)
            self.replay_memory.append([state,action,reward,next_obs,done])
            
    def replay(self,policy):
        sampled_ = random.sample(self.replay_memory,self.batch_size)
        obs = []
        targets = []
        training_op = self.model["training_op"]
        for state, action, reward, next_obs, done in sampled_:
            _ , qvals = policy(state)
            obs.append(state.flatten())
            target = tf.stop_gradient(qvals).numpy().flatten()
            target[action] = reward
            _ , nextqvals = policy(next_obs)
            if done == False:
                target[action] += np.max(tf.stop_gradient(nextqvals))
            targets.append(target)
        training_op(self.model["output"],np.array(obs),np.array(targets))
    
    def train(self,env,episodes=200,early_stop=False,stop_criteria=20):
        prev_avg = -float('inf')
        orig_epsilon = self.epsilon
        bar = tqdm(np.arange(episodes),file=sys.stdout)
        policy = self.epsilon_greedy()
        self.initialize_replay(env,policy)
        criteria = 0 #stopping condition
        loss = self.model["loss"]
        training_op = self.model["training_op"]
        for i in bar:
            observation = env.reset()
            self.epsilon *= (self.decay**i)
            rewards = 0
            end = 0
            for t in range(10000):
                action , qvals = policy(observation)
                next_obs, reward, done, info = env.step(action)
                self.replay_memory.append([observation,action,reward,qvals,next_obs,done])
                rewards += reward
                end = t
                if done:
                    break
                self.replay(policy)
                observation = next_obs
                
            self.stats["num_steps"].append(end)
            self.stats["episodes"].append(i)
            self.stats["rewards"].append(rewards)
            avg = np.mean(self.stats["rewards"][::-1][:25])
            bar.set_description("Epsilon and reward {} : {}".format(self.epsilon,avg))
            
            if avg < prev_avg:
                criteria += 1
                
            if early_stop:
                if criteria >= stop_criteria:
                    break
                    
            prev_avg = avg
        return self.stats

In [11]:
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
s = DQNTF(observation_space,action_space)
sstats_dict = s.train(env,episodes=100,early_stop=False,stop_criteria=20)

Model Created!
  0%|          | 0/100 [00:00<?, ?it/s]

InvalidArgumentError: In[0] is not a matrix. Instead it has shape [4] [Op:MatMul]

In [None]:
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
s = QLearningTF(observation_space=observation_space,action_space=action_space,use_bias=True,featurizer=featurizer,scaler=scaler)
stats_dict = s.train(env,episodes=100,early_stop=False,stop_criteria=20)
plotting.plot_metrics(s.stats)

## DQN Torch

In [None]:
def create_linear_torch(input_dims,output_dims,batch_norm=False):
    layers = []
    lin = torch.nn.Linear(input_dims,output_dims)
    layers.append(lin)
    if batch_norm:
        layers.append(torch.nn.BatchNorm1d(output_dims))
    return torch.nn.Sequential(*layers)

In [None]:
class DQN(torch.nn.Module):
    def __init__(self,input_size,output_size):
        super(DQN, self).__init__()
        self.linear1 = create_linear(input_size,64)
        self.linear2 = create_linear(64,32)
        self.linear3 = create_linear(32,16)
        self.linear4 = torch.nn.Linear(16,output_size)
        
    def forward(self,x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = self.linear4(x)
        x = x.reshape(x.size(0), -1)
        x = F.softmax(x)
        return x

In [None]:
class DQNTorch():
    def __init__(self, env, epsilon=1.0, decay= 0.98, gamma=1.0, 
                 learning_rate=0.01, featurize=False, use_bias = False):
        self.epsilon = epsilon
        self.decay = decay
        self.env = env
        self.action_space = env.action_space.n
        self.state_space = env.observation_space.shape[0]
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.featurize = featurize
        self.featurizer = None
        self.scaler = None
        self.model = {}
        self.use_bias = use_bias
        self.create_model()
        self.stats = {"rewards":[],"episodes":[],"num_steps":[]}
    
    def featurize_state(self, state):
        """
        Returns the featurized representation for a state.
        """
        if self.featurize:
            scaled = self.scaler.transform([state])
            featurized = self.featurizer.transform(scaled)
            if self.use_bias:
                return np.concatenate([1],featurized[0])
            return featurized[0]
        if self.use_bias:
            return np.concatenate(([1],state))
        return state  
    
    def create_model(self):
        input_space = self.state_space    
        # featurizing code taken from https://github.com/dennybritz/reinforcement-learning/tree/master/FA
        # Used to convert a state to a featurizes represenation.
        # Use RBF kernels with different variances to cover different parts of the space
        if self.featurize:
            input_space = 400
            observation_examples = np.array([self.env.observation_space.sample() for x in range(10000)])
            self.scaler = sklearn.preprocessing.StandardScaler()
            self.scaler.fit(observation_examples)

            self.featurizer = sklearn.pipeline.FeatureUnion([
                    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
                    ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
                    ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
                    ("rbf4", RBFSampler(gamma=0.5, n_components=100))
                    ])
            self.featurizer.fit(observation_examples)
        if self.use_bias:
            input_space += 1
        self.model["optimizers"] = []
        self.model["output"] = DQN(input_size=input_space,output_size=self.action_space)
        
        self.model["loss"] = torch.nn.MSELoss()
        self.model["optimizer"] = torch.optim.Adam(params=self.model["output"].parameters(),
                                                   lr=self.learning_rate,weight_decay=0.0001)

        print("Model Created!")
    
    def epsilon_greedy(self):
        def act(obs):
            qvals = []
            func = self.model["output"]
            qvals = func(obs)
            if np.random.random() < self.epsilon:
                return np.random.choice(self.action_space) , qvals
            return np.argmax(qvals) , qvals
        return act
    
    def greedy(self):
        def act(obs):
            qvals = []
            func = self.model["output"]
            for action in range(self.action_space):
                W = self.model["Ws"][action]
                qval = func(obs,W)[0]
                qvals.append(qval)
            return np.argmax(qvals) , qvals
        return act
    
    def train(self,episodes=200,early_stop=False,stop_criteria=20):
        prev_avg = -float('inf')
        orig_epsilon = self.epsilon
        bar = tqdm(np.arange(episodes),file=sys.stdout)
        policy = self.epsilon_greedy()
        criteria = 0 #stopping condition
        loss_func = self.model["loss"]
        for i in bar:
            observation = env.reset()
            self.epsilon *= (self.decay**i)
            rewards = 0
            end = 0
            for t in range(10000):
                action , qvals = policy(observation)
                next_obs, reward, done, info = env.step(action)
                rewards += reward
                next_action , next_qs = policy(next_obs)
                target = reward + self.gamma*next_qs[next_action]
                                
                loss = loss_func(target.detach(),qvals[action])
                loss.backward()
                # Adjust weights & reset gradients
                end = t
                if done:
                    break
                observation = next_obs
                
            self.stats["num_steps"].append(end)
            self.stats["episodes"].append(i)
            self.stats["rewards"].append(rewards)
            avg = np.mean(self.stats["rewards"][::-1][:25])
            bar.set_description("Epsilon and reward {} : {}".format(self.epsilon,avg))
            
            if avg < prev_avg:
                criteria += 1
                
            if early_stop:
                if criteria >= stop_criteria:
                    break
                    
            prev_avg = avg
        return self.stats 