In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import tensorflow as tf
import gym
import sys
sys.path.append("../")
from tools import tools
from tools import plotting
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler
from tqdm import tqdm
%matplotlib inline

## DQN TORCH

In [9]:
# Scratch check: star-unpacking a list inside a list literal yields a new list with the same elements.
[*[[1,2],3,4]]

[[1, 2], 3, 4]

In [None]:
def create_linear(input_dims,output_dims,batch_norm=False):
    """
    Build a ``torch.nn.Sequential`` wrapping a single fully connected layer.

    Args:
        input_dims: number of input features of the linear layer.
        output_dims: number of output features of the linear layer.
        batch_norm: when true, a ``BatchNorm1d`` over ``output_dims`` features
            is placed right after the linear layer.

    Returns:
        A ``torch.nn.Sequential`` whose first module is the linear layer and
        whose optional second module is the batch-norm layer.
    """
    dense = torch.nn.Linear(input_dims, output_dims)
    # An empty tuple unpacks to nothing, so the Sequential holds only the linear layer.
    extras = (torch.nn.BatchNorm1d(output_dims),) if batch_norm else ()
    return torch.nn.Sequential(dense, *extras)

In [10]:
class DQN(torch.nn.Module):
    """
    Fully connected network mapping a batch of state vectors to one value per action.

    Layer widths are ``input_size -> 128 -> 64 -> 32 -> 16 -> output_size``.
    The hidden layers use ReLU, dropout (p=0.5) follows the first and third
    hidden activations, and a softmax is applied across the action dimension
    of the output.

    NOTE(review): because of the final softmax every output row lies in
    [0, 1] and sums to 1, so the network cannot represent unbounded Q-values.
    The softmax is kept to preserve the existing output contract; confirm
    whether raw linear outputs were intended.
    """
    def __init__(self,input_size,output_size):
        """
        Args:
            input_size: number of features in each input state vector.
            output_size: number of outputs (one per action).
        """
        super(DQN, self).__init__()
        self.linear1 = create_linear(input_size,128)
        self.linear2 = create_linear(128,64)
        self.linear3 = create_linear(64,32)
        self.linear4 = create_linear(32,16)
        # linear4 emits 16 features, so the head must accept 16. It used to be
        # declared with 32 inputs, which made every forward pass fail with a
        # shape mismatch.
        self.linear5 = torch.nn.Linear(16,output_size)
        
    def forward(self,x):
        """
        Compute the per-action outputs for a batch of states.

        Args:
            x: float tensor; assumed to be shaped (batch, input_size) since the
                first dimension is treated as the batch when reshaping.

        Returns:
            Tensor of shape (batch, output_size) whose rows sum to 1.
        """
        x = F.relu(self.linear1(x))
        # Follow the module's train/eval mode so that eval() disables dropout
        # (it was previously forced on, making inference stochastic).
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.linear4(x))
        x = self.linear5(x)
        x = x.reshape(x.size(0), -1)
        # Explicit dim: identical to what the implicit choice resolves to for a
        # 2-D tensor, without the deprecation warning.
        x = F.softmax(x, dim=1)
        return x

In [2]:
class DQNTorch():
    """
    Epsilon-greedy Q-learning agent that uses the ``DQN`` torch network as its
    action-value function.

    The agent can feed the network the raw observation, or an RBF-featurized
    version of it (``featurize=True``), optionally prefixed with a constant
    bias feature (``use_bias=True``). Training statistics are accumulated in
    ``self.stats`` under the keys "rewards", "episodes" and "num_steps".
    """
    def __init__(self, env, epsilon=1.0, decay= 0.98, gamma=1.0, 
                 learning_rate=0.01, featurize=False, use_bias = False):
        """
        Args:
            env: environment exposing ``action_space.n``,
                ``observation_space.shape`` / ``.sample()``, ``reset()`` and
                ``step(action)``.
            epsilon: initial exploration probability.
            decay: per-episode multiplicative decay applied to epsilon.
            gamma: discount factor used in the bootstrapped target.
            learning_rate: learning rate handed to the Adam optimizer.
            featurize: when true, observations are scaled and expanded with
                RBF features before being fed to the network.
            use_bias: when true, a constant 1 is prepended to every feature
                vector.
        """
        self.epsilon = epsilon
        self.decay = decay
        self.env = env
        self.action_space = env.action_space.n
        self.state_space = env.observation_space.shape[0]
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.featurize = featurize
        self.featurizer = None
        self.scaler = None
        self.model = {}
        self.use_bias = use_bias
        self.create_model()
        self.stats = {"rewards":[],"episodes":[],"num_steps":[]}
    
    def featurize_state(self, state):
        """
        Returns the featurized representation for a state.

        With ``featurize`` enabled the state is passed through the fitted
        scaler and featurizer; otherwise it is used as-is. With ``use_bias``
        enabled a constant 1 is prepended to the resulting vector.
        """
        if self.featurize:
            scaled = self.scaler.transform([state])
            features = self.featurizer.transform(scaled)[0]
        else:
            features = state
        if self.use_bias:
            # np.concatenate takes ONE sequence of arrays. The featurized branch
            # used to pass the features as a second positional argument (the
            # axis), which raised instead of prepending the bias term.
            return np.concatenate(([1], features))
        return features
    
    def create_model(self):
        """
        Build the (optional) featurizer, the network, the loss and the optimizer.

        Populates ``self.model`` with the keys "output" (the ``DQN`` network),
        "loss" (``MSELoss``), "optimizer" (Adam) and "optimizers" (an empty
        list).
        """
        input_space = self.state_space    
        # featurizing code taken from https://github.com/dennybritz/reinforcement-learning/tree/master/FA
        # Used to convert a state to a featurized representation.
        # Use RBF kernels with different variances to cover different parts of the space
        if self.featurize:
            n_components = 100
            rbf_gammas = (5.0, 2.0, 1.0, 0.5)
            # One block of n_components features per kernel width (4 * 100 = 400).
            input_space = n_components * len(rbf_gammas)
            observation_examples = np.array([self.env.observation_space.sample() for _ in range(10000)])
            self.scaler = sklearn.preprocessing.StandardScaler()
            self.scaler.fit(observation_examples)

            self.featurizer = sklearn.pipeline.FeatureUnion([
                    ("rbf{}".format(idx), RBFSampler(gamma=rbf_gamma, n_components=n_components))
                    for idx, rbf_gamma in enumerate(rbf_gammas, start=1)
                    ])
            self.featurizer.fit(observation_examples)
        if self.use_bias:
            input_space += 1
        # Kept for backward compatibility; nothing in this class reads it.
        self.model["optimizers"] = []
        self.model["output"] = DQN(input_size=input_space,output_size=self.action_space)
        
        self.model["loss"] = torch.nn.MSELoss()
        self.model["optimizer"] = torch.optim.Adam(params=self.model["output"].parameters(),
                                                   lr=self.learning_rate,weight_decay=0.0001)

        print("Model Created!")
    
    def _q_values(self, obs):
        """
        Run the network on a single raw observation.

        The observation is featurized, copied into a float32 tensor and given
        a leading batch dimension of 1. The copy means a later in-place change
        to the observation buffer cannot affect tensors saved for backward.

        Returns:
            1-D tensor with one entry per action (assumes the network returns
            a (1, n_actions) tensor for a batch of one).
        """
        features = np.asarray(self.featurize_state(obs), dtype=np.float32)
        batch = torch.tensor(features).unsqueeze(0)
        return self.model["output"](batch).squeeze(0)
    
    def _reset_env(self):
        """Reset the environment and return only the initial observation."""
        result = self.env.reset()
        # Newer gym releases return (observation, info); older ones return the
        # bare observation.
        if isinstance(result, tuple):
            return result[0]
        return result
    
    def _step_env(self, action):
        """
        Step the environment and normalise the result to (next_obs, reward, done).

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` return conventions.
        """
        result = self.env.step(action)
        if len(result) == 5:
            next_obs, reward, terminated, truncated, _ = result
            return next_obs, reward, bool(terminated or truncated)
        next_obs, reward, done, _ = result
        return next_obs, reward, bool(done)
    
    def epsilon_greedy(self):
        """
        Return an exploratory policy ``act(obs) -> (action, qvals)``.

        With probability ``self.epsilon`` (read at call time) a uniformly
        random action is chosen, otherwise the arg-max action. ``qvals`` is
        the 1-D tensor of network outputs and keeps its autograd graph so the
        caller can backpropagate through ``qvals[action]``.
        """
        def act(obs):
            qvals = self._q_values(obs)
            if np.random.random() < self.epsilon:
                return int(np.random.choice(self.action_space)) , qvals
            # torch.argmax works on tensors that require grad; np.argmax does not.
            return int(torch.argmax(qvals)) , qvals
        return act
    
    def greedy(self):
        """
        Return a deterministic policy ``act(obs) -> (action, qvals)``.

        The network is evaluated once in eval mode and without gradient
        tracking; its previous train/eval mode is restored afterwards.
        """
        def act(obs):
            net = self.model["output"]
            was_training = net.training
            net.eval()
            try:
                with torch.no_grad():
                    qvals = self._q_values(obs)
            finally:
                net.train(was_training)
            return int(torch.argmax(qvals)) , qvals
        return act
    
    def train(self,episodes=200,early_stop=False,stop_criteria=20,max_steps=10000):
        """
        Train the network with one-step Q-learning updates.

        Args:
            episodes: number of episodes to run.
            early_stop: when true, stop once the running average reward has
                dropped ``stop_criteria`` times.
            stop_criteria: number of drops in the 25-episode running average
                (counted over the whole run, not necessarily consecutive)
                that triggers early stopping.
            max_steps: upper bound on the number of steps per episode.

        Returns:
            ``self.stats``, with one entry per episode under "rewards",
            "episodes" and "num_steps" (index of the last step taken).
        """
        prev_avg = -float('inf')
        orig_epsilon = self.epsilon
        bar = tqdm(np.arange(episodes),file=sys.stdout)
        policy = self.epsilon_greedy()
        criteria = 0 #stopping condition
        loss_func = self.model["loss"]
        optimizer = self.model["optimizer"]
        for i in bar:
            # Use the environment owned by the agent, not a notebook-level global.
            observation = self._reset_env()
            # Exponential schedule relative to the starting value. Multiplying
            # the running epsilon by decay**i compounded the decay and drove
            # exploration to zero within a few episodes.
            self.epsilon = orig_epsilon * (self.decay**i)
            rewards = 0
            end = 0
            for t in range(max_steps):
                action , qvals = policy(observation)
                next_obs, reward, done = self._step_env(action)
                rewards += reward
                
                # Q-learning target: bootstrap from the best next action, and
                # do not bootstrap at all from a terminal transition.
                with torch.no_grad():
                    target = torch.tensor(float(reward))
                    if not done:
                        target = target + self.gamma*self._q_values(next_obs).max()
                
                # Adjust weights & reset gradients
                optimizer.zero_grad()
                loss = loss_func(qvals[action],target)
                loss.backward()
                optimizer.step()
                end = t
                if done:
                    break
                observation = next_obs
                
            self.stats["num_steps"].append(end)
            self.stats["episodes"].append(i)
            self.stats["rewards"].append(rewards)
            avg = np.mean(self.stats["rewards"][-25:])
            bar.set_description("Epsilon and reward {} : {}".format(self.epsilon,avg))
            
            if avg < prev_avg:
                criteria += 1
                
            if early_stop:
                if criteria >= stop_criteria:
                    break
                    
            prev_avg = avg
        return self.stats 