In [4]:
import math
import random
import sys
 
sys.path.append('../')

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn

from utilities.buffer import ReplayMemory,Transition


In [None]:
import random
import sys
 
sys.path.append('../')

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.parameter import Parameter

import cvxpy as cp
from cvxpylayers.torch import CvxpyLayer

from utilities.matrix_square_root import sqrtm

def QP_layer_no_eq(nz, nineq_u, nineq_x):
    """Builds the QP layer without equality constraints
    The optimization problem is of the form
        \hat z,\hat e  =   argmin_z z^T*Q*z + p^T*z + e^T*E*e
                subject to G1*z <= h1
                            G2*z <= h2+e
    """
    Q_sqrt = cp.Parameter((nz, nz))
    p = cp.Parameter(nz)
    G1 = cp.Parameter((nineq_u, nz))
    h1 = cp.Parameter(nineq_u)
    G2 = cp.Parameter((nineq_x, nz))
    h2 = cp.Parameter(nineq_x)
    E_sqrt = cp.Parameter((nineq_x, nineq_x))
    z = cp.Variable(nz)
    e = cp.Variable(nineq_x)
    zero =cp.Parameter(nineq_x) 
    
    obj = cp.Minimize(cp.sum_squares(Q_sqrt*z) + p.T@z +
                     cp.sum_squares(E_sqrt*e))
    cons = [ G1@z <= h1,G2@z <= h2+e , e >= zero ]# , e >= 0
    prob = cp. Problem(obj, cons)
    assert prob.is_dpp()

    layer = CvxpyLayer (prob, 
                        parameters =[Q_sqrt, p, G1, h1, G2,
                                     h2, E_sqrt,zero], 
                        variables =[ z, e ])
    return layer

def QP_layer_no_eq_e(nz, nineq_u, nineq_x):
    """Builds the QP layer without equality constraints
    The optimization problem is of the form
        \hat z,\hat e  =   argmin_z z^T*Q*z + p^T*z + e^T*E*e
                subject to G1*z <= h1
                            G2*z <= h2+e
    """
    Q_sqrt = cp.Parameter((nz, nz))
    p = cp.Parameter(nz)
    G1 = cp.Parameter((nineq_u, nz))
    h1 = cp.Parameter(nineq_u)
    G2 = cp.Parameter((nineq_x, nz))
    h2 = cp.Parameter(nineq_x)
    E_sqrt = cp.Parameter((nineq_x, nineq_x))
    z = cp.Variable(nz)
    e = cp.Variable(nineq_x)
    obj = cp.Minimize(cp.sum_squares(Q_sqrt*z) + p.T@z +
                     cp.sum_squares(E_sqrt*e))
    cons = [ G1@z <= h1,G2@z <= h2+e ]# , e >= 0
    prob = cp. Problem(obj, cons)
    assert prob.is_dpp()

    layer = CvxpyLayer (prob, 
                        parameters =[Q_sqrt, p, G1, h1, G2,
                                     h2, E_sqrt], 
                        variables =[ z, e ])
    return layer

def QP_layer(nz, nineq_u, nineq_x, neq):
    """Builds the QP layer with MPC soft constraints.

    The optimization problem is of the form
        \hat z,\hat e  =   argmin_z z^T*Q*z + p^T*z + e^T*E*e
                subject to G1*z <= h1
                           G2*z <= h2+e
                           A*z = b
                
    where Q \in S^{nz,nz},
        S^{nz,nz} is the set of all positive semi-definite matrices,
        p \in R^{nz}
        G1 \in R^{nineq_u,nz}
        h1 \in R^{nineq_u}
        G2 \in R^{nineq_x,nz}
        h2 \in R^{nineq_x}
        A \in R^{neq,nz}
        b \in R^{neq}
        E \in S^{ne,ne}, where ne = nineq_x
    
    Take the matrix square-root of Q：mentioned in paper P19
    (Differentiable Convex Optimization Layers).
    """
    Q_sqrt = cp.Parameter((nz, nz))
    p = cp.Parameter(nz)
    G1 = cp.Parameter((nineq_u, nz))
    h1 = cp.Parameter(nineq_u)
    G2 = cp.Parameter((nineq_x, nz))
    h2 = cp.Parameter(nineq_x)
    A = cp.Parameter ((neq,nz))
    b = cp.Parameter (neq)
    E_sqrt = cp.Parameter((nineq_x, nineq_x))
    z = cp.Variable(nz)
    e = cp.Variable(nineq_x)
    zero =cp.Parameter(nineq_x)
    
    obj = cp.Minimize(cp.sum_squares(Q_sqrt*z) + p.T@z +
                     cp.sum_squares(E_sqrt*e))
    cons = [ G1@z <= h1,G2@z <= h2+e, A@z == b, e >= zero ]#, e >= 0
    prob = cp. Problem(obj, cons)
    assert prob.is_dpp()

    layer = CvxpyLayer (prob, 
                        parameters =[Q_sqrt, p, G1, h1, G2,
                                     h2, E_sqrt, A, b,zero], 
                        variables =[ z, e ])
    return layer
def QP_layer_e(nz, nineq_u, nineq_x, neq):
    """Builds the QP layer with MPC soft constraints.

    The optimization problem is of the form
        \hat z,\hat e  =   argmin_z z^T*Q*z + p^T*z + e^T*E*e
                subject to G1*z <= h1
                           G2*z <= h2+e
                           A*z = b
                
    where Q \in S^{nz,nz},
        S^{nz,nz} is the set of all positive semi-definite matrices,
        p \in R^{nz}
        G1 \in R^{nineq_u,nz}
        h1 \in R^{nineq_u}
        G2 \in R^{nineq_x,nz}
        h2 \in R^{nineq_x}
        A \in R^{neq,nz}
        b \in R^{neq}
        E \in S^{ne,ne}, where ne = nineq_x
    
    Take the matrix square-root of Q：mentioned in paper P19
    (Differentiable Convex Optimization Layers).
    """
    Q_sqrt = cp.Parameter((nz, nz))
    p = cp.Parameter(nz)
    G1 = cp.Parameter((nineq_u, nz))
    h1 = cp.Parameter(nineq_u)
    G2 = cp.Parameter((nineq_x, nz))
    h2 = cp.Parameter(nineq_x)
    A = cp.Parameter ((neq,nz))
    b = cp.Parameter (neq)
    E_sqrt = cp.Parameter((nineq_x, nineq_x))
    z = cp.Variable(nz)
    e = cp.Variable(nineq_x)
    obj = cp.Minimize(cp.sum_squares(Q_sqrt*z) + p.T@z +
                     cp.sum_squares(E_sqrt*e))
    cons = [ G1@z <= h1,G2@z <= h2+e, A@z == b ]#, e >= 0
    prob = cp. Problem(obj, cons)
    assert prob.is_dpp()

    layer = CvxpyLayer (prob, 
                        parameters =[Q_sqrt, p, G1, h1, G2,
                                     h2, E_sqrt, A, b], 
                        variables =[ z, e ])
    return layer

In [None]:
class Cvx_Nets(nn.Module):
    """Builds the nets for Q function in Q-learning.
    The struture is (x0,u0)-QP-[cost,u].
    In addition, if input only contains x0, the nets represents
    the value funnction under current policy, which is:
        max(u0)Q(x0,u0)
    """
    
    def __init__(self, num_input, num_output, num_u=5, cuda=True,collect=False):

        """Initiates the nets."""
        super().__init__()
        self.num_input = num_input  # Dimension: x0
        self.num_output = num_output  # Dimension: u0
        self.num_u = num_u  # Dimension: u0,u1,...,uN
        self.cuda = cuda
        self.collect = collect
        
        # gets the number of the finite steps in MPC
        self.N = int(self.num_u/self.num_output)
        self.num_ineq_u = 2*num_u
        self.num_ineq_x = 2*num_input*self.N
        
        # For Q(x0,u0),defines the QP layer used
        self.layer = QP_layer(nz=self.num_u, nineq_u=
                        self.num_ineq_u, nineq_x=self.num_ineq_x,
                        neq = num_output)
        
        # defines parameters in the QP layer
        self.Q_sqrt = Parameter(torch.rand(num_input, num_input),requires_grad=True)
        self.R_sqrt = Parameter(torch.rand(num_output,num_output),requires_grad=True)
        
        self.A = Parameter(torch.rand(num_input, num_input),requires_grad=True)
        self.B = Parameter(torch.rand(num_input, num_output),requires_grad=True)

        self.h1 = Parameter(0.5*torch.ones(self.num_ineq_u),requires_grad=False)
        self.h21 = Parameter(4*torch.ones(num_input*self.N),requires_grad=False)
        self.h22 = Parameter(4*torch.ones(num_input*self.N),requires_grad=False)
        
        self.E_sqrt = Parameter(torch.eye(self.num_ineq_x),requires_grad=False)

        if collect==True:
            self.Q_sqrt = Parameter(torch.eye(num_input),requires_grad=True)
            self.R_sqrt = Parameter(torch.eye(num_output),requires_grad=True)
            self.A = Parameter(torch.Tensor([[1.0,1.0],[0,1.0]]),requires_grad=True)
            self.B = Parameter(torch.Tensor([[0.5],[1.0]]),requires_grad=True)
            self.h1 = Parameter(0.5*torch.ones(self.num_ineq_u),requires_grad=False)
            self.h21 = Parameter(4*torch.ones(num_input*self.N),requires_grad=False)
            self.h22 = Parameter(4*torch.ones(num_input*self.N),requires_grad=False)
                               
        weight = torch.zeros(num_u)
        weight[0] = 1.0
        self.weight = Parameter(weight,requires_grad=False)
        
        self.F = Parameter(torch.zeros(1,self.num_u),requires_grad=False)
        self.F[0,0] = 1.0
        self.f = Parameter(torch.tensor([1.0]),requires_grad=False)
        
        self.zero = Parameter(torch.zeros(self.num_ineq_x),requires_grad=False)

    def forward(self, x, u0=torch.Tensor()):
        """Builds the forward strucre of the QPNet.
        Sequence: x0-QP-[cost,u].
        QP parameters: Q_sqrt, p, G, h
        """
        x =self.vari_gpu(x)
        u0 =self.vari_gpu(u0)
        
        # input x0 and batch size         
        num_batch = x.size(0)
        x0 = x.view(num_batch, -1)
        
        A_hat = self.build_A_block()
        B_hat = self.build_B_block()
        
        # Q_sqrt in QP
        Q = self.Q_sqrt.mm(self.Q_sqrt.t())
        R = self.R_sqrt.mm(self.R_sqrt.t())
        R_diag = self.build_Rdiagnol_block(R)
        Q_hat, Q_diag = self.build_Q_block(Q, Q, R, B_hat)
        Q_sqrt_hat = sqrtm(Q_hat)  # computs sqrt of Q
        Q_sqrt_hat = Q_sqrt_hat.repeat(num_batch,1,1)  # builds batch
                
        # p in QP  p = 2 * (Q_diag*B_hat)^T * (A_hat*x0)
        A_x0 = A_hat.mm(x0.t()).t()  # presents[x1;x2;...;xN] size: batch * dim(x1;x2;...;xN)
        p = 2*A_x0.mm(Q_diag.mm(B_hat))
        
        # G in QP
        G1,G2 = self.build_G_block(B_hat)
        G1 = G1.repeat(num_batch,1,1)  # builds batch
        G2 = G2.repeat(num_batch,1,1)  # builds batch
        
        # h in QP
        h1 = self.h1.repeat(num_batch,1)  # builds batch
        h21 = self.h21.repeat(num_batch,1)  # builds batch
        h21 -= A_x0 
        h22 = self.h22.repeat(num_batch,1)  # builds batch
        h22 += A_x0
        h2 = torch.cat((h21,h22),1)
        
        zero = self.zero.repeat(num_batch,1)
        
        # E in QP
        E = self.E_sqrt.mm(self.E_sqrt.t())
        E_sqrt = self.E_sqrt.repeat(num_batch,1,1)
        
        # for Q(x0,u0), add equality constraint: u(0) = u0         
        if u0.nelement() != 0:
            u0 = u0.view(num_batch, -1)
            # F*z = f
            F = self.F
            f = u0*self.f
            F = F.repeat(num_batch,1,1)  # builds batch
            #f = f.repeat(num_batch,1)  # builds batch
#             print(Q_sqrt_hat.size(), p.size(), G1.size(),
#                   h1.size(), G2.size(),h2.size(),
#                   E_sqrt.size(),F.size(),f.size())

            self.para = [Q_sqrt_hat, p, G1, h1, G2,
                                h2, E_sqrt, F, f]
            u_opt,e_opt, = self.layer(Q_sqrt_hat, p, G1, h1, G2,
                                h2, E_sqrt, F, f,zero)  # u_opt: batch*dim(u)
        # for V(x0), defines the QP layer without equality 
        # constraints        
        else:
            layer = QP_layer_no_eq(nz=self.num_u, nineq_u=
                        self.num_ineq_u, nineq_x=self.num_ineq_x)
            self.para = [Q_sqrt_hat, p, G1, h1, G2,
                                h2, E_sqrt]            
            # gets the solution of the basic optimization problem
            u_opt,e_opt, = layer(Q_sqrt_hat, p, G1, h1, G2, h2, 
                                E_sqrt,zero)  # u_opt: batch*dim(u)

        # get the optimal cost
        # a+b: sum(i:1 to N): xi^T*Q*xi + u(i-1)^T*R*u(i-1)
        # c: x0^T*Q*x0
        # d:(i:1 to N):ei^T*E*ei
        a = (u_opt.mm(Q_hat)*u_opt + p*u_opt).sum(1)
        b = (A_x0.mm(Q_diag)*A_x0).sum(1)
        c = (x0.mm(Q)*x0).sum(1)
        d = (e_opt.mm(E)*e_opt).sum(1)
        cost_opt = (a+b+c+d).unsqueeze(1)  # size: batch*1
        u0_opt = u_opt.mv(self.weight)  # only the fisrt action
        #print(u0,u0_opt)
        return cost_opt, u0_opt
    
    def build_A_block(self):
        """
        [A]
        [A^2] 
        [A^3]
        [...]
        """
        N = self.N  # number of MPC steps
        A = self.A
        
        row_list = [A]  # reocrd the every row in B_hat
        
        for i in range(1, N):
            A = A.mm(self.A)
            row_list.append(A)
        return torch.cat(row_list,0)
    
    def build_B_block(self):
        """In MPC, express x vector in u vector and compute the new big B_hat matrix
        [B 0 0 ...
        [AB B 0
        ...
        """

        N = self.N  # number of MPC steps
        row_list = []  # reocrd the every row in B_hat
        
        first_block = self.B
        zero = Variable(torch.zeros(self.num_input, self.num_output*(N-1)))
        zero = self.vari_gpu(zero)
        row= torch.cat([first_block, zero],1)
        row_list.append(row)
        
        for i in range(1, N):
            first_block = self.A.mm(first_block)
            row = torch.cat([first_block, row[:,:self.num_output*(N-1)]],1)
            row_list.append(row)  
            
        return torch.cat(row_list,0)
        
        
    def build_Qdiagnol_block(self, Q, P):
        """ (num_imput*N) x (num_imput*N)
        The last block is P for x(N)"""
        
        N = self.N  # number of MPC steps
        num_input = self.num_input
        
        row_list = []  # reocrd the every row in B_hat
        zero = Variable(torch.zeros(num_input, num_input*(N-1)))
        zero = self.vari_gpu(zero)
        row_long = torch.cat([zero, Q, zero],1)  # [0 0 ... Q 0 0 ...]
        for i in range(N, 1, -1):
            row_list.append(row_long[:, (i-1)*num_input : (i+N-1)*num_input])
            
        row = torch.cat([zero, P],1)  # last line by [0 P]
        row_list.append(row)
        
        return torch.cat(row_list,0)
    
    def build_Rdiagnol_block(self, R):
        """
        [R 0 0 ...
        [0 R 0
        ...
        """
        N = self.N  # number of MPC steps
        num_output = self.num_output
        
        row_list = []  # reocrd the every row in B_hat
        zero = Variable(torch.zeros(num_output, num_output*(N-1)))
        zero = self.vari_gpu(zero)
        row_long = torch.cat([zero, R, zero],1)  # [0 0 ... Q 0 0 ...]
        
        for i in range(N, 0, -1):
            row_list.append(row_long[:, (i-1)*num_output : (i+N-1)*num_output])
        return torch.cat(row_list,0)
        
    def build_Q_block(self, Q, P, R, B_hat):
        """Build the Q_hat matrix so that MPC is tranfered into basic optimization problem
        Q_hat = B_hat^T * diag(Q) * B_hat + diag(R)
        """
        Q_diag = self.build_Qdiagnol_block(Q,P)
        R_diag = self.build_Rdiagnol_block(R)
        Q_hat = B_hat.t().mm(Q_diag.mm(B_hat)) + R_diag
        return Q_hat,Q_diag 
        
        
    def build_G_block(self,B_hat):
        """Build the G matrix so that MPC is tranfered into basic optimization problem
        G1 = [eye(num_u)]
             [-eye(num_u)]
        G2 = [   B_hat  ]
             [  -B_hat  ]
        """
        
        eye = Variable(torch.eye(self.num_u))
        eye = self.vari_gpu(eye)
        G1 = torch.cat((eye, -eye), 0)
        G2 = torch.cat((B_hat, -B_hat), 0)
        # print(self.B_hat)
        # print(G.size())
        return G1,G2
    
    def vari_gpu(self, var):
        if self.cuda:
            var = var.cuda()
            
        return var

    def act(self, state, epsilon, env):
        """The action excuted by epsilon-greedy exploration
        """
        if random.random() > epsilon:
            state   = Variable(torch.FloatTensor(state)).unsqueeze(0) # adds extra dim when single input
            state = self.vari_gpu(state)
            _, u_opt = self.forward(state)
            action = (u_opt.cpu().detach().numpy())  # compute the u*[0] 
            #print('act:q_value ',q_value)
            #print('act:model action ',action)
        else:
            rand = np.random.rand(int(np.array(env.action_space.shape)))
            high = env.action_space.high
            low = env.action_space.low
            action = low + rand*(high-low)
            #print('act: ',action)
        return action


In [267]:
class Q_Learning(object):
    """Q-learning trainning. 
    """
    def __init__(self, config, net):
        self.config = config
        
        # define current model and target model
        num_input = config.hyperparameters["num_input"]
        num_output = config.hyperparameters["num_output"]
        num_u = config.hyperparameters["num_u"]
        self.cuda = config.use_GPU
        device = torch.device("cuda:0" if self.cuda else "cpu")
        self.current_net = net(num_input=num_input, num_output=
                           num_output, num_u=num_u, cuda=self.cuda)
        self.current_net = self.current_net.to(device)
        self.target_net = net(num_input=2, num_output=1, num_u=5,cuda=self.cuda).to(device)
        self.target_net = self.target_net.to(device)
        self.target_net.load_state_dict(self.current_net.state_dict())
        self.target_net.eval()
        
        # environments and buffer
        buffer_size = config.hyperparameters["buffer_size"]
        self.env = config.environment
        self.memory = ReplayMemory(buffer_size)
        self.batch_size = config.hyperparameters["batch_size"]
        
        # variables for training
        self.num_episodes = config.num_episodes_to_run
        self.target_update = config.hyperparameters["target_update"]
        self.gamma = config.hyperparameters["gamma"]
        self.loss_fun = torch.nn.MSELoss()  # Initializes the loss function
        learning_rate = 1e-3
        self.optimizer = torch.optim.Adam(self.current_net.parameters(),
                                          lr=learning_rate)

        self.losses = []
        self.trac_time = []
        self.trac_reward = []
        self.step_total = 0  # records total steps for all trajectories
        self.TD_error = [] # records target difference error       
        
    def run(self):
        """Run the agent"""
        for epoch in range(self.num_episodes):
            self.run_agent_in_one_episode(epoch)
        self.env.close() 
            
    def run_agent_in_one_episode(self, epoch):
        state = self.env.reset()
        done = False  # records whether one trajectory is finished
        t_traj = 0  # records the number of steps in one trajectory
        r_traj = 0  # records the total rewards in one trajectory

        while not done:  # Second loop: within one tractory

#             position = str(env.state[0].round(decimals=2))
#             velocity = str(env.state[1].round(decimals=2))
            text = 'trajectory: '+str(epoch+1)
            self.env.render(text)  # visualization of the cart position 
            epsilon = greedy_epsilon(self.step_total)
            action = self.current_net.act(state, epsilon, self.env)
            if action[0]>0.5 or action[0]<-0.5:  # if infeasible, start a new trajectory
                continue

            next_state, reward, done, _ = self.env.step(action)

            # if done, the cart may go out of contraints, we don't 
            # want this fake reward to be saved.
            if done:
                break

            # save the step in the memory
            # transfer into type tensor
            state_      = self.vari_gpu(torch.FloatTensor(state)).unsqueeze(0)
            next_state_ = self.vari_gpu(torch.FloatTensor(next_state)).unsqueeze(0)
            action_     = self.vari_gpu(torch.FloatTensor(action)).unsqueeze(0)
            reward_     = self.vari_gpu(torch.FloatTensor([reward])).unsqueeze(0)
            done_       = self.vari_gpu(torch.FloatTensor([done])).unsqueeze(0)

            self.memory.push(state_, action_, next_state_, reward_, done_)
            state = next_state
            t_traj += 1
            r_traj += reward


            # Third loop: train the model
            self.step_total += 1
            loss = 0.0
            if self.step_total>self.batch_size:
                for k in range(self.config.num_taining_step_every_trajectory_step):
                    loss += self.train()
                    self.losses.append(loss/(k+1))

            # update the target network
            if self.step_total % self.target_update == 0:  
                self.target_net.load_state_dict(self.current_net.state_dict())

            if (self.step_total)%100 == 0 and self.step_total>128:
                print('[step_total: %d] training loss: %.3f' %
                        (self.step_total, 
                         self.losses[self.step_total-self.batch_size-1]))

            # compute the target difference error
            try:
                q_value, _ = self.current_net(state_,action_)  # Q(x0,u0)
            except:
                print(state,action)
                print(self.current_net.para)
                q_value, _ = self.current_net(state_,action_)

            v_value, _ = self.target_net(next_state_)  # V(x1)
            q_value = q_value.data[0,0].cpu().numpy()
            v_value = v_value.data[0,0].cpu().numpy()
            self.TD_error.append(reward + self.gamma * v_value * (1-done) - q_value)

        #  record data of this trajectory in lists
        self.trac_time.append(t_traj)
        self.trac_reward.append(r_traj)

       
    def train(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        state = torch.cat(batch.state_)
        action = torch.cat(batch.action_)
        next_state = torch.cat(batch.next_state_)
        reward = torch.cat(batch.reward_)
        done = torch.cat(batch.done_)
        
        global Q_,p_,G1_,h1_,G2_,h2_,E_,F_,f_
        try:
            q_value, _ = self.current_net(state,action)  # Q(x0,u0)
        except:
#             print(state,action)
#             print(self.current_net.para)
            [Q_,p_,G1_,h1_,G2_,h2_,E_,F_,f_] = self.current_net.para
            layer = QP_layer_e(5, 10, 20,1)
            # solution
            u,e, = layer ( Q_,p_,G1_,h1_,G2_,h2_,E_,F_,f_ )            
            print(1,u,e)
        try:
            v_value, _ = self.target_net(next_state)  # V(x1)
        except:
#             print(state,action)
#             print(self.target_net.para)
            [Q_,p_,G1_,h1_,G2_,h2_,E_] = self.target_net.para
            layer = QP_layer_no_eq_e(5, 10, 20)
            # solution
            u,e, = layer ( Q_,p_,G1_,h1_,G2_,h2_,E_ )
            print(2,u,e)
        # compute the expected Q values
        expected_q_value = reward + self.gamma * v_value * (1-done)

        # compute loss
        loss = self.loss_fun(expected_q_value, q_value) 

        # optimize the model  
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.data
    
    def vari_gpu(self, var):
        if self.cuda:
            var = var.cuda()
        return var

# Epsilon-greedy exploration
# The epsilon decreases exponetially as time goes by.
def greedy_epsilon(step_total):
    epsilon_start = 0.8
    epsilon_final = 0.01
    epsilon_decay = 500
    epsilon = epsilon_final + (epsilon_start-epsilon_final) * math.exp(-1.*step_total/epsilon_decay)
    return epsilon    

In [298]:
from utilities.config import Config
from environments.cart import MyEnv

config = Config()
config.seed = 1
config.environment = MyEnv()
config.num_episodes_to_run = 100
config.num_taining_step_every_trajectory_step = 3
config.visualise_results = False
config.file_to_save_data_results = None
config.file_to_save_results_graph = None
config.use_GPU = False
config.save_model = False


config.hyperparameters = {
            "num_input": 2,
            "num_output": 1,
            "num_u": 5,
            "learning_rate": 1e-4,
            "gamma": 1.0,  # discount_rate
            "target_update": 10,
            "batch_size": 20,
            "buffer_size": 10000,
}

In [300]:
if __name__ == "__main__":
    net = Cvx_Nets
    trainer = Q_Learning(config, net)
    trainer.run()

[step_total: 200] training loss: 63.892
[step_total: 300] training loss: 44.928




[step_total: 400] training loss: 639.745
[step_total: 500] training loss: 471.927
[step_total: 600] training loss: 1283.698
[step_total: 700] training loss: 1228.829
[step_total: 800] training loss: 194.959


In [282]:
[Q1,p1,G11,h11,G21,h21,E1]=[Q_.data,p_.data,
                            G1_.data,h1_.data,
                            G2_.data,h2_.data,E_.data ]
layer = QP_layer_no_eq_e(5, 10, 20)

# solution
u,e, = layer ( Q1,p1,G11,h11,G21,h21,E1)

In [263]:
e

tensor([[4.2431e-05, 1.7225e-02, 1.8794e+00, 6.9909e-01, 4.4908e+00, 5.2681e+00,
         1.1442e+01, 1.0766e+01, 2.1602e+01, 2.1657e+01, 6.0471e-05, 1.2034e-04,
         8.1094e-05, 1.0586e-04, 3.5383e-05, 1.2739e-04, 3.2468e-05, 7.1357e-05,
         2.3103e-05, 6.0185e-05]])

In [286]:
[Q1,p1,G11,h11,G21,h21,E1,F1,f1]=[Q_.data,p_.data,
                            G1_.data,h1_.data,
                            G2_.data,h2_.data,E_.data,
                            F_.data,f_.data ]
layer = QP_layer_e(5, 10, 20,1)

# solution
u,e, = layer ( Q1,p1,G11,h11,G21,h21,E1,F1,f1)

In [243]:
e

tensor([[8.1410e-06, 4.8009e-06, 3.0856e-05, 2.2860e-05, 7.1703e-06, 2.3710e-05,
         3.6047e-05, 3.2273e-05, 1.9094e-04, 4.6959e-04, 7.8976e-06, 7.7903e-06,
         3.9079e+00, 4.2183e+00, 1.3968e+01, 1.4579e+01, 3.6829e+01, 3.8065e+01,
         8.8707e+01, 9.1371e+01]])

In [179]:
[Q1,p1,G11,h11,G21,h21,E1]

[tensor([[[7.5858, 3.4945, 1.9851, 1.1377, 0.6381],
          [3.4945, 3.4133, 1.1316, 0.6430, 0.3602],
          [1.9851, 1.1316, 2.0752, 0.3715, 0.2027],
          [1.1377, 0.6430, 0.3715, 1.6434, 0.1214],
          [0.6381, 0.3602, 0.2027, 0.1214, 1.4985]]]),
 tensor([[-1955.6559, -1110.2148,  -631.8699,  -361.8481,  -202.6708]]),
 tensor([[[ 1.,  0.,  0.,  0.,  0.],
          [ 0.,  1.,  0.,  0.,  0.],
          [ 0.,  0.,  1.,  0.,  0.],
          [ 0.,  0.,  0.,  1.,  0.],
          [ 0.,  0.,  0.,  0.,  1.],
          [-1., -0., -0., -0., -0.],
          [-0., -1., -0., -0., -0.],
          [-0., -0., -1., -0., -0.],
          [-0., -0., -0., -1., -0.],
          [-0., -0., -0., -0., -1.]]]),
 tensor([[0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
          0.5000]]),
 tensor([[[-0.1074,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.6431,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.4384, -0.1074,  0.0000,  0.0000,  0.0000],
          [ 0.355