In [1]:
import gym
import popgym
import numpy as np
from popgym.envs.stateless_cartpole import StatelessCartPole
from collections import deque
from tensorflow.keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


In [2]:
import tensorflow as tf
import numpy as np
import tqdm

In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 1


In [4]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self,state_space, intermediate_dim):
    super(Encoder, self).__init__()
    self.hidden_layer1=tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.hidden_layer2 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.output_layer = tf.keras.layers.Dense(
      units=state_space,
      activation=tf.keras.activations.linear
    )
    
  def call(self, input_features):
    activation1 = self.hidden_layer1(input_features)
    activation2=self.hidden_layer2(activation1)
    return self.output_layer(activation2)

In [5]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, intermediate_dim, obs_space,dqn_solver):
    super(Decoder, self).__init__()
    self.hidden_layer1 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.hidden_layer2 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.output_layer = tf.keras.layers.Dense(
      units=obs_space,
      activation=tf.keras.activations.linear
    )
    self.dqn_solver=dqn_solver
  
  def call(self, input_features):
    activation1 = self.hidden_layer1(tf.concat((input_features,np.array([[self.dqn_solver.act(input_features)]])),axis=1))
    activation2=self.hidden_layer2(activation1)

    return self.output_layer(activation2)

In [6]:
GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 1

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

In [7]:
class DQNSolver:

    def __init__(self, state_space, action_space):
        self.exploration_rate = 1.0

        self.action_space = action_space
        self.memory = deque(maxlen=1000000)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(
            state_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=0.001))

    def remember(self, state, action, reward, next_state, done):
        # if (len(self.memory) > 0):
        #     self.memory.pop()
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < 1:
            return
        batch = random.sample(self.memory, 1)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + 0.95 *
                            np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= 0.995
        self.exploration_rate = max(0.01, self.exploration_rate)

In [8]:
env=StatelessCartPole()


In [9]:
print(env.action_space)
print(env.observation_space.shape[0])

Discrete(2)
2


In [10]:
state_space=4
obs_space=2
action_space=2
intermediate_dim=16

In [11]:
class Autoencoder(tf.keras.Model):
  def __init__(self, intermediate_dim, state_space,obs_space,dqn_solver):
    super(Autoencoder, self).__init__()
    self.encoder = Encoder(intermediate_dim=intermediate_dim,state_space=state_space)
    self.dqn_solver=dqn_solver
    self.decoder = Decoder(
      intermediate_dim=intermediate_dim,
      obs_space=obs_space,dqn_solver=self.dqn_solver
    )
    
  
  def call(self, input_features):
    code = self.encoder(input_features)
    reconstructed = self.decoder(code)
    return reconstructed


opt = tf.optimizers.Adam(learning_rate=LEARNING_RATE)

In [12]:
# class main_model(tf.keras.Model):
#   def __init__(self, intermediate_dim, state_space,obs_space,dqn_solver):
#     super(main_model, self).__init__()
#     self.encoder = Encoder(intermediate_dim=intermediate_dim,state_space=state_space)
#     self.decoder = Decoder(
#       intermediate_dim=intermediate_dim,
#       obs_space=obs_space
#     )
  
#   def call(self, input_features,action):
#     code = self.encoder(input_features)
    
#     reconstructed = self.decoder(code,action)
#     return reconstructed

# autoencoder = Autoencoder(
#   intermediate_dim=intermediate_dim,
#   state_space=state_space,obs_space=obs_space
# )


In [13]:

opt = tf.optimizers.Adam(learning_rate=LEARNING_RATE)
def loss(model,input_features, obs):
  reconstruction_error = tf.reduce_mean(tf.square(tf.subtract(model(input_features), obs)))
  return reconstruction_error
  
def train(loss, model, opt,input_features, obs):
  with tf.GradientTape() as tape:
    gradients = tape.gradient(loss(model,input_features, obs), model.trainable_variables)
  gradient_variables = zip(gradients, model.trainable_variables)
  opt.apply_gradients(gradient_variables)

In [14]:
obs=env.reset()

#obs = np.reshape(obs, [1, 4])
obs

array([-0.03945559,  0.01403861, -0.00858484, -0.04901327], dtype=float32)

In [15]:

dqn_solver=DQNSolver(state_space, action_space)
# encoder = Encoder(intermediate_dim=intermediate_dim,state_space=state_space)
# decoder = Decoder(
#     intermediate_dim=intermediate_dim,
#     obs_space=obs_space,dqn_solver=dqn_solver
# )




In [16]:
autoencoder = Autoencoder(
  intermediate_dim=intermediate_dim,
  state_space=state_space,obs_space=obs_space,dqn_solver=dqn_solver
)
#autoencoder.compile(optimizer=opt,loss=tf.keras.losses.MSE)

In [33]:
#training loop
def data_collection(total_episodes,dqn_solver,autoencoder):

    S_series=[]
    S_actual_series=[]
    O_series=[]
    U_series=[]
    R_series=[]
    O_predicted_series=[]
    Done_series=[]
    encoder=autoencoder.encoder
    decoder=autoencoder.decoder
    O_series.append(np.array([[0.0,0.0]]))
    O_predicted_series.append(np.array([[0.0,0.0]]))
    ep_no=0
    i=0
    while(ep_no<total_episodes):
        s0=env.reset()
        #s0=np.reshape(s0,[1,state_space])
        
        step=0
        done=False
        while not done:
            if(step==0):
                s=s0
                S_actual_series.append(s)
            else:
                s=encoder(tf.concat((S_series[i-1],O_series[i],U_series[i-1]),axis=1))
                s=np.reshape(s,[1,state_space])
                s=s.tolist()
                s[0][0]=np.tanh(s[0][0])*4.8
                s[0][2]=np.tanh(s[0][2])*0.418
                s=np.array(s)
                

            s=np.reshape(s,[1,state_space])
            S_series.append(s)
            action=dqn_solver.act(s)

            actual_state,obs, reward, done, info = env.step(int(action))
            reward=reward*200
            Done_series.append(done)
            actual_state=np.reshape(actual_state,[1,state_space])
            S_actual_series.append(actual_state)
            action=np.array([[action]])
            U_series.append(action)
            obs=np.reshape(obs,[1,obs_space])

            O_series.append(obs)
            reward = reward if not done else -reward
            R_series.append(reward)
            # print(s.shape)
            # print(action.shape)
            obs_pred=decoder(s)
            obs_pred=np.reshape(obs_pred,[1,obs_space])
            obs_pred=obs_pred.tolist()
            obs_pred[0][0]=np.tanh(obs_pred[0][0])*4.8
            obs_pred[0][1]=np.tanh(obs_pred[0][1])*0.418
            obs_pred=np.array(obs_pred)
            O_predicted_series.append(obs_pred)
            i+=1
            step+=1
        ep_no+=1
    return S_series,O_series,U_series,R_series,O_predicted_series,S_actual_series,Done_series


In [34]:
S_series,O_series,U_series,R_series,O_predicted_series,S_actual_series,Done_series=data_collection(10,dqn_solver,autoencoder)


In [19]:
print(len(S_series))
print(len(O_series))
print(len(U_series))
print(len(Done_series))
print(len(O_predicted_series))

265
266
265
265
266


In [35]:
R_series

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -

In [20]:
def autoencoder_training(epochs):
    from tqdm import trange
    for epoch in trange(epochs):
        for i in range(2,4):
            train(loss,autoencoder,opt,np.concatenate((S_series[i-1],O_series[i],U_series[i-1]),axis=1),O_series[i+1])



In [28]:
autoencoder_training(10)

 40%|████      | 4/10 [00:00<00:00, 28.65it/s]


LookupError: No gradient defined for operation 'IteratorGetNext' (op type: IteratorGetNext)

In [22]:
def dqn_training(dqn_solver,epochs):
    for i in range(len(S_series)-1):
        dqn_solver.remember(S_series[i],U_series[i][0][0],R_series[i],S_series[i+1],Done_series[i])
    for _ in range(epochs):
        dqn_solver.experience_replay()


In [23]:
dqn_training(dqn_solver,10)