In [3]:
import gym
import popgym
import numpy as np
from popgym.envs.stateless_cartpole import StatelessCartPole
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random
from tqdm import trange


  np.bool8: (False, True),
  np.object,


AttributeError: module 'numpy' has no attribute 'object'

In [68]:
import tensorflow as tf
import numpy as np
import tqdm

In [69]:
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 1


In [70]:
GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 1

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.9

In [71]:
state_space=4
obs_space=2
action_space=2
intermediate_dim=16

In [72]:
env=StatelessCartPole()


In [73]:
print(env.action_space)
print(env.observation_space.shape[0])

Discrete(2)
2


In [74]:
class DQNSolver:

    def __init__(self, state_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(
            state_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        # if (len(self.memory) > 0):
        #     self.memory.pop()
        self.memory.append((state, action, reward, next_state, done))
    
    def forget(self):
        self.memory.clear()

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA *
                            np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

In [75]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self,state_space, intermediate_dim):
    super(Encoder, self).__init__()
    self.hidden_layer1=tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.hidden_layer2 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.output_layer = tf.keras.layers.Dense(
      units=state_space,
      activation=tf.keras.activations.linear
    )
    
  def call(self, input_features):
    activation1 = self.hidden_layer1(input_features)
    activation2=self.hidden_layer2(activation1)
    return self.output_layer(activation2)

In [76]:
dqn_solver=DQNSolver(state_space, action_space)


In [77]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, intermediate_dim, obs_space):
    super().__init__()
    self.hidden_layer1 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.hidden_layer2 = tf.keras.layers.Dense(
      units=intermediate_dim,
      activation=tf.nn.relu,
      kernel_initializer='he_uniform'
    )
    self.output_layer = tf.keras.layers.Dense(
      units=obs_space,
      activation=tf.keras.activations.linear
    )
  
  def call(self, input_features,var):
    activation1 = self.hidden_layer1(tf.concat((input_features,var),axis=1))
    activation2=self.hidden_layer2(activation1)

    return self.output_layer(activation2)

In [78]:
class Autoencoder(tf.keras.Model):
  def __init__(self, intermediate_dim, state_space,obs_space):
    super(Autoencoder, self).__init__()
    self.encoder = Encoder(intermediate_dim=intermediate_dim,state_space=state_space)
    
    self.decoder = Decoder(
      intermediate_dim=intermediate_dim,
      obs_space=obs_space
    )
    
  
  def call(self, input_features,var):
    code = self.encoder(input_features)
    #self.var=tf.Variable(var,trainable=False)
    reconstructed = self.decoder(code,var)
    return reconstructed


opt = tf.optimizers.Adam(learning_rate=LEARNING_RATE)

In [79]:

def loss(model,input_features, obs,var):
  reconstruction_error = tf.reduce_mean(tf.square(tf.subtract(model(input_features,var), obs)))
  return reconstruction_error
  
def train(loss, model, opt,input_features, obs,var):
  with tf.GradientTape() as tape:
    gradients = tape.gradient(loss(model,input_features, obs,var), model.trainable_variables)
  gradient_variables = zip(gradients, model.trainable_variables)
  opt.apply_gradients(gradient_variables)

In [80]:
obs=env.reset()
obs

array([ 0.02992012,  0.0229295 , -0.02333126,  0.0402755 ], dtype=float32)

In [81]:
autoencoder = Autoencoder(
  intermediate_dim=intermediate_dim,
  state_space=state_space,obs_space=obs_space
)


In [82]:
def data_collection(total_episodes,dqn_solver,autoencoder):

    S_series=[]
    S_actual_series=[]
    O_series=[]
    U_series=[]
    R_series=[]
    O_predicted_series=[]
    Done_series=[]
    O_series.append(np.array([[0.0,0.0]]))
    O_predicted_series.append(np.array([[0.0,0.0]]))
    encoder=autoencoder.encoder
    decoder=autoencoder.decoder

    ep_no=0
    i=0
    while(ep_no<total_episodes):
        s0=env.reset()
        #s0=np.reshape(s0,[1,state_space])
        
        step=0
        done=False
        while not done:
            if(step==0):
                s=s0
                S_actual_series.append(s)
            else:
                s=encoder(tf.concat((S_series[i-1],O_series[i],U_series[i-1]),axis=1))
                s=np.reshape(s,[1,state_space])
                # s=s.tolist()
                # s[0][0]=np.tanh(s[0][0])*4.8
                # s[0][2]=np.tanh(s[0][2])*0.418
                # s=np.array(s)
                

            s=np.reshape(s,[1,state_space])
            S_series.append(s)
            action=dqn_solver.act(s)

            actual_state,obs, reward, done, info = env.step(int(action))
            reward=reward*200
            Done_series.append(done)
            actual_state=np.reshape(actual_state,[1,state_space])
            S_actual_series.append(actual_state)
            action=np.array([[action]])
            U_series.append(action)
            obs=np.reshape(obs,[1,obs_space])

            O_series.append(obs)
            reward = reward if not done else -reward
            R_series.append(reward)
            # print(s.shape)
            # print(action.shape)
            obs_pred=decoder(s,tf.convert_to_tensor(np.array([[dqn_solver.act(s)]]),dtype=float))
            obs_pred=np.reshape(obs_pred,[1,obs_space])
            obs_pred=obs_pred.tolist()
            # obs_pred[0][0]=np.tanh(obs_pred[0][0])*4.8
            # obs_pred[0][1]=np.tanh(obs_pred[0][1])*0.418
            obs_pred=np.array(obs_pred)
            O_predicted_series.append(obs_pred)
            i+=1
            step+=1
        ep_no+=1
    return S_series,O_series,U_series,R_series,O_predicted_series,S_actual_series,Done_series


In [83]:
def autoencoder_training(epochs,autoencoder,S_series,O_series,U_series):
    for epoch in trange(epochs):
        for i in range(2,len(S_series)-1):
            train(loss,autoencoder,opt,tf.concat((S_series[i-1],O_series[i],U_series[i-1]),axis=1),O_series[i+1],U_series[i])



In [84]:
def dqn_training(dqn_solver,epochs,S_series,U_series,R_series,Done_series):
    dqn_solver.forget()
    for i in range(len(S_series)-1):
        dqn_solver.remember(S_series[i],U_series[i][0][0],R_series[i],S_series[i+1],Done_series[i])
    for _ in trange(epochs):
        dqn_solver.experience_replay()

In [85]:
S_series,O_series,U_series,R_series,O_predicted_series,S_actual_series,Done_series=data_collection(10,dqn_solver,autoencoder)


In [86]:
rewards_storage=[]

In [87]:
def complete_loop(autoencoder,dqn_solver,epochs,EXPLORATION_MAX):
    for epoch in range(epochs):
        S_series,O_series,U_series,R_series,O_predicted_series,S_actual_series,Done_series=data_collection(100,dqn_solver,autoencoder)
        print("epoch: {}, total reward: {}".format(epoch, sum(R_series)))
        rewards_storage.append(sum(R_series))
        if(epoch%2==0):
            dqn_training(dqn_solver,20,S_series,U_series,R_series,Done_series)
            dqn_solver.exploration_rate=EXPLORATION_MAX
            EXPLORATION_MAX*=0.9
        else:
            autoencoder_training(10,autoencoder,S_series,O_series,U_series)

        
        

        


In [88]:
complete_loop(autoencoder,dqn_solver,100,EXPLORATION_MAX=1.0)

epoch: 0, total reward: 1935.0


100%|██████████| 20/20 [00:02<00:00,  8.40it/s]


epoch: 1, total reward: 1903.0


100%|██████████| 10/10 [03:13<00:00, 19.31s/it]


epoch: 2, total reward: 2167.0


100%|██████████| 20/20 [00:02<00:00,  9.29it/s]


epoch: 3, total reward: 2049.0


100%|██████████| 10/10 [03:26<00:00, 20.67s/it]


epoch: 4, total reward: 1959.0


100%|██████████| 20/20 [00:02<00:00,  9.67it/s]


epoch: 5, total reward: 1864.0


100%|██████████| 10/10 [03:08<00:00, 18.89s/it]


epoch: 6, total reward: 1697.0


100%|██████████| 20/20 [00:02<00:00,  9.75it/s]


epoch: 7, total reward: 1668.0


100%|██████████| 10/10 [02:51<00:00, 17.16s/it]


epoch: 8, total reward: 1653.0


100%|██████████| 20/20 [00:02<00:00,  9.49it/s]


epoch: 9, total reward: 1496.0


100%|██████████| 10/10 [02:36<00:00, 15.60s/it]


epoch: 10, total reward: 1526.0


100%|██████████| 20/20 [00:02<00:00,  9.20it/s]


epoch: 11, total reward: 1372.0


100%|██████████| 10/10 [02:24<00:00, 14.40s/it]


epoch: 12, total reward: 1307.0


100%|██████████| 20/20 [00:02<00:00,  9.87it/s]


epoch: 13, total reward: 1146.0


100%|██████████| 10/10 [02:03<00:00, 12.33s/it]


epoch: 14, total reward: 1175.0


100%|██████████| 20/20 [00:02<00:00,  8.74it/s]


epoch: 15, total reward: 1216.0


100%|██████████| 10/10 [02:10<00:00, 13.06s/it]


epoch: 16, total reward: 1174.0


100%|██████████| 20/20 [00:02<00:00,  9.33it/s]


epoch: 17, total reward: 1089.0


100%|██████████| 10/10 [01:58<00:00, 11.86s/it]


epoch: 18, total reward: 1008.0


100%|██████████| 20/20 [00:02<00:00,  9.15it/s]


epoch: 19, total reward: 987.0


100%|██████████| 10/10 [01:49<00:00, 10.93s/it]


epoch: 20, total reward: 1019.0


100%|██████████| 20/20 [00:02<00:00,  9.40it/s]


epoch: 21, total reward: 1019.0


100%|██████████| 10/10 [01:51<00:00, 11.17s/it]


epoch: 22, total reward: 906.0


100%|██████████| 20/20 [00:02<00:00,  9.67it/s]


epoch: 23, total reward: 940.0


100%|██████████| 10/10 [01:44<00:00, 10.46s/it]


epoch: 24, total reward: 970.0


100%|██████████| 20/20 [00:02<00:00,  8.95it/s]


epoch: 25, total reward: 896.0


100%|██████████| 10/10 [01:40<00:00, 10.06s/it]


epoch: 26, total reward: 874.0


100%|██████████| 20/20 [00:02<00:00,  9.01it/s]


epoch: 27, total reward: 867.0


100%|██████████| 10/10 [01:37<00:00,  9.77s/it]


epoch: 28, total reward: 860.0


100%|██████████| 20/20 [00:02<00:00,  8.71it/s]


epoch: 29, total reward: 853.0


100%|██████████| 10/10 [01:37<00:00,  9.72s/it]


epoch: 30, total reward: 878.0


100%|██████████| 20/20 [00:02<00:00,  9.68it/s]


epoch: 31, total reward: 837.0


100%|██████████| 10/10 [01:35<00:00,  9.51s/it]


epoch: 32, total reward: 831.0


100%|██████████| 20/20 [00:02<00:00,  9.50it/s]


epoch: 33, total reward: 845.0


100%|██████████| 10/10 [01:35<00:00,  9.58s/it]


epoch: 34, total reward: 859.0


100%|██████████| 20/20 [00:02<00:00,  9.55it/s]


epoch: 35, total reward: 828.0


100%|██████████| 10/10 [01:34<00:00,  9.41s/it]


epoch: 36, total reward: 838.0


100%|██████████| 20/20 [00:02<00:00,  9.35it/s]


epoch: 37, total reward: 822.0


100%|██████████| 10/10 [01:34<00:00,  9.40s/it]


epoch: 38, total reward: 821.0


100%|██████████| 20/20 [00:02<00:00,  9.59it/s]


epoch: 39, total reward: 800.0


100%|██████████| 10/10 [01:32<00:00,  9.21s/it]


epoch: 40, total reward: 825.0


100%|██████████| 20/20 [00:02<00:00,  8.88it/s]


epoch: 41, total reward: 862.0


100%|██████████| 10/10 [01:38<00:00,  9.82s/it]


epoch: 42, total reward: 1355.0


100%|██████████| 20/20 [00:02<00:00,  8.56it/s]


epoch: 43, total reward: 1018.0


100%|██████████| 10/10 [01:51<00:00, 11.15s/it]


epoch: 44, total reward: 997.0


100%|██████████| 20/20 [00:02<00:00,  8.81it/s]


epoch: 45, total reward: 792.0


100%|██████████| 10/10 [01:31<00:00,  9.18s/it]


epoch: 46, total reward: 797.0


100%|██████████| 20/20 [00:02<00:00,  9.81it/s]


epoch: 47, total reward: 1139.0


100%|██████████| 10/10 [02:03<00:00, 12.35s/it]


epoch: 48, total reward: 1002.0


100%|██████████| 20/20 [00:02<00:00,  9.23it/s]


epoch: 49, total reward: 1003.0


100%|██████████| 10/10 [01:54<00:00, 11.46s/it]


epoch: 50, total reward: 997.0


100%|██████████| 20/20 [00:02<00:00,  9.78it/s]


epoch: 51, total reward: 993.0


100%|██████████| 10/10 [01:50<00:00, 11.00s/it]


epoch: 52, total reward: 1007.0


100%|██████████| 20/20 [00:02<00:00,  9.38it/s]


epoch: 53, total reward: 988.0


100%|██████████| 10/10 [01:50<00:00, 11.09s/it]


epoch: 54, total reward: 995.0


100%|██████████| 20/20 [00:02<00:00,  9.41it/s]


epoch: 55, total reward: 996.0


100%|██████████| 10/10 [01:53<00:00, 11.37s/it]


epoch: 56, total reward: 970.0


100%|██████████| 20/20 [00:01<00:00, 10.28it/s]


epoch: 57, total reward: 980.0


100%|██████████| 10/10 [01:52<00:00, 11.25s/it]


epoch: 58, total reward: 996.0


100%|██████████| 20/20 [00:01<00:00, 10.81it/s]


epoch: 59, total reward: 974.0


100%|██████████| 10/10 [02:12<00:00, 13.29s/it]


epoch: 60, total reward: 992.0


100%|██████████| 20/20 [00:02<00:00,  6.82it/s]


epoch: 61, total reward: 984.0


100%|██████████| 10/10 [02:07<00:00, 12.72s/it]


epoch: 62, total reward: 979.0


100%|██████████| 20/20 [00:01<00:00, 10.62it/s]


epoch: 63, total reward: 952.0


100%|██████████| 10/10 [02:16<00:00, 13.68s/it]


epoch: 64, total reward: 951.0


100%|██████████| 20/20 [00:01<00:00, 10.44it/s]


epoch: 65, total reward: 985.0


100%|██████████| 10/10 [02:16<00:00, 13.63s/it]


epoch: 66, total reward: 962.0


100%|██████████| 20/20 [00:02<00:00,  9.68it/s]


epoch: 67, total reward: 970.0


100%|██████████| 10/10 [02:10<00:00, 13.02s/it]


epoch: 68, total reward: 966.0


100%|██████████| 20/20 [00:02<00:00,  7.02it/s]


epoch: 69, total reward: 983.0


100%|██████████| 10/10 [02:23<00:00, 14.38s/it]


epoch: 70, total reward: 961.0


100%|██████████| 20/20 [00:01<00:00, 10.08it/s]


epoch: 71, total reward: 972.0


100%|██████████| 10/10 [01:52<00:00, 11.22s/it]


epoch: 72, total reward: 941.0


100%|██████████| 20/20 [00:01<00:00, 10.09it/s]


epoch: 73, total reward: 949.0


100%|██████████| 10/10 [02:15<00:00, 13.58s/it]


epoch: 74, total reward: 993.0


100%|██████████| 20/20 [00:02<00:00,  9.50it/s]


epoch: 75, total reward: 1096.0


100%|██████████| 10/10 [02:39<00:00, 15.92s/it]


epoch: 76, total reward: 958.0


100%|██████████| 20/20 [00:01<00:00, 10.16it/s]


epoch: 77, total reward: 977.0


100%|██████████| 10/10 [01:48<00:00, 10.82s/it]


epoch: 78, total reward: 1008.0


100%|██████████| 20/20 [00:01<00:00, 10.53it/s]


epoch: 79, total reward: 774.0


100%|██████████| 10/10 [01:54<00:00, 11.47s/it]


epoch: 80, total reward: 745.0


100%|██████████| 20/20 [00:01<00:00, 10.17it/s]


epoch: 81, total reward: 748.0


100%|██████████| 10/10 [01:50<00:00, 11.02s/it]


epoch: 82, total reward: 734.0


100%|██████████| 20/20 [00:01<00:00, 10.44it/s]


epoch: 83, total reward: 747.0


100%|██████████| 10/10 [01:54<00:00, 11.44s/it]


epoch: 84, total reward: 754.0


100%|██████████| 20/20 [00:01<00:00, 11.22it/s]


epoch: 85, total reward: 751.0


100%|██████████| 10/10 [01:56<00:00, 11.66s/it]


epoch: 86, total reward: 733.0


100%|██████████| 20/20 [00:02<00:00,  7.59it/s]


epoch: 87, total reward: 736.0


100%|██████████| 10/10 [01:51<00:00, 11.18s/it]


epoch: 88, total reward: 750.0


100%|██████████| 20/20 [00:01<00:00, 10.44it/s]


epoch: 89, total reward: 745.0


100%|██████████| 10/10 [01:52<00:00, 11.28s/it]


epoch: 90, total reward: 739.0


100%|██████████| 20/20 [00:02<00:00,  9.83it/s]


epoch: 91, total reward: 737.0


100%|██████████| 10/10 [01:43<00:00, 10.37s/it]


epoch: 92, total reward: 737.0


100%|██████████| 20/20 [00:01<00:00, 10.26it/s]


epoch: 93, total reward: 738.0


100%|██████████| 10/10 [01:44<00:00, 10.49s/it]


epoch: 94, total reward: 728.0


100%|██████████| 20/20 [00:01<00:00, 10.81it/s]


epoch: 95, total reward: 742.0


100%|██████████| 10/10 [01:46<00:00, 10.64s/it]


epoch: 96, total reward: 747.0


100%|██████████| 20/20 [00:01<00:00, 10.26it/s]


epoch: 97, total reward: 730.0


100%|██████████| 10/10 [01:42<00:00, 10.29s/it]


epoch: 98, total reward: 738.0


100%|██████████| 20/20 [00:01<00:00, 10.72it/s]


epoch: 99, total reward: 740.0


100%|██████████| 10/10 [01:44<00:00, 10.45s/it]


In [89]:
S_series

[array([[-0.01548977, -0.0440152 ,  0.02847142,  0.04505366]],
       dtype=float32),
 array([[-0.08101782, -0.01720721,  0.01235009,  0.04358248]],
       dtype=float32),
 array([[-0.15734854,  0.0100327 ,  0.00835472,  0.0420404 ]],
       dtype=float32),
 array([[-0.22027431,  0.0245399 ,  0.0166254 ,  0.03287911]],
       dtype=float32),
 array([[-0.18167639, -0.09359123,  0.16739391, -0.09151803]],
       dtype=float32),
 array([[-0.07872033,  0.00668787, -0.11239037,  0.09850095]],
       dtype=float32),
 array([[-0.2599793 ,  0.01179108,  0.12721863, -0.01696027]],
       dtype=float32),
 array([[-0.15510927, -0.0676048 ,  0.04112715, -0.00236333]],
       dtype=float32),
 array([[-0.05938366,  0.00250585, -0.01052826, -0.01202909]],
       dtype=float32),
 array([[-0.03238721, -0.01581582, -0.03324349, -0.00575809]],
       dtype=float32),
 array([[-0.17942259,  0.04752523, -0.01490035,  0.019643  ]],
       dtype=float32),
 array([[-0.2948555 ,  0.00128277, -0.03440407,  0.033

In [90]:
S_actual_series

[array([-0.01548977, -0.0440152 ,  0.02847142,  0.04505366], dtype=float32),
 array([[-0.01637008,  0.15068716,  0.0293725 , -0.23851205]],
       dtype=float32),
 array([[-0.01335633,  0.34537745,  0.02460225, -0.5217872 ]],
       dtype=float32),
 array([[-0.00644878,  0.5401446 ,  0.01416651, -0.80661726]],
       dtype=float32),
 array([[ 0.00435411,  0.3448314 , -0.00196584, -0.509512  ]],
       dtype=float32),
 array([[ 0.01125074,  0.1497372 , -0.01215607, -0.21744922]],
       dtype=float32),
 array([[ 0.01424548,  0.34503078, -0.01650506, -0.51394176]],
       dtype=float32),
 array([[ 0.02114609,  0.15014513, -0.02678389, -0.22650537]],
       dtype=float32),
 array([[ 0.024149  , -0.044584  , -0.031314  ,  0.05761005]],
       dtype=float32),
 array([[ 0.02325732, -0.2392433 , -0.0301618 ,  0.34025103]],
       dtype=float32),
 array([[ 0.01847245, -0.04370546, -0.02335678,  0.03821146]],
       dtype=float32),
 array([[ 0.01759834,  0.1517435 , -0.02259255, -0.26174828]],
