<a href="https://colab.research.google.com/github/NonMagneticNeedle/reinforcement_learning/blob/master/ppo_tf.keras/PPO_LSTM_DISCRETE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import tensorflow as tf
import numpy as np
import gym
import math
import time
#import tensorflow_probability as tfp
#display gym
# Run TensorFlow in graph (non-eager) mode.
# NOTE(review): presumably required because the actor's loss closes over the
# symbolic `advantage` / `old_prediction` Keras inputs -- confirm before removing.
tf.compat.v1.disable_eager_execution()
# Empty string literal: an expression statement with no effect.
""""""
# Report the TensorFlow version in use.
print(tf.__version__)


class ppo():
	"""Recurrent (stateful LSTM) PPO agent for discrete actions, built on tf.keras.

	Two models are created on construction:

	* ``self.actor``  -- softmax policy trained with the clipped PPO surrogate loss.
	* ``self.critic`` -- state-value estimator trained with MSE against the returns.

	Both models use a fixed batch size of 1 and a fixed sequence length of
	``time_steps``. Shorter sequences are front-padded; padded state rows are
	filled with -1 so that the ``Masking(mask_value=-1.)`` layer skips them.
	"""

	def __init__(self,name,s_dim,a_dim,memory,a_bound,time_steps,lstm_units):
		"""Store the configuration and build the actor and critic models.

		Args:
			name: label for this agent (stored only).
			s_dim: size of the last axis of the state input.
			a_dim: number of discrete actions (width of the softmax head).
			memory: rollout buffer exposing the lists ``batch_s``, ``batch_s_``,
				``batch_a``, ``batch_r``, ``batch_done``, ``pred`` and a ``clear()`` method.
			a_bound: stored only; not used by this class.
			time_steps: sequence length of every model input.
			lstm_units: number of units in each LSTM layer.
		"""
		self.s_dim = s_dim
		self.a_dim =a_dim
		self.memory = memory
		self.a_bound =a_bound
		self.time_steps =time_steps
		self.name = name
		self.lstm_units= lstm_units
		self.actor  = self.make_actor()
		self.critic  = self.make_critic()

	def make_actor(self):
		"""Build and compile the policy network.

		Inputs (all with batch size 1): states, advantages and the old policy
		probabilities. The last two only feed the loss; at inference time any
		correctly shaped array can be supplied for them.

		Returns:
			A compiled Keras model with outputs
			``[action_probabilities, lstm_state_h, lstm_state_c]``.
		"""
		state_inputs = tf.keras.Input(batch_shape=(1,self.time_steps,self.s_dim), name='state')
		advantage = tf.keras.Input(batch_shape=(1,self.time_steps,1 ), name="Advantage")
		old_prediction= tf.keras.Input(batch_shape=(1,self.time_steps,self.a_dim), name="old_prediction")
		# Time steps whose features all equal -1 are treated as padding.
		x = tf.keras.layers.Masking(mask_value=-1.)(state_inputs)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)

		x,state_h, state_c = tf.keras.layers.LSTM(units= self.lstm_units,return_sequences=True,return_state=True,stateful=True,name='a_lstm',unroll = True)(x)
		x = tf.keras.layers.Dense(100, activation='relu')(x)
		action_outputs = tf.keras.layers.Dense(self.a_dim, activation='softmax',name = 'policy_head')(x)

		def proximal_policy_optimization_loss(advantage, old_prediction):
			"""Return a Keras loss closed over the advantage / old-policy input tensors."""
			loss_clipping = 0.2
			# Drop the batch axis (always of size 1).
			advantage = advantage[0]
			old_prediction = old_prediction[0]

			def loss(y_true, y_pred):
				# y_true is the one-hot action taken, y_pred the predicted probabilities.
				y_true = y_true[0]
				y_pred = y_pred[0]
				prob = y_true * y_pred
				old_prob = y_true * old_prediction
				# Probability ratio new/old; the epsilon guards against division by zero.
				r = prob / (old_prob + 1e-10)
				# Clipped surrogate objective. No entropy bonus is applied.
				clipped_r = tf.keras.backend.clip(r, min_value=1 - loss_clipping,max_value=1 + loss_clipping)
				return -tf.keras.backend.mean(tf.keras.backend.minimum(r * advantage, clipped_r * advantage))
			return loss

		actor_model= tf.keras.Model(inputs=[state_inputs, advantage,old_prediction], outputs=[action_outputs,state_h,state_c], name='actor_model')
		# `learning_rate` replaces the deprecated `lr` alias.
		actor_model.compile(loss={'policy_head':proximal_policy_optimization_loss(advantage=advantage,old_prediction=old_prediction)} ,optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))
		return actor_model

	def make_critic(self):
		"""Build and compile the value network.

		Returns:
			A compiled Keras model with outputs ``[value, lstm_state_h, lstm_state_c]``.
		"""
		state_inputs = tf.keras.Input(batch_shape=(1,self.time_steps,self.s_dim), name='state')
		x = tf.keras.layers.Masking(mask_value=-1.)(state_inputs)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)
		x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(300, activation='relu'))(x)
		x,state_h, state_c = tf.keras.layers.LSTM(units= self.lstm_units,return_sequences=True,return_state=True,stateful=True,name='c_lstm',unroll =True)(x)
		x = tf.keras.layers.Dense(100, activation='relu')(x)
		val = tf.keras.layers.Dense(1, activation=None,name = "value_head",use_bias=True)(x)
		# Author's note: the hope is that masking also skips gradients for padded steps;
		# the biases of the layers after the LSTM reportedly still train even when the
		# masked loss is 0.
		critic_model= tf.keras.Model(inputs=state_inputs, outputs=[val,state_h,state_c], name='critic_model')
		# `learning_rate` replaces the deprecated `lr` alias.
		critic_model.compile(loss={'value_head':'mse'} ,optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
		return critic_model

	def _zero_lstm_state(self):
		"""Return a fresh ``[h, c]`` pair of float32 zeros shaped ``(1, lstm_units)``."""
		shape = (1,self.lstm_units)
		return [np.zeros(shape).astype(np.float32), np.zeros(shape).astype(np.float32)]

	def set_lstm_state_actor(self,h,c):
		"""Set the actor LSTM's hidden state ``h`` and cell state ``c``."""
		self.actor.get_layer('a_lstm').reset_states([h,c])

	def zero_lstm_state_actor(self):
		"""Reset the actor LSTM's hidden and cell state to zeros."""
		self.actor.get_layer('a_lstm').reset_states(self._zero_lstm_state())

	def set_lstm_state_critic(self,h,c):
		"""Set the critic LSTM's hidden state ``h`` and cell state ``c``."""
		# The critic's LSTM layer is named 'c_lstm'; 'a_lstm' exists only in the actor.
		self.critic.get_layer('c_lstm').reset_states([h,c])

	def zero_lstm_state_critic(self):
		"""Reset the critic LSTM's hidden and cell state to zeros."""
		self.critic.get_layer('c_lstm').reset_states(self._zero_lstm_state())

	def onehot(self,a):
		"""Return a length-``a_dim`` float vector with a 1 at index ``a``."""
		vec = np.zeros(self.a_dim)
		vec[a]=1
		return vec

	def gae_calc(self,val,val_,rew,done,gamma=0.99,lambd=0.95):
		"""Compute GAE(lambda) returns (advantage estimate + value) for each step.

		Args:
			val: value estimates of the visited states.
			val_: value estimates of the corresponding next states.
			rew: rewards per step.
			done: per-step terminal flags; a truthy flag stops bootstrapping and
				resets the running advantage.
			gamma: discount factor.
			lambd: GAE smoothing factor.

		Returns:
			Array shaped (and typed) like ``val`` holding the returns.
		"""
		gae=0
		returns=np.zeros_like(val)
		for i in reversed(range(0,len(val))):
			mask = 0 if done[i] else 1
			delta=rew[i]+gamma*val_[i]*mask - val[i]
			gae=delta+gamma*lambd*mask*gae
			returns[i]=gae+val[i]
		return returns

	def adv_calc(self,val,val_,rew,done,gamma=0.99):
		"""Compute one-step bootstrapped returns ``r + gamma * V(s') * (1 - done)``.

		Returns:
			Array shaped (and typed) like ``val`` holding the returns.
		"""
		returns=np.zeros_like(val)
		for i in range(0,len(val)):
			returns[i] = rew[i] + (1- done[i])*val_[i]*gamma
		return returns

	def _front_pad(self,lst,fill):
		"""Front-pad ``lst`` along axis 0 with ``fill`` up to a multiple of ``time_steps``."""
		steps = self.time_steps
		rows = lst.shape[0]
		# Integer ceiling division: smallest multiple of `steps` that holds all rows.
		full_size = -(-rows // steps) * steps
		padded = np.full((full_size,lst.shape[-1]), float(fill))
		# Real data sits at the end so the most recent step is always last.
		padded[full_size-rows:] = lst
		return padded

	def batchify(self,lst):
		"""Front-pad with -1 (the value the Masking layers treat as padding)."""
		return self._front_pad(lst,-1.)

	def batchify_adv(self,lst):
		"""Front-pad with 0 (used for advantages and one-hot actions)."""
		return self._front_pad(lst,0.)

	def batchify_oldpred(self,lst):
		"""Front-pad with 1 (used for the old policy probabilities)."""
		return self._front_pad(lst,1.)

	def train(self,epochs=10):
		"""Fit actor and critic on everything in ``self.memory``, then clear it.

		The stored trajectory is cut into consecutive windows of ``time_steps``
		steps. Windows are fed in order with ``shuffle=False`` and the LSTM
		states are zeroed at the start of every epoch, so the recurrent state
		carries across windows within an epoch.

		Args:
			epochs: number of passes over the stored trajectory.
		"""
		time_horizon=self.time_steps
		# Nothing collected: the indexing below would fail on empty lists.
		if not self.memory.batch_s:
			return

		obs =np.array(self.memory.batch_s)
		# All visited states plus the final next-state, so that a single critic pass
		# yields both V(s_t) and V(s_{t+1}).
		all_obs=self.memory.batch_s.copy()
		all_obs.append(self.memory.batch_s_[-1])
		all_obs = np.array(all_obs)
		all_obs_no = all_obs.shape[0]
		all_obs_batch = self.batchify(all_obs)
		all_values = np.zeros((all_obs_batch.shape[0],1))
		self.zero_lstm_state_critic()
		for b in range(0,all_obs_batch.shape[0],time_horizon):
			vb, _, _ = self.critic.predict(x=(np.array([all_obs_batch[b:b+time_horizon]])))
			all_values[b:b+time_horizon] = vb[0]

		# Drop the front padding, then shift by one step to pair V(s_t) with V(s_{t+1}).
		values =all_values[-all_obs_no:-1]
		values_ = all_values[-all_obs_no+1:]
		# Alternative estimator: self.adv_calc(values,values_,self.memory.batch_r,self.memory.batch_done)
		returns = self.gae_calc(values,values_,self.memory.batch_r,self.memory.batch_done)
		advantage=np.array(returns-values)
		Action=np.array(self.memory.batch_a)
		Old_Prediction =np.array(self.memory.pred)

		obs = self.batchify(obs)
		advantage = self.batchify_adv(advantage)
		Action =self.batchify_adv(Action)
		Old_Prediction =self.batchify_oldpred(Old_Prediction)
		returns =self.batchify(returns)

		for ep in range(epochs):
			self.zero_lstm_state_actor()
			self.zero_lstm_state_critic()
			for b in range(0,obs.shape[0],time_horizon):
				window = slice(b,b+time_horizon)
				# train actor
				self.actor.fit(x=(np.array([obs[window]]),np.array([advantage[window]]),np.array([Old_Prediction[window]])),y={'policy_head':np.array([Action[window]])},shuffle=False, epochs=1, verbose=0)

				# train critic
				self.critic.fit(x=(np.array([obs[window]])),y={'value_head':np.array([returns[window]])},shuffle=False, epochs=1, verbose=0)
		self.memory.clear()



class Memory:
	"""Append-only rollout buffer made of parallel per-step lists.

	Attributes (one list each, all the same length): ``batch_s``, ``batch_a``,
	``batch_s_``, ``batch_r``, ``batch_done``, ``pred``, ``batch_h``, ``batch_c``.
	"""

	# Buffer attribute names, in the positional order accepted by store().
	_BUFFERS = ('batch_s', 'batch_a', 'batch_s_', 'batch_r', 'batch_done', 'pred', 'batch_h', 'batch_c')

	def __init__(self):
		for attr in self._BUFFERS:
			setattr(self, attr, [])

	def store(self, s, a, s_, r, done,pred,h,c):
		"""Append one transition: each argument goes to its matching list."""
		for attr, item in zip(self._BUFFERS, (s, a, s_, r, done, pred, h, c)):
			getattr(self, attr).append(item)

	def clear(self):
		"""Empty every list in place (the list objects themselves are kept)."""
		for attr in self._BUFFERS:
			getattr(self, attr).clear()

	def cnt_samples(self):
		"""Return the number of stored transitions."""
		return len(self.batch_s)


def front_pad_single(batchsize,data):
	"""Return a ``(batchsize, data.shape[-1])`` array of -1 whose last row is ``data``."""
	padded = np.full((batchsize, data.shape[-1]), -1.0)
	# Only the final row carries real data; every other row keeps the -1 fill.
	padded[-1, ...] = data
	return padded




# Candidate Gym environment ids; only the first entry is used below.
envs = ['CartPole-v1']

env= envs[0]
env1=gym.make(env)
# Work with the underlying environment instead of the object returned by gym.make.
# NOTE(review): presumably this drops the TimeLimit wrapper so that only the
# training loop's own step cap ends long episodes -- confirm.
env1=env1.unwrapped

# State size: first dimension of the observation space.
s_dim1 =env1.observation_space.shape[0]
print(s_dim1)
# Number of discrete actions.
a_dim1 =env1.action_space.n
print(a_dim1)
# Fixed to 1 for the discrete case (continuous variant: env1.action_space.high[0]).
a_bound1 = 1#env1.action_space.high[0]
print(a_bound1)

memory_1=Memory()
# Sequence length used for every actor/critic input.
time_horizon=64##########################

agent_1 =  ppo(name = "ppo_agent_01",s_dim=s_dim1 ,a_dim= a_dim1,memory = memory_1,a_bound=a_bound1,time_steps=time_horizon,lstm_units=100)

def preprocess(state):
	"""Observation hook applied to every environment state; currently the identity."""
	# Feature selection is disabled; a previous variant returned
	# np.array([state[0],state[1],state[4],state[6],state[7]]).
	processed = state
	return processed



# Zero-filled placeholders shaped like the actor's extra inputs. DUMMY_VALUE1 and
# DUMMY_ACTION1 are passed to actor.predict in the rollout loop, where the
# advantage / old-prediction inputs are needed only to satisfy the model signature.
# DUMMY_LOSS_MASK is not referenced anywhere else in the visible code.
DUMMY_ACTION1, DUMMY_VALUE1 ,DUMMY_LOSS_MASK = np.zeros((1,time_horizon,a_dim1)), np.zeros((1,time_horizon, 1)),np.zeros((1,time_horizon, 1))

# TODO: check whether the final hidden states are consistent by using time steps of 1, 2, 5, 10.
# TODO: find the value of the next state (vs_) for the terminal state.
# Upper bound of the episode counter (the loop starts at 1 and excludes this value).
episodes = 30000
# Step cap per episode, enforced by the rollout loop.
steps = 200
# Truthy to render the environment on every step.
render=0
print("starting>>>")



# Weight-file locations; loading is currently disabled.
apath="/gdrive/My Drive/ll_clean/ppo_lstm_clean_mean_tiny_actormodel64.h5"
cpath="/gdrive/My Drive/ll_clean/ppo_lstm_clean_mean_tiny_criticmodel64.h5"
#agent_1.actor.load_weights(apath)
#agent_1.critic.load_weights(cpath)
# NOTE: printed unconditionally, even though the load_weights calls above are commented out.
print("loaded")
# Episode rewards collected since the last report.
avg =[]
# Wall-clock start of the current reporting window.
t = time.time()
# Count of episodes with reward > 100 since the last report.
best = 0 
# Count of episodes with reward > 0 since the last report.
z =0
# Main loop: roll out one episode with the current policy, train on it, and
# print statistics every 100 episodes.
for episode in range(1,episodes):
	done1=False
	stp=0
	s1=env1.reset()
	s1 = preprocess(s1)
	# Start every episode with a zeroed actor LSTM state.
	agent_1.zero_lstm_state_actor()
	
	rews1 = 0	
	# Rendering is forced off after episode 500.
	if episode > 500:
		render=0

	while not done1:
		#if stp%(steps//10) ==0:
		#	print("+",end='')
		if render:
			env1.render()	
		# The current state goes in the last time step of a -1-padded sequence; the
		# actor's Masking(mask_value=-1.) layer treats the padded steps as masked.
		# The advantage / old-prediction inputs receive zero placeholders.
		output1= agent_1.actor.predict((np.array([front_pad_single(time_horizon,s1)]),DUMMY_VALUE1,DUMMY_ACTION1))
		#print(output1)
		pred_action = output1[0][0][-1]# action probabilities of the most recent time step
		# LSTM hidden and cell state returned by the actor after this step.
		h = output1[1]
		c = output1[2]
		#print(pred_action,pred_action.shape)
		action = np.random.choice(np.arange(pred_action.shape[0]), p=pred_action.ravel())# sample an action from the policy distribution
		action_one_hot= agent_1.onehot(action)# one-hot encoding of the chosen action
		s_1, reward1, done1, info1 = env1.step(action)
		#print("s_1",s_1, end='')	
		s_1 = preprocess(s_1)
		#print(" sub:",s_1-s1)
		stp=stp+1	


		# Force termination once the step cap is exceeded; the strict `>` lets an
		# episode run for steps+1 steps.
		if (stp>steps):# or (s_1[3]+s_1[4]>1):
			done1=True		
		agent_1.memory.store(s1,action_one_hot ,s_1,reward1,done1,pred_action.ravel(),h,c)# state, selected one-hot action, next state, reward, done, predicted probabilities, h, c
		rews1+=reward1

		s1 = s_1
	avg.append(rews1)
	if rews1>100:
		best+=1 
	if rews1>0:
		z+=1 		
	# One training pass per episode; train() clears the memory afterwards.
	agent_1.train()
	if episode % 100 == 0:
		print("current ",episode , " average of last 100 episodes :", np.mean(np.array(avg)), " max:", np.max(np.array(avg)), " min:", np.min(np.array(avg)), " more than 100 games:", best," more than 0 games:", z )
		# Reset the statistics for the next reporting window.
		z=0
		best = 0
		avg.clear()
	

		apath="/gdrive/My Drive/ll_clean/ppo_lstm_clean_mean_tiny_actormodel64.h5"
		cpath="/gdrive/My Drive/ll_clean/ppo_lstm_clean_mean_tiny_criticmodel64.h5"

		# Author's note: perfect solve around the 700th episode.
		# Checkpoint saving is currently disabled.
		#agent_1.actor.save_weights(apath)
		#agent_1.critic.save_weights(cpath)
		print(" total time taken (min): " ,(time.time()-t)/60)
		t = time.time()

		 

2.2.0
4
2
1
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
starting>>>
loaded
current  100  average of last 100 episodes : 148.87  max: 201.0  min: 9.0  more than 100 games: 75  more than 0 games: 100
 total time taken (min):  5.1113889137903845
current  200  average of last 100 episodes : 156.58  max: 201.0  min: 72.0  more than 100 games: 94  more than 0 games: 100
 total time taken (min):  4.975412770112356
current  300  average of last 100 episodes : 140.12  max: 201.0  min: 58.0  more than 100 games: 84  more than 0 games: 100
 total time taken (min):  4.395323395729065
current  400  average of last 100 episodes : 133.47  max: 201.0  min: 63.0  more than 100 games: 82  more than 0 games: 100
 total time taken (min):  4.2390475114186605
current  500  average of last 100 episodes : 152.72  max: 201.0  min: 26.0  more than 100 games: 93  more than 0 games: 100
 total time taken (min):  4.817508323987325
current  600  average of last 100 episodes : 17

KeyboardInterrupt: ignored