In [1]:
import numpy as np
import gym
import tensorflow as tf
import collections,itertools
import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [None]:
# Single shared environment; the Policy/Value classes and the cells below
# read it through the module-level name `env`.
ENV_ID = 'CartPole-v0'
env = gym.make(ENV_ID)

In [None]:
# Display the observation shape; its first entry is used as the input width
# of the Policy and Value networks defined below.
env.observation_space.shape

In [None]:
class Policy():
    """Softmax policy estimator: one dense layer from observation to action probabilities.

    The network dimensions are read from the module-level ``env``
    (``env.observation_space.shape[0]`` inputs, ``env.action_space.n`` outputs),
    so ``env`` must exist before this class is instantiated.

    Parameters
    ----------
    lr : float
        Learning rate handed to the Adam optimizer.
    """

    def __init__(self,lr=0.01):

        # Kept only so existing attribute access keeps working; nothing in
        # this class reads it.
        self.state = []
        self.ob_space = env.observation_space.shape
        self.action_space = env.action_space.n

        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.InputLayer(input_shape=(self.ob_space[0],)))
        self.model.add(tf.keras.layers.Dense(self.action_space, activation="softmax"))

        # `lr` used to be accepted and then silently dropped (Adam ran with
        # its library default); wire it through so the argument has an effect.
        self.opt = tf.keras.optimizers.Adam(learning_rate=lr)

        # Single source of truth for the variables that are differentiated
        # and updated in `update`.
        self.var_list_fn = list(self.model.trainable_weights)

    def predict(self,state):
        """Return the model's action probabilities for a batch of states.

        ``state`` is forwarded unchanged to ``self.model.predict``; callers in
        this notebook pass an array of shape ``(1, obs_dim)``.
        """
        return self.model.predict(state)

    def update(self,state,target,action):
        """Apply one policy-gradient step and return the loss.

        The loss is ``-log(pi(action | state)) * target``; it is also stored
        on ``self.loss``. ``target`` may be a scalar or an array; a non-scalar
        loss is summed by ``tape.gradient`` when gradients are taken.

        Parameters
        ----------
        state : array-like, one state with a leading batch axis of 1.
        target : scalar or array-like weight for the log-probability
            (the advantage, in ``reinforce``).
        action : integer index of the action that was taken.
        """
        # Trainable variables are watched by the tape automatically, so no
        # explicit tape.watch is needed.
        with tf.GradientTape() as tape:
            # squeeze drops the batch axis so `action` indexes the
            # probability vector directly.
            output = tf.squeeze(self.model(state))

            self.picked_action_prob = tf.gather(output, action)

            # Cast so float64 NumPy targets cannot clash with the model dtype.
            weight = tf.cast(target, output.dtype)
            self.loss = -tf.math.log(self.picked_action_prob) * weight

        # Differentiate with respect to exactly the variables that are
        # updated, so gradients and variables can never get out of step.
        grads = tape.gradient(self.loss, self.var_list_fn)
        self.opt.apply_gradients(zip(grads, self.var_list_fn))

        return self.loss

        
    

In [None]:
# Smoke test: run a single Policy.update on dummy inputs and print the loss.
p = Policy()

# One all-zero observation with a leading batch axis, sized from the
# environment instead of a hard-coded 4. (The former `state`/`target`/`action`
# placeholders were built with np.zeros_like on a plain int, which yields a
# 0-d scalar rather than an array of that length, and were never used.)
test = np.zeros((1, env.observation_space.shape[0]))

act = np.array(0)  # index of the action whose probability is reinforced

# Dummy advantage values used as the update weight.
ad = np.array([[1.0330247,1.0195254,1.0393808,0.9419981]])

print(p.update(test,ad,act))
        
        

In [None]:
class Value():
    """State-value baseline: one linear layer from observation to a scalar estimate.

    The input width is read from the module-level ``env``
    (``env.observation_space.shape[0]``), so ``env`` must exist before this
    class is instantiated.

    Parameters
    ----------
    lr : float
        Learning rate handed to the Adam optimizer.
    """

    def __init__(self,lr=0.01):

        self.ob_space = env.observation_space.shape

        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.InputLayer(input_shape=(self.ob_space[0],)))
        # A state value is one number per state. The head used to emit one
        # output per observation dimension, which made the baseline (and the
        # advantage derived from it) a vector instead of a scalar.
        self.model.add(tf.keras.layers.Dense(1))

        # Single source of truth for the variables that are differentiated
        # and updated in `update`.
        self.var_list_fn = list(self.model.trainable_weights)

        # `lr` used to be accepted and then silently dropped; wire it through.
        self.opt = tf.keras.optimizers.Adam(learning_rate=lr)

    def predict(self,state):
        """Return the model's value estimate for a batch of states.

        ``state`` is forwarded unchanged to ``self.model.predict``; callers in
        this notebook pass an array of shape ``(1, obs_dim)``.
        """
        return self.model.predict(state)

    def update(self,state,target):
        """Apply one mean-squared-error regression step toward ``target``.

        The loss is stored on ``self.loss`` and returned. ``target`` is
        broadcast against the model output by the loss function, so a scalar
        return works as a target.
        """
        # Trainable variables are watched by the tape automatically, so no
        # explicit tape.watch is needed.
        with tf.GradientTape() as tape:
            output = self.model(state)
            self.loss = tf.keras.metrics.mean_squared_error(target,output)

        # Differentiate with respect to exactly the variables that are
        # updated, so gradients and variables can never get out of step.
        grads = tape.gradient(self.loss, self.var_list_fn)
        self.opt.apply_gradients(zip(grads, self.var_list_fn))

        return self.loss
        
        
        

In [None]:
# Smoke test: one Value.update where the all-zero dummy observation doubles
# as the regression target; the resulting loss is printed.
v = Value()
test = np.zeros((1, 4))

print(v.update(test, test))

In [None]:
def reinforce(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0, verbose=True):
    """Run REINFORCE with a learned baseline and return per-episode logs.

    For each episode the policy is rolled out until the environment reports
    ``done``. Then, for every visited step, the discounted return from that
    step onward is used as the regression target for the value estimator,
    and ``return - baseline`` is used as the weight of the policy update.

    Parameters
    ----------
    env : environment exposing ``reset() -> state`` and
        ``step(action) -> (next_state, reward, done, info)``.
    estimator_policy : object with ``predict(state)`` returning action
        probabilities and ``update(state, advantage, action)``.
    estimator_value : object with ``predict(state)`` and
        ``update(state, target)``.
    num_episodes : int
        Number of episodes to run.
    discount_factor : float
        Discount applied per step when accumulating returns.
    verbose : bool
        When True (the default), print one line per environment step.

    Returns
    -------
    (log_t, log_i) : tuple of two lists with one inner list per episode.
        ``log_t[e][k]`` is the cumulative reward after step ``k`` of episode
        ``e``; ``log_i[e][k]`` is the step index ``k``.
    """
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    log_t = []
    log_i = []

    for _ in range(num_episodes):

        state = env.reset()

        episode = []
        cumulative_rewards = []
        step_indices = []
        log_t.append(cumulative_rewards)
        log_i.append(step_indices)

        r_ = 0

        for t in itertools.count():

            # Sample an action from the current stochastic policy.
            action_probs = estimator_policy.predict(state.reshape(1,-1)).reshape(-1)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            if verbose:
                print(f"step =>{action} reward =>{reward} iter =>{t}")

            # Record the transition (s, a, r, s', done) for the update phase.
            episode.append(Transition(
              state=state, action=action, reward=reward, next_state=next_state, done=done))

            r_ += reward
            cumulative_rewards.append(r_)
            step_indices.append(t)

            if done:
                break

            state = next_state

        # Discounted return from every step onward, accumulated backwards in
        # a single pass. The previous code sliced `episode[t:]` with `t` left
        # over from the rollout loop (the last step index), so every step was
        # credited with only the final reward.
        returns = [0.0] * len(episode)
        running_return = 0.0
        for k in range(len(episode) - 1, -1, -1):
            running_return = episode[k].reward + discount_factor * running_return
            returns[k] = running_return

        for transition, total_return in zip(episode, returns):
            step_state = transition.state.reshape(1,-1)

            # Advantage = observed return minus the baseline's estimate.
            baseline_value = estimator_value.predict(step_state)
            advantage = total_return - baseline_value

            # Update the baseline first, then the policy, as before.
            estimator_value.update(step_state, total_return)
            estimator_policy.update(step_state, advantage, transition.action)

    return log_t,log_i
            
        

In [None]:
# Build fresh estimators and train them for 10 episodes; `re` holds the
# (log_t, log_i) tuple returned by reinforce.
policy, value = Policy(), Value()
re = reinforce(env, policy, value, num_episodes=10)

In [None]:
# Unpack the training logs: log_t holds per-episode cumulative rewards,
# log_i the matching per-episode step indices.
log_t,log_i = re


In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:


# Plot the cumulative reward curve of every logged episode on one axes.
fig, ax = plt.subplots()

# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes".
sns.lineplot(data=log_t, ax=ax)

# One label per logged episode, so the legend follows the number of episodes
# actually run rather than a hard-coded 10.
episode_labels = ["ep " + str(i) for i in range(len(log_t))]

ax.set(title="Cumulative reward per episode", xlabel="step", ylabel="cumulative reward")
ax.legend(episode_labels, ncol=2, loc='upper left');

    

In [None]:
# Roll out one episode with uniformly random actions, printing each action.
# Uncomment env.render() inside the loop to watch the episode.
env.reset()
for _ in range(1000):
    #env.render()
    a = env.action_space.sample()
    print(a)
    # Same 4-tuple step API as in reinforce(). Stop once the episode ends:
    # the loop used to keep stepping a finished environment.
    _, _, done, _ = env.step(a)
    if done:
        break
env.close()
