In [7]:
''' imports '''

import gym
import numpy as np 


''' model imports'''
import torch 
import torch.nn as nn 

''' data imports'''
from collections import namedtuple 

''' visualisation '''
from tensorboardX import SummaryWriter

# Some Gym Concept and analysis 

    - observation space  Box(4,) 
    - action space:  Discrete(2) 
    - sample observation:  [-0.00220126 -0.01694481  0.01935941 -0.03202244]
    - high:  3.4028235e+38
    - low:  -3.4028235e+38

# Building The solution

**issue**:

    - our model was not imporving due to several reasons, which are hence the limitations of crossentropy method also. 
        - episodes should have been finite and preferably short.
        - the total reward for the episodes should have enough variability to seprate good episodes from bad
        - there is no intermediate indication about whether the agent has succedded or failed
    
**solution: clearly crooentropy method is not the good solution for it, but we can improve over croosentropy using some fixes**
    - larger batches of played episodes
    - discount factor applied to reward
    - keeping elite episodes for a longer time
    - decrease learning rate
    - much longer training time. 

In [8]:
batch_size = 128 
lr = 0.001 
percentile = 30
Gamma = 0.9 

In [9]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(DiscreteOneHotWrapper, self).__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), dtype=np.float32)

    def observation(self, observation):
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res

In [10]:
env = DiscreteOneHotWrapper(gym.make("FrozenLake-v0"))

print("observation space: ", env.observation_space)
print("action space: ", env.action_space)
print("sample observation: ", env.reset())


n_obs = env.observation_space.shape[0]
n_actions = env.action_space.n 


observation space:  Box(16,)
action space:  Discrete(4)
sample observation:  [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [11]:
hidden = 128 

class Net(nn.Module):
    
    def __init__(self, n_obs, hidden, n_actions):
        super(Net, self).__init__()
        
        self.pipe = nn.Sequential(nn.Linear(in_features= n_obs, out_features= hidden),
                                  nn.ReLU(),
                                  nn.Linear(in_features= hidden, out_features= n_actions))
        
    def forward(self, x):
        return self.pipe(x) 
    
net = Net(n_obs, hidden, n_actions)
print(net)

Net(
  (pipe): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=4, bias=True)
  )
)


In [12]:
Episode = namedtuple('Episode', field_names= ("reward", "steps"))
Episode_step = namedtuple('Episode_step', field_names= ("observation", "action"))

In [13]:
def iterate_batches(env, net, batch_size):
    batch = []
    single_episode = [] 
    
    sm = nn.Softmax(dim= 1)
    obs_ = env.reset()
    
    episode_reward = 0.0 
    while True:
        # obs -> get probs -> take action based on probs
        obs = torch.FloatTensor([obs_])
        action_probs = sm(net(obs)).data.numpy()[0]
        action = np.random.choice(a = n_actions, p = action_probs)
        
        # single step in episode completes, append to the single_episode as an Episode Step 
        single_episode.append(Episode_step(observation= obs_, action= action)) 
        new_obs, reward, done, info = env.step(action)
        
        episode_reward += reward 
        
        if done:
            # episode done, so make an episode tuple and append to the batch 
            batch.append(Episode(reward= episode_reward, steps= single_episode.copy()))
            # clear for next episode 
            episode_reward = 0.0 
            single_episode.clear()
            new_obs = env.reset()
            
            if len(batch) == batch_size:
                yield batch
                batch.clear() 
                
        obs_ = new_obs 

In [21]:
def filterBatches(batch, percentile):
    
    ############### using discounted rewards ###########################
    rewards = list(map(lambda s: s.reward*(Gamma**len(s.steps)), batch))
    rewards_bound = np.percentile(a = rewards, q= percentile)
    
    train_obs = []
    train_action = []
    elite_batch = []
                   
    for episode, disc_reward in zip(batch, rewards):
        if episode.reward >= rewards_bound:
            observations = list(map(lambda s: s.observation, episode.steps))
            actions = list(map(lambda s: s.action, episode.steps))
            
            train_obs.extend(observations)
            train_action.extend(actions) 
            elite_batch.append(episode)

       
    train_obs = torch.FloatTensor(train_obs)
    train_action = torch.LongTensor(train_action)
    return train_obs, train_action,elite_batch, rewards_bound 
    
    
# for batch in iterate_batches(env, net, 3):
#     break 
    
# o, a, m, b = filterBatches(batch, 70)
# print(o.shape, a.shape, m, b)

# # torch.Size([7, 16]) torch.Size([7]) 0.0 0.0

In [15]:
logs = gym.logger 
logs.set_level(gym.logger.INFO)

In [30]:
writer = SummaryWriter(comment= 'extended')

In [17]:
objective = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params= net.parameters(), lr= 0.001, betas= (0.9,0.999))

In [31]:
''' main script '''
elite_batches = []
for i, batch in enumerate(iterate_batches(env, net, batch_size)):
    r_mean = float(np.mean(list(map(lambda s:s.reward, batch))))
    # get data
    obs, actions, elite_batches, r_bound = filterBatches(elite_batches + batch, percentile= percentile)
    
    if not elite_batches:
        continue 
    
    
    ############################# train ##################################
    # set gradients zero 
    opt.zero_grad()
    
    # forward
    logits = net(obs)
    # loss
    loss = objective(logits, actions)
    # gradients
    loss.backward()
    # optimize
    opt.step()
    
    ############################ writer ####################################
    if i % 100 == 0:
        logs.info("loss: %.3f mean_reward: %3.f bound_reward: %.3f %d",loss.item(), r_mean, r_bound, i)
        writer.add_scalar('mean_reward: ', r_mean, i)
        writer.add_scalar('loss', loss.item(), i)
        writer.add_scalar('bound_reward', r_bound, i)

    if r_mean > 0.8:
        print("solved with mean of ", r_mean)
        break

INFO: loss: 1.373 mean_reward:   0 bound_reward: 0.000 0
INFO: loss: 1.370 mean_reward:   0 bound_reward: 0.000 100
INFO: loss: 1.370 mean_reward:   0 bound_reward: 0.000 200
INFO: loss: 1.370 mean_reward:   0 bound_reward: 0.000 300
INFO: loss: 1.371 mean_reward:   0 bound_reward: 0.000 400
INFO: loss: 1.372 mean_reward:   0 bound_reward: 0.000 500
INFO: loss: 1.371 mean_reward:   0 bound_reward: 0.000 600
INFO: loss: 1.371 mean_reward:   0 bound_reward: 0.000 700
INFO: loss: 1.373 mean_reward:   0 bound_reward: 0.000 800
INFO: loss: 1.370 mean_reward:   0 bound_reward: 0.000 900
INFO: loss: 1.369 mean_reward:   0 bound_reward: 0.000 1000
INFO: loss: 1.368 mean_reward:   0 bound_reward: 0.000 1100
INFO: loss: 1.367 mean_reward:   0 bound_reward: 0.000 1200


KeyboardInterrupt: 

**<img src="./bound_reward_naive curve.svg" style = "width:400px;"> Reward bound </img>**
 > reward bound is not improving, indicationg percentile = 70 th number episode is still zero.
 
**<img src="./loss_naive curve.svg" style = "width:600px;" > loss curve </img>**
**<img src="./mean_reward_naive curve.svg" style = "width:400px;" > mean reward curve</img>**
   >mean reward near to zero and not improving much. 

In [None]:
torch.save