In [1]:
''' standard library imports'''
import os

''' model imports'''
import torch 
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 

''' data imports'''
from collections import namedtuple 
import gym 



''' print related options'''
from pprint import pprint 

''' writer '''
from tensorboardX import SummaryWriter 

***
**CartPole environment**

- each observation is a vector of 4 values
- at every step the agent chooses one of 2 discrete actions

In [3]:
''' analyse and create the environment '''
env = gym.make('CartPole-v0')

# Monitor writes episode statistics under ./mon. force=True lets it overwrite
# monitor files left there by an earlier run; video_callable=False disables
# video recording.
env = gym.wrappers.Monitor(env, directory= "mon", force= True, video_callable= False)
# NOTE: no virtual display (xvfb) is started here. The earlier shell call
# `!xvfb-run -s "-screen 0 640*480*24"` launched no program, wrote the screen
# geometry with `*` where Xvfb expects `x` (WxHxD), and failed with
# "xvfb-run: not found". With video recording disabled nothing is rendered,
# so a display is not needed.
print("action space \t", env.action_space)
print("observation space \t", env.observation_space)
print("sample observation \t", env.reset())
print("reward range \t", env.reward_range)
# print("env ", env.env)
# print("spec ", env.spec)
# print("unwrapped ", env.unwrapped)

# Network input/output sizes are read from the environment's declared spaces;
# len(env.reset()) would give the same n_obs at the cost of an extra reset.
n_actions = env.action_space.n
n_obs = env.observation_space.shape[0]

print("no of actions = ", n_actions)
print("no of obs = ", n_obs)

/bin/sh: 1: xvfb-run: not found
action space 	 Discrete(2)
observation space 	 Box(4,)
sample observation 	 [-0.04157752 -0.00740001 -0.01385926 -0.03295999]
reward range 	 (-inf, inf)
no of actions =  2
no of obs =  4


***
useful python functions:
 - [lambda function](https://www.programiz.com/python-programming/anonymous-function) 
 - [map](https://www.programiz.com/python-programming/methods/built-in/map)

In [None]:
''' define the model '''
''' n_obs -> Hidden -> n_actions '''

Hidden = 128  # number of units in the single hidden layer


class Net(nn.Module):
    """Fully connected policy network: n_obs -> hidden -> n_actions.

    The forward pass returns one raw score per action; no softmax is applied
    inside the network.
    """

    def __init__(self, n_obs, hidden, n_actions):
        super().__init__()

        # Layers are created in input-to-output order and stacked under the
        # attribute name `denseNet`, so parameter names are
        # denseNet.0.* and denseNet.2.*.
        input_layer = nn.Linear(n_obs, hidden)
        activation = nn.ReLU()
        output_layer = nn.Linear(hidden, n_actions)
        self.denseNet = nn.Sequential(input_layer, activation, output_layer)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the action scores."""
        scores = self.denseNet(x)
        return scores
        
# Instantiate the policy network with the observation/action sizes read from
# the environment and the hidden width configured above, then show its layers.
net = Net(n_obs,Hidden, n_actions)
print(net)

# Optional: uncomment to start from previously saved weights instead of a
# freshly initialised network.
# net.load_state_dict(torch.load('./model/cartpoleNet.pt'))

In [None]:
# Storage containers for sampled experience.

# Episode: the total reward of one finished episode and its list of steps.
Episode = namedtuple('Episode', ['reward', 'steps'])
# EpisodeStep: the observation the agent saw and the action it took on it.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])

In [None]:
''' generate data '''
batch_size = 16  # number of finished episodes collected per yielded batch


def iterate_batches(env, net, batch_size):
    """Play episodes with the current policy and yield them in batches.

    This is an endless generator: it keeps stepping ``env`` with actions
    sampled from ``softmax(net(observation))`` and yields a list of
    ``batch_size`` ``Episode`` records each time that many episodes have
    finished. ``net`` is queried on every step, so any weight update the
    caller performs between two yields is used for the next batch.

    Args:
        env: environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done, info)``.
        net: module mapping a ``[1, n_obs]`` float tensor to per-action scores.
        batch_size: number of finished episodes per yielded batch (>= 1).

    Yields:
        list of ``Episode(reward, steps)``, where ``steps`` is a list of
        ``EpisodeStep(observation, action)``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the loop would
            otherwise run forever without ever yielding).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    batch = []
    episode_steps = []
    episode_reward = 0.0

    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # The network expects [batch, n_obs]; converting through one float32
        # array avoids the slow list-of-ndarrays tensor construction.
        obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)

        # Sampling needs no gradients. The no_grad block is kept tight so it
        # never spans the `yield` below.
        with torch.no_grad():
            action_probs = sm(net(obs_v)).numpy()[0]

        action = np.random.choice(a=len(action_probs), p=action_probs)
        next_obs, reward, is_done, _ = env.step(action)

        episode_reward += reward
        # Store the observation the action was chosen for, not its result.
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))

            episode_reward = 0.0
            episode_steps = []

            next_obs = env.reset()

            if len(batch) == batch_size:
                yield batch
                batch = []

        obs = next_obs
        
        
################### filter batches 

def filterBatch(batch:list, percentile:int):
    """Select the training data from the best episodes of a batch.

    The reward bound is the given percentile of the episode rewards. Every
    episode whose reward is not below that bound contributes all of its
    (observation, action) pairs to the training set.

    Args:
        batch: list of records with ``reward`` and ``steps`` attributes; each
            step has ``observation`` and ``action`` attributes.
        percentile: percentile (0-100) of episode rewards used as the cut-off.

    Returns:
        Tuple ``(train_obs, train_actions, rewards_bound, rewards_mean)``:
        a float tensor of observations, a long tensor of the matching actions,
        the reward cut-off, and the mean reward over the whole batch.
    """
    episode_rewards = [episode.reward for episode in batch]
    rewards_bound = np.percentile(episode_rewards, percentile)
    rewards_mean = np.mean(episode_rewards)

    # Flatten the steps of every episode that reaches the bound, keeping the
    # episodes and their steps in the original order.
    kept_steps = [
        step
        for episode in batch
        if not episode.reward < rewards_bound
        for step in episode.steps
    ]
    observations = [step.observation for step in kept_steps]
    actions = [step.action for step in kept_steps]

    train_obs = torch.FloatTensor(observations)
    train_actions = torch.LongTensor(actions)

    return train_obs, train_actions, rewards_bound, rewards_mean


In [None]:
# Raise gym's logger to INFO so the per-iteration messages sent through
# log.info(...) in the training loop are shown, and keep a short alias for it.
gym.logger.set_level(gym.logger.INFO)
log = gym.logger

In [None]:
# Loss: cross-entropy between the network's action scores and the actions
# taken in the selected episodes (it applies the softmax internally, which is
# why the network itself outputs raw scores).
objective = nn.CrossEntropyLoss()

# Optimizer: Adam over all network parameters.
LEARNING_RATE = 0.01
opt = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
''' main script '''
PERCENTILE = 50                         # train on episodes at or above the batch median reward
SOLVED_MEAN_REWARD = 199                # stop once a batch's mean reward exceeds this
MODEL_PATH = './model/cartpoleNet.pt'   # where the solved policy's weights are saved

writer = SummaryWriter()
try:
    # iterate_batches resets the environment itself and never terminates; the
    # loop ends only through the `break` below.
    for iter_no, batch in enumerate(iterate_batches(env, net, batch_size)):
        train_obs, train_actions, rewards_bound, rewards_mean = filterBatch(batch, PERCENTILE)

        ################# train #################################
        opt.zero_grad()

        action_scores = net(train_obs)

        loss = objective(action_scores, train_actions)
        loss.backward()
        opt.step()

        ############### write to tensorboard ####################
        log.info("Iter %d: loss=%.3f, mean reward =%.3f, reward bound = %.3f", iter_no,loss.item(),rewards_mean, rewards_bound)
        writer.add_scalar("loss", loss.item(), iter_no)
        writer.add_scalar("mean_reward", rewards_mean, iter_no)
        writer.add_scalar("reward_bound", rewards_bound, iter_no)

        if rewards_mean > SOLVED_MEAN_REWARD:
            print(rewards_mean)
            print("solved")
            print("saving model weights ")
            # Create the target directory first so a long training run is not
            # lost to a missing ./model folder.
            os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
            torch.save(net.state_dict(), f= MODEL_PATH)
            break
finally:
    # Close the writer exactly once, after training ends (or is interrupted),
    # rather than after every iteration.
    writer.close()

In [None]:
def test(env, net):
    episode_reward = 0.0 
    obs = env.reset() # list return 
    sm = nn.Softmax(dim= 1 )
    
    rewards = []
    
    while True:
        obs_v = torch.FloatTensor([obs]) # torch accepts in format [batch_size, n_obs]
        net_output = sm(net(obs_v))
        
        action_probs = net_output.data.numpy()[0]
        action = np.random.choice(a = len(action_probs), p = action_probs)
        next_obs, reward, is_done, _ = env.step(action)
        
        episode_reward += reward 
        
        if is_done:
            print("episode reward: ", episode_reward)
            rewards.append(episode_reward)
            
            episode_reward = 0.0 
            next_obs = env.reset()
            
            if len(rewards) == 100:
                print("mean reward of hundred episodes ", np.mean(rewards))
                break 
                
                
                
        obs = next_obs 
        
# Evaluate the trained policy: plays 100 complete episodes on the monitored
# environment and prints each episode's reward and the overall mean.
test(env, net)