In [1]:
import gym
from collections import deque
import numpy as np
import time

import torch
torch.manual_seed(0) # set random seed
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from policy import Policy
from gym.wrappers.monitoring.video_recorder import VideoRecorder
# import tqdm
import pdb

In [2]:
env = gym.make('LunarLander-v2')
env.seed(0)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy(s_size=8, h_size=256, a_size=4).to(device)
optimizer = optim.Adam(policy.parameters(), lr=0.001)

observation space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
action space: Discrete(3)


In [6]:
def reinforce(n_episodes=300, max_t=5000, gamma=0.99, print_every=100):
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
#             env.render()
            rewards.append(reward)
            if done:
                break 
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))
        
        discounts = [gamma**i for i in range(len(rewards)+1)]
        R = sum([a*b for a,b in zip(discounts, rewards)])
        
        policy_loss = []
        for log_prob in saved_log_probs:
            policy_loss.append(-log_prob * R)
        policy_loss = torch.cat(policy_loss).sum()
        
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            torch.save(policy.state_dict(), 'checkpoint.pth')
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))        
    return scores

In [7]:
scores = reinforce()

Episode 100	Average Score: -335.95


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
c=np.array([i for i in range(scores.shape[0])])
plt.plot(c,scores, label ='total reward vs episode')
plt.legend()
plt.show()