In [None]:
# sudo pip3 install box2d box2d-kengz



"""
Train a simple 1 hidden layer nn to for lunar_lander(provide link) using torch


"""
import gym
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


class NeuralNet(nn.Module):
    """Policy network with a single hidden layer.

    The forward pass is ``Linear -> ReLU -> Linear -> softmax``, so the
    output is a probability distribution over actions. The module also owns
    the Adam optimizer that trains its own parameters.

    Args:
        state_size: Number of input features.
        action_size: Number of action probabilities produced.
        hidden_layer_size: Width of the hidden layer.
        learning_rate: Learning rate passed to Adam.
    """

    def __init__(self, state_size, action_size, hidden_layer_size, learning_rate=0.001):
        super().__init__()

        self.linear1 = nn.Linear(state_size, hidden_layer_size)
        self.linear2 = nn.Linear(hidden_layer_size, action_size)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: NumPy array, tensor or array-like whose last dimension has
                ``state_size`` entries. Leading batch dimensions are allowed.

        Returns:
            A tensor with the same leading dimensions as ``x`` and a last
            dimension of ``action_size`` that sums to 1.
        """
        # Match the dtype and device of the parameters so the network keeps
        # working after it has been moved with ``.to(device)``.
        reference = self.linear1.weight
        x = torch.as_tensor(x, dtype=reference.dtype, device=reference.device)
        hidden = F.relu(self.linear1(x))
        # Normalise over the action axis (the last one); dim=0 would
        # normalise across the batch for any 2-D input.
        return F.softmax(self.linear2(hidden), dim=-1)
    
    
class RL_Agent:
    """Monte-Carlo policy-gradient (REINFORCE-style) agent.

    The agent samples actions from a ``NeuralNet`` policy, plays whole
    episodes in ``env`` and, when asked to train, updates the policy once per
    episode from the normalised discounted returns.

    Args:
        env: Gym-style environment exposing ``reset()`` and ``step(action)``.
        state_size: Number of features in one observation.
        action_size: Number of discrete actions.
        hidden_layer_size: Width of the policy network's hidden layer.
        horizon: Maximum number of environment steps per episode.
        discount_factor: Discount applied to future rewards.
    """

    def __init__(self, env, state_size, action_size, hidden_layer_size, horizon=1000, discount_factor=0.99):
        self.state_size = state_size
        self.action_size = action_size
        # ``device`` is the module-level torch.device; it has to be defined
        # before an agent is instantiated.
        self.pg_network = NeuralNet(state_size, action_size, hidden_layer_size).to(device)
        self.horizon = horizon
        self.env = env
        self.discount_factor = discount_factor

    @staticmethod
    def _unpack_reset(result):
        """Return the observation from either Gym ``reset()`` convention."""
        # Newer Gym releases return ``(observation, info_dict)``; older ones
        # return the bare observation.
        if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
            return result[0]
        return result

    @staticmethod
    def _unpack_step(result):
        """Return ``(next_state, reward, done)`` from either Gym ``step()`` convention."""
        if len(result) == 5:
            # Newer API: (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, _ = result
        return next_state, reward, done

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Observation passed to the policy network.

        Returns:
            ``(action, log_prob)``: the sampled action index and the log
            probability of that action as a tensor that keeps its autograd
            graph.
        """
        probs = self.pg_network.forward(state).reshape(-1)
        # NumPy cannot read CUDA tensors, so copy to host memory first.
        # float32 rounding can leave the sum slightly off 1, which
        # ``np.random.choice`` may reject, so renormalise in float64.
        sampling_probs = probs.detach().cpu().numpy().astype(np.float64)
        sampling_probs /= sampling_probs.sum()
        action = np.random.choice(self.action_size, p=sampling_probs)
        log_prob = torch.log(probs[action])

        return action, log_prob

    def run_episode(self, train=False):
        """Play one episode and optionally update the policy from it.

        The episode ends when the environment reports it is done or after
        ``self.horizon`` steps, whichever comes first.

        Args:
            train: When truthy, call ``update_policy`` with the collected
                rewards and log probabilities after the episode.

        Returns:
            The undiscounted sum of rewards collected in the episode.
        """
        rewards = []
        log_probs = []

        state = self._unpack_reset(self.env.reset())
        total_reward = 0
        done = False
        steps = 0

        # Strict ``<`` so that at most ``horizon`` steps are taken.
        while not done and steps < self.horizon:
            steps += 1

            action, log_prob = self.get_action(state)
            state, reward, done = self._unpack_step(self.env.step(action))

            total_reward += reward
            log_probs.append(log_prob)
            rewards.append(reward)

        if train:
            self.update_policy(np.array(rewards), log_probs)

        return total_reward

    def update_policy(self, rewards, log_probs):
        """Apply one policy-gradient step from a finished episode.

        Args:
            rewards: Per-step rewards of the episode, in time order.
            log_probs: Log-probability tensors of the actions taken, one per
                step, still attached to the autograd graph.
        """
        if len(log_probs) == 0:
            # Nothing was played, so there is nothing to learn from.
            return

        # Discounted return-to-go G_t = r_t + gamma * G_{t+1}, accumulated
        # back to front in a single pass.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + self.discount_factor * running_return
            returns.append(running_return)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float64)
        normalised = returns - returns.mean()
        # The sample standard deviation is undefined for one step and zero
        # for constant returns; dividing by it would fill the network with
        # NaNs, so only rescale when it is a usable positive number.
        if returns.numel() > 1:
            spread = returns.std()
            if spread > 0:
                normalised = normalised / spread

        stacked_log_probs = torch.stack(list(log_probs)).reshape(-1)
        normalised = normalised.to(dtype=stacked_log_probs.dtype, device=stacked_log_probs.device)

        # Minimising -sum(log_prob * return) ascends the policy-gradient objective.
        loss = -(stacked_log_probs * normalised).sum()

        self.pg_network.optimizer.zero_grad()
        loss.backward()
        self.pg_network.optimizer.step()


# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


env = gym.make("LunarLander-v2")
input_dim = int(env.observation_space.shape[0])  # length of one observation vector
output_dim = int(env.action_space.n)  # number of discrete actions

print(input_dim, output_dim)


# Size the policy from the environment instead of hard-coding the dimensions,
# so the network always matches the observation and action spaces in use.
agent = RL_Agent(env=env, state_size=input_dim, action_size=output_dim, hidden_layer_size=16)


try:
    # One policy update per episode; print the episode index and its total reward.
    for episode in range(10000):
        reward = agent.run_episode(train=True)
        print(episode, reward)
finally:
    # Release the environment's resources even if training is interrupted.
    env.close()
