<a href="https://colab.research.google.com/github/prasad-kumkar/Deep-Learning/blob/master/Navigation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn.functional as F 
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt
import numpy as np 

import random 
from collections import namedtuple, deque

from unityagents import UnityEnvironment


LR = 5e-4
TAU = 1e-3
GAMMA = 0.99
UPDATE_EVERY = 4
BUFFER_SIZE = int(1e5)
BATCH_SIZE = 64


class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Args:
        state_size: Number of input features (width of the first linear layer).
        action_size: Number of outputs (one estimated Q-value per action).
        seed: Value passed to ``torch.manual_seed`` before the layers are created.
        fc1_units: Width of the first hidden layer.
        fc2_units: Width of the second hidden layer.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super(QNetwork, self).__init__()
        # Seeding happens before layer construction so weight init is repeatable.
        self.seed = torch.manual_seed(seed)

        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, x):
        """Return the Q-value estimates for the batch of states ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # The output layer is left linear: Q-values are return estimates and
        # must be free to take any real value. Squashing them with a sigmoid
        # would clamp them to (0, 1) and make the TD target unreachable.
        return self.fc3(x)

In [0]:

class Agent:
    """DQN agent holding a local and a target Q-network plus a replay buffer.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        seed: Seed forwarded to Python's ``random``, both networks and the buffer.
    """

    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # q_local is trained directly; q_target only follows it via soft_update.
        self.q_local = QNetwork(state_size, action_size, seed)
        self.q_target = QNetwork(state_size, action_size, seed)
        self.optimizer = optim.Adam(self.q_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Step counter modulo UPDATE_EVERY; learning is triggered when it wraps to 0.
        self.t_size = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a sampled batch."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_size = (self.t_size + 1) % UPDATE_EVERY
        # The counter lives on the instance; a bare `t_size` would be a NameError.
        if self.t_size == 0 and len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample())

    def act(self, state, epsilon):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: NumPy array holding the current state.
            epsilon: Probability threshold for choosing a random action.

        Returns:
            The index of the chosen action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0)  # add a batch dimension
        self.q_local.eval()
        with torch.no_grad():
            action_values = self.q_local(state)
        self.q_local.train()  # restore training mode for subsequent learn() calls

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma=GAMMA):
        """Run one gradient step on the local network and soft-update the target.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors, as produced by the replay buffer's ``sample``.
            gamma: Discount factor applied to the bootstrapped next-state value.
        """
        states, actions, rewards, next_states, dones = experiences

        # TD target: reward plus the discounted best next-state value from the
        # target network; (1 - dones) removes the bootstrap on terminal steps.
        Q_target_next = self.q_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_target = rewards + Q_target_next * gamma * (1 - dones)

        # Q-value the local network currently assigns to the action actually taken.
        Q_expected = self.q_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.q_local, self.q_target)

    def soft_update(self, local_model, target_model, tau=TAU):
        """Blend parameters in place: target = tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)



class ReplayBuffer:
    """Fixed-size FIFO store of experience tuples with uniform random sampling.

    Args:
        action_size: Number of discrete actions (stored for reference only).
        buffer_size: Maximum number of experiences kept; older ones are dropped.
        batch_size: Number of experiences returned by each ``sample`` call.
        seed: Seed passed to Python's ``random`` module.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.batch_size = batch_size
        self.seed = random.seed(seed)

        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        # All five fields must be supplied; omitting `reward` makes the
        # namedtuple constructor raise TypeError.
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Draw ``batch_size`` experiences uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors,
            each with one row per sampled experience. ``actions`` is int64 so it
            can index with ``gather``; the rest are float32 so they match the
            default dtype of ``nn.Linear`` weights and support ``1 - dones``.
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        states = torch.from_numpy(np.vstack([e.state for e in batch])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long()
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float()
        # Booleans are converted to uint8 first so the result is a plain 0/1 mask.
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float()

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

In [10]:
# Launch the Unity environment from the executable at the given path.
env = UnityEnvironment(file_name="/Banana.x86_64")

# Use the first brain exposed by the environment.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode to read the action count and the length of an
# observation vector; both are needed to size the agent's networks.
env_info = env.reset(train_mode=True)[brain_name]
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]  # observation of the first agent
state_size = len(state)

agent = Agent(state_size, action_size, seed = 1000)

def training(n_episodes=100,max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level ``agent`` on the module-level ``env`` with epsilon-greedy DQN.

    Args:
        n_episodes: Number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        eps_start: Initial epsilon for action selection.
        eps_end: Lower bound on epsilon.
        eps_decay: Multiplicative decay applied to epsilon after every episode.

    Returns:
        List with the total reward collected in each episode.
    """
    scores = []
    scores_window = deque(maxlen=100)  # most recent scores for the running average
    eps = eps_start

    for i_episode in range(n_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            # Advance the environment with the chosen action; resetting here
            # would restart the episode on every step and ignore the action.
            env_info = env.step(int(action))[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            # Feed the transition to the agent so it is stored and learned from.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        scores_window.append(score)
        eps = max(eps_end, eps_decay*eps)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
    return scores

# Run training with the default hyperparameters; `score` receives the list of
# per-episode scores returned by training().
score = training()
# NOTE(review): the string literal below is disabled scratch code and is never
# executed. It looks like a single-episode rollout loop; if it is revived,
# `agent.act(action_size)` does not match act(state, epsilon) — confirm intent.
'''
while True:
    action = agent.act(action_size)
    env_info = env.step(action)[brain_name]
    next_state = env_info.vector_observations[0]
    reward = env_info.rewards[0]
    done = env_info.local_done[0]
    score += reward
    state = next_state
    if done:
        break
print(score)
'''

PermissionError: ignored

In [0]:
!cd /

In [15]:
cd

/root


In [0]:
ls

In [0]:
!ls