# Combining Neural Networks and Reinforcement learning. 
1. Use deep neural networks to estimate the q values to each state/Action pair. Input - State and Output - q(s,a1),q(s,a2) etc. 
2. Stack of consequetive frames are used as input to the network. 

<img src="dqn1.png"/>


Experience replay: 
we store the agents experience in a dataset called replay memory. We use these replay memory samples to train the network. 
using the sequential data to train the network will lead to an ineffiecient network.  

<img src="dqn2.png"/>

Loss calculation in the network: 
Loss is calculated between the result of the network(estimated q value) and the actual bellman equation. 

 


1. Initialize replay memory capacity. 
2. Initialize the network with random weights. 
3. Clone the policy network and call it Target network. 
4. For each episode: 
    1. Initialie the starting state:
        - via exploration-exploitation strategy. 
    2. Execute selected action on emulator. 
    3. Observe reward and next state. 
    4. Store experience in replay memory. 
    5. Sample random batch from replay memory. 
    6. Preprocess states from batch. -- DONT UNDERSTAND  
    7. Pass batch of preprocessed states to policy network. 
    8. Calculate loss between output Q-values and target Q-values:
        - Require a pass to the target network for the next state. 
    9. Gradient descent updates weights in the policy network to minimize loss:
        - After x time steps, weights in the target network are updated using the weights in the policy network. 
 

# Import Libraries/packages

In [4]:
import gym
import math
import random
import numpy as np
import matplotlib
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [5]:
# Setup display
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython: from IPython import display

# Deep Q-Network

In [6]:
# Deep Q-Network

class DQN(nn.Module):
    def __init__(self, img_height, img_width):
        super().__init__()
        
        self.fc1 = nn.Linear(in_features=img_height*img_width*3, out_features=24)
        self.fc2 = nn.Linear(in_features=24, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=2)
        
    def forward(self, t):
        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)
        return t
                   

# Experience Class

In [9]:
#Experience class

Experience = namedtuple('Experience', ('state','action,'next_state','reward'))

SyntaxError: invalid syntax (<ipython-input-9-163cd520d4fa>, line 3)

# Replay Memory

In [10]:
# Replay memory

class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0
        
    def push(self, experience):
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count % self.capacity] = experience
        self.push_count += 1
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def can_provide_sample(self, batch_size):
        return len(self.memory) >= batch_size

# Epsilon-Greedy Strategy

In [11]:
# Epsilon Greedy strategy

class EpsilonGreedyStrategy():
    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay
        
    def get_exploration_rate(self, current_step):
        return self.end + (self.start - self.end) * math.exp(-1. * current_step * self.decay)

# Reinforcement Learning Agent 

In [14]:
#Reinforcement learning agent

class Agent():
    def __init__(self, strategy, num_actions, device):
        self.current_step = 0
        self.strategy = strategy
        self.num_actions = num_actions
        
    def select_action(self, state, policy_net):
        rate = strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        
        if rate > random.random():
            action = random.randrange(self.num_actions)
            return torch.tensor([action]).to(device) # explore
        else:
            with torch.no_grad():
                return policy_net(state).argmax(dim=1).to(device) # exploit