## Reinforcement Learning 
**Some Common Terms**:
- Agent
- Environment
- Action, Rewards, Observations

## Working on OpenAI Gym 
##### 1. Interacting with the Gym API

In [1]:
#!pip install gym

In [2]:
import gym

In [3]:
# Create the CartPole-v0 environment; `env` is reused by the cells below.

env = gym.make('CartPole-v0')

##### The environment comes with several important methods and attributes:
- action_space
- observation_space
- reset() : returns initial state and also resets the environment
- step()
- render()

In [4]:
# Reset the environment; the value displayed below is the initial state of the game.
env.reset()

array([-0.03425547, -0.00488432,  0.00026142, -0.03037539])

In [16]:
# Drive the environment with random actions for 1000 steps, rendering each frame.
# Start from a fresh episode so this cell does not depend on an earlier reset.
env.reset()
try:
    for t in range(1000):
        # step() returns (observation, reward, done, info); only `done` is needed here
        observation, reward, done, other_info = env.step(env.action_space.sample()) # taking random action
        env.render()
        if done:
            # The episode has ended; stepping further without a reset is not valid.
            env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()



In [10]:
# The action space: the set of actions that can be sampled and passed to env.step().
env.action_space

Discrete(2)

In [11]:
# Number of discrete actions available in the action space.
env.action_space.n

2

In [12]:
# The observation space: bounds, shape and dtype of the environment's observations.
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [17]:
# Length of one observation vector (the agent below is built with state_size=4).
env.observation_space.shape[0]

4

In [15]:
# Inspect the concrete class of the observation space object.
type(env.observation_space)

gym.spaces.box.Box

In [18]:
# Same random-policy demo, with the sampled action named explicitly.
# Start from a fresh episode so this cell does not depend on an earlier reset.
env.reset()
try:
    for t in range(1000):
        random_action = env.action_space.sample()
        # step() returns (observation, reward, done, info); only `done` is needed here
        observation, reward, done, other_info = env.step(random_action) # randomly move left or right
        env.render()
        if done:
            # The episode has ended; reset before taking further steps.
            env.reset()
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()

##### 2. Playing Games with a Random Strategy
- Game Episode
- Step() Function in More Detail
- Game Over?

In [22]:
# Play 20 episodes with a purely random policy, capping each episode at 50 steps.
try:
    for e in range(20):
        observation = env.reset()
        for t in range(50):
            env.render()
            action = env.action_space.sample()
            # step() returns four things: the new observation, the reward for this
            # step, a flag telling whether the game is over, and extra debug info
            observation,reward,done,other_info = env.step(action)

            if done: # if done is true then the game is over
                break

        # Report every episode, including those that reach the 50-step cap without
        # the game ending (previously such episodes printed nothing).
        print("Game Episode : {}/{} High Score :{}".format(e,20,t))
finally:
    # Always release the render window, even if the cell is interrupted.
    env.close()
print("All 20 episodes over!")

Game Episode : 0/20 High Score :14
Game Episode : 1/20 High Score :40
Game Episode : 2/20 High Score :18
Game Episode : 3/20 High Score :23
Game Episode : 4/20 High Score :20
Game Episode : 5/20 High Score :14
Game Episode : 6/20 High Score :10
Game Episode : 7/20 High Score :9
Game Episode : 8/20 High Score :21
Game Episode : 9/20 High Score :33
Game Episode : 10/20 High Score :13
Game Episode : 11/20 High Score :11
Game Episode : 12/20 High Score :24
Game Episode : 13/20 High Score :18
Game Episode : 14/20 High Score :18
Game Episode : 15/20 High Score :19
Game Episode : 16/20 High Score :15
Game Episode : 17/20 High Score :22
Game Episode : 18/20 High Score :27
Game Episode : 19/20 High Score :26
All 20 episodes over!


### 3. Q-Learning
#### Agent Design and Neural Model

In [None]:
"""
Bellman equation:-
   Q(s,a) = r + gamma * max_a'(Q(s',a'))  ; r = reward, gamma = discount factor (hyperparameter) in (0,1)
"""

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os 
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
%matplotlib inline

In [None]:
class Agent:
    """Deep Q-Learning agent with an epsilon-greedy policy and a replay buffer.

    The agent keeps a bounded memory of past transitions and a small dense
    network that maps a state to one Q-value per action.

    Parameters
    ----------
    state_size : int
        Length of the state vector fed to the network.
    action_size : int
        Number of discrete actions (one network output per action).
    """

    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest experiences are dropped first
        self.gamma = 0.95 # Discount factor
        # Exploration vs Exploitation Tradeoff
        # Exploration : Good in the beginning --> helps you to try various random things
        # Exploitation : Sample good experience from the past (memory) --> good in the end
        self.epsilon = 1.0 # 100% random exploration in the beginning (Epsilon Greedy Method)
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the Q-network: state_size inputs -> 24 -> 24 -> action_size outputs."""
        model = Sequential()
        # Layer sizes follow the constructor arguments instead of hard-coded 4 / 2.
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        #model.summary()
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to the replay buffer."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for `state` using the epsilon-greedy rule.

        With probability `epsilon` a random action index is returned; otherwise
        the index of the largest predicted Q-value is returned.
        """
        if np.random.rand() <= self.epsilon:
            # Explore: take a random action
            return random.randrange(self.action_size)

        # Exploit: ask the neural network for the most suitable action
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def train(self,batch_size=32):
        """Fit the network on a random minibatch drawn from the replay buffer.

        Does nothing when fewer than `batch_size` experiences are stored.
        After a training pass, `epsilon` is decayed while it is above `epsilon_min`.
        """
        if len(self.memory) < batch_size:
            # Not enough experience to sample from yet.
            return

        minibatch = random.sample(self.memory,batch_size)
        for experience in minibatch:
            state,action,reward,next_state,done = experience
            # X, Y : state, expected reward

            if not done:
                # Game is not yet over: Bellman equation approximates the target value.
                # np.amax gives the best next Q-VALUE (np.argmax would give its index).
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma*np.amax(next_q)
            else:
                target = reward

            # Only the Q-value of the action actually taken is moved towards the target.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            # X = state , Y = target_f
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay  # after gaining more experience, trust experience more than randomness

    def load(self,name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)
            

#### Training the DQN Agent (Deep Q-Learner)

In [23]:
# Number of training episodes to play.
n_episodes = 1000
# Path prefix under which the training loop saves weight files.
output_dir = "carpole_model/"

In [None]:
# Build the agent with a 4-element state vector and 2 possible actions.
agent = Agent(state_size=4,action_size=2)
# Game-over flag; overwritten by the value env.step() returns in the training loop.
done = False

In [None]:
# Train the DQN agent: play n_episodes games, store every transition in the
# agent's replay buffer and fit the network on a random minibatch after each step.
batch_size = 32
os.makedirs(output_dir, exist_ok=True)  # make sure the checkpoint directory exists

try:
    for e in range(n_episodes):
        state = env.reset()
        state = np.reshape(state,[1,agent.state_size])  # the network expects a (1, state_size) batch

        for t in range(500):
            env.render()
            action = agent.act(state) # action is 0 or 1
            next_state,reward,done,other_info = env.step(action)
            # Penalise the move that ended the game so the agent learns to avoid it.
            reward = reward if not done else -10
            next_state = np.reshape(next_state,[1,agent.state_size])
            agent.remember(state,action,reward,next_state,done) # Experience for the agent
            state = next_state  # advance: the next decision must use the new state

            if done:
                print("Game Episode : {}/{} High Score :{} Exploration Rate:{:.2}".format(e,n_episodes,t,agent.epsilon))
                break

            if len(agent.memory)>batch_size:
                agent.train(batch_size)

        # Checkpoint once per 50 episodes (not on every step of those episodes).
        if e%50==0:
            agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
finally:
    # Always release the render window, even if training is interrupted.
    env.close()

print("Deep Q-Learner Model Trained")

In [None]:
# Build more games from OpenAI Gym