In [1]:
import numpy as np
import pickle
import gym
import time
# Confirmation message: reaching this line means every import above succeeded.
print("Everything imported correctly")

Everything imported correctly


In [2]:
# Build the Pong environment from gym's registry; `env` is reused by the training loop.
env = gym.make("Pong-v0")
# Reset once to obtain an initial frame (`observation` is not referenced again in the visible cells).
observation = env.reset()
# Display the observation space; the recorded output is Box(210, 160, 3).
print(env.observation_space)

Box(210, 160, 3)


In [3]:
#hyperparameter
hidden_neurons = 200        # number of units in the hidden layer (rows of W1)
learning_rate = 0.0003      # step size used by the RMSProp update in the training loop
gamma_rate = 0.99           # discount factor applied in discounted_rewards
decay_rate = 0.90           # decay of the RMSProp running average of squared gradients
input_dimension = 80*80     # length of the flattened, preprocessed frame
resume = True               # True: continue from the pickled checkpoint; False: start fresh
if resume:
    # NOTE: pickle.load can execute arbitrary code -- only load checkpoints you created yourself.
    # The context managers make sure the file handles are closed once loading finishes.
    with open('save.p','rb') as model_file:
        model = pickle.load(model_file)
    with open('rewards.p','rb') as rewards_file:
        episode_reward_dict = pickle.load(rewards_file)
else:
    # Scale the random weights by 1/sqrt(fan_in) so early activations stay in a moderate range.
    model = {'W1': np.random.randn(hidden_neurons,input_dimension) /np.sqrt(input_dimension),
             'W2': np.random.randn(hidden_neurons) / np.sqrt(hidden_neurons)
            }
    # Maps episode index -> total reward collected in that episode.
    episode_reward_dict={}

In [4]:
def sigmoid(x):
    """Logistic function: map a scalar or numpy array elementwise into (0, 1)."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [5]:
def preprocess(frame):
    """Turn a raw 210x160x3 frame into a flat binary vector of length 80*80.

    Steps: crop rows 35..194, keep every second row/column of channel 0,
    zero out the two background values (144 and 109) and set everything
    else to 1.

    Parameters:
        frame: array indexed as (row, column, channel); it is NOT modified.

    Returns:
        1-D float64 array containing only 0.0 and 1.0.
    """
    # Slicing yields a view into the caller's array; copy it so the masking
    # below does not write back into the input frame.
    frame = frame[35:195:2, ::2, 0].copy()
    frame[frame==144] = 0
    frame[frame==109] = 0
    frame[frame!=0]   = 1
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype and works on every version.
    return frame.astype(np.float64).ravel()

In [6]:
#RMSProp
# Three zero-initialised dictionaries keyed like `model`, each holding an array
# with the shape of the matching weight array:
#   gradient_buffer    - gradients summed over the episodes of the current batch
#   expected_g_squared - running average of squared gradients
#   g_dict             - gradient applied in the most recent update
gradient_buffer, expected_g_squared, g_dict = {}, {}, {}
for layer_name in model:
    expected_g_squared[layer_name] = np.zeros_like(model[layer_name])
    gradient_buffer[layer_name] = np.zeros_like(model[layer_name])
    g_dict[layer_name] = np.zeros_like(model[layer_name])

In [7]:
# Per-step records for the episode in progress; the training loop appends to
# these on every step and resets them once the episode ends.
hidden_layers = []
observations = []
logps = []
rewards = []

In [8]:
def forward_propagation(x):
    """Run the two-layer policy network on one input vector.

    Parameters:
        x: input vector multiplied by model["W1"].

    Returns:
        (h, logp, p): the ReLU hidden activations, the pre-sigmoid output
        score, and the sigmoid of that score (the training loop treats it as
        the probability of moving UP).
    """
    pre_activation = np.dot(model["W1"], x)
    # ReLU: negative pre-activations become zero.
    hidden = np.where(pre_activation < 0, 0, pre_activation)
    score = np.dot(model["W2"], hidden)
    probability = 1.0 / (1.0 + np.exp(-score))
    return hidden, score, probability

In [9]:
def backward_propagation(episode_observation,episode_hidden_layer,episode_logp):
    """Compute the policy-gradient weight gradients for one episode.

    Parameters:
        episode_observation: per-step network inputs stacked row-wise.
        episode_hidden_layer: per-step hidden activations (post-ReLU) stacked row-wise.
        episode_logp: per-step output gradient signal, one row per step
            (the training loop passes advantage * (label - probability)).

    Returns:
        dict with keys "W1" and "W2" holding the gradients for the matching
        entries of `model`.
    """
    gradient_w2 = np.dot(episode_hidden_layer.T,episode_logp).ravel()
    gradient_hidden = np.outer(episode_logp,model["W2"])
    # Backprop through ReLU: units that were inactive pass no gradient.
    # The stored activations are already clipped at zero, so inactive units are
    # exactly 0 and the test has to be `<= 0`; a strict `< 0` never matches
    # and would let gradient leak through every inactive unit.
    gradient_hidden[episode_hidden_layer<=0]=0
    gradient_w1 = np.dot(gradient_hidden.T,episode_observation)
    return {"W1":gradient_w1,"W2":gradient_w2}

In [10]:
def discounted_rewards(episode_reward):
    """Compute discounted returns by walking the reward array backwards.

    The running total restarts at every non-zero reward, so discounting never
    carries across such a step; between them the total is multiplied by
    `gamma_rate` at each step.

    Parameters:
        episode_reward: numpy array with one reward per step.

    Returns:
        Array with the same shape and dtype as `episode_reward`.
    """
    result = np.zeros_like(episode_reward)
    accumulated = 0
    for step in range(episode_reward.size - 1, -1, -1):
        step_reward = episode_reward[step]
        if step_reward != 0:
            # A non-zero reward resets the running total.
            accumulated = 0
        accumulated = accumulated * gamma_rate + step_reward
        result[step] = accumulated
    return result

In [11]:
def normalized_discounted_rewards(episode_reward):
    """Return the discounted returns shifted to zero mean and scaled to unit std.

    Parameters:
        episode_reward: numpy array with one reward per step.

    Returns:
        The array produced by `discounted_rewards`, standardised in place.
    """
    standardized = discounted_rewards(episode_reward)
    standardized -= standardized.mean()
    standardized /= standardized.std()
    return standardized

In [12]:
# Training loop: play episodes, accumulate policy gradients, and apply an
# RMSProp update once every `batch_size` episodes.
total_episodes = 800000
batch_size=10
for episode in range(0,total_episodes):
    curr_state = env.reset()
    prev_state = np.zeros_like(curr_state)
    reward_sum=0
    while True:
        # The network input is the preprocessed difference of consecutive raw frames.
        # NOTE(review): the subtraction happens on the raw frames before
        # preprocessing; if they are uint8 the difference wraps around and the
        # binarisation drops its sign -- confirm this is intended. Left as-is
        # so existing checkpoints keep seeing the same input encoding.
        state = curr_state-prev_state
        prev_state = curr_state
        state = preprocess(state)
        observations.append(state)
        hidden_layer,logp,prob_up = forward_propagation(state)
        #action=2 -> UP  action=3 -> DOWN
        # Sample the action from the policy rather than taking the argmax.
        action = 2 if np.random.uniform() < prob_up else 3
        fake_label = 1 if action==2 else 0
        
        hidden_layers.append(hidden_layer)
        # Gradient of the log-likelihood of the sampled action w.r.t. the output score.
        logps.append(fake_label-prob_up)
        
        curr_state, reward, done, info = env.step(action)
        env.render()
        reward_sum += reward
        rewards.append(reward)
        
        
        if done:
            # Stack the per-step records into one array per quantity, then clear the buffers.
            episode_input=np.vstack(observations)
            episode_hidden=np.vstack(hidden_layers)
            episode_logp=np.vstack(logps)
            episode_rewards=np.vstack(rewards)
            hidden_layers,observations,logps,rewards=[],[],[],[]
            # Weight each step's gradient signal by its normalised discounted return.
            discounted_episode_rewards=normalized_discounted_rewards(episode_rewards)
            discounted_episode_rewards*=episode_logp
            gradient=backward_propagation(episode_input,episode_hidden,discounted_episode_rewards)
            for layer_name in gradient.keys():
                gradient_buffer[layer_name]+=gradient[layer_name]
            
            # `episode` is zero-based: test episode+1 so an update happens after
            # every full batch of `batch_size` episodes (testing `episode` itself
            # would trigger the first update after a single episode).
            if (episode+1)%batch_size==0:
                for layer_name in model.keys():
                    g_dict[layer_name]=gradient_buffer[layer_name]
                    expected_g_squared[layer_name]=decay_rate*expected_g_squared[layer_name]+(1-decay_rate)*g_dict[layer_name]**2
                    # Gradient ascent step, scaled per weight by the RMS of recent gradients.
                    model[layer_name]+=learning_rate*g_dict[layer_name]/(np.sqrt(expected_g_squared[layer_name])+1e-5)
                    gradient_buffer[layer_name]=np.zeros_like(g_dict[layer_name])
            
            episode_reward_dict[episode]=reward_sum
            # The reward of the final step tells who scored the last point.
            if reward!=0:
                if reward==1:
                    print("Episode number ",episode," Reward: ",reward_sum," Won")
                else:
                    print("Episode number ",episode," Reward: ",reward_sum," Lost")
            else:
                print("Episode number ",episode," Reward: ",reward_sum)
            if episode%100==0:
                # Checkpoint the weights and the reward history; the context
                # managers flush and close the files immediately.
                with open('save.p','wb') as model_file:
                    pickle.dump(model,model_file)
                with open('rewards.p','wb') as rewards_file:
                    pickle.dump(episode_reward_dict,rewards_file)

           
            break

Episode number  0  Reward:  -8.0  Lost


KeyboardInterrupt: 