In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def build_policy_network():
    """Create one untrained policy network for a Pong paddle.

    The network is a stack of three Dense layers: 6 ReLU units, 10 ReLU
    units, and a final layer of 2 units with no activation, so the outputs
    are raw logits (one per action).

    Returns:
        A new `tf.keras.models.Sequential` instance with its own weights.
    """
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(6, activation="relu"),
        tf.keras.layers.Dense(10, activation="relu"),
        tf.keras.layers.Dense(2, activation=None)
    ])


# One independent network per paddle. The training loop further down uses both
# `modelA` and `modelB`, so both have to exist after a fresh "Run All".
# NOTE(review): modelB is assumed to share modelA's architecture - confirm.
modelA = build_policy_network()
modelB = build_policy_network()

In [4]:
# Network input (6 features): paddle.xcor, paddle.ycor, ball.xcor, ball.ycor, ball.dx, ball.dy
# 2 hidden Dense layers (6 and 10 units, ReLU)
# Output layer (2 logits): action 0 = down, action 1 = up

In [5]:
# Step size handed to the Adam optimizer.
learning_rate = 0.001
# Number of episodes the training loop plays.
MAX_ITERS = 500

In [6]:
# Adam optimizer configured with the notebook-level learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [7]:
def choose_action(model, observation):
    """Sample an action index from the policy's softmax distribution.

    Args:
        model: Callable policy network. It is called with a batch of one
            observation and must return one row of logits, one per action.
        observation: 1-D array-like feature vector for the current frame.

    Returns:
        The sampled action index (an integer in ``[0, n_actions)``), where
        ``n_actions`` is the number of logits the model produced.
    """
    # The network expects a batch, so turn (features,) into (1, features).
    observation = np.expand_dims(observation, axis=0)

    # Call the model directly: for a single small input this avoids the
    # per-call overhead of `predict`, which is noticeable once per frame.
    logits = np.asarray(model(observation)).astype(np.float64).ravel()

    # Numerically stable softmax: subtracting the max keeps exp() from
    # overflowing without changing the resulting probabilities.
    exp_logits = np.exp(logits - np.max(logits))
    prob_weights = exp_logits / np.sum(exp_logits)

    # Derive the number of actions from the logits instead of hard-coding 2.
    action = np.random.choice(prob_weights.size, size=1, p=prob_weights)[0]
    return action

In [8]:
class Memory:
    """Buffer of recorded steps kept as three parallel lists.

    Attributes:
        observations: recorded observations, in insertion order.
        actions: recorded actions, aligned with `observations`.
        rewards: recorded rewards, aligned with `observations`.
    """

    def __init__(self):
        # A new buffer starts out empty.
        self.clear()

    def clear(self):
        """Forget everything recorded so far by rebinding fresh empty lists."""
        self.observations, self.actions, self.rewards = [], [], []

    def add_to_memory(self, new_observation, new_action, new_reward):
        """Append one (observation, action, reward) step to the buffer."""
        step = (
            (self.observations, new_observation),
            (self.actions, new_action),
            (self.rewards, new_reward),
        )
        for target, value in step:
            target.append(value)

In [9]:
def normalize(x):
    """Shift `x` to zero mean and scale it to unit standard deviation.

    Args:
        x: Array-like of numbers.

    Returns:
        A NumPy array with the same number of elements as `x`. When every
        element is identical (including the single-element case) the standard
        deviation is zero; the centered values (all zeros) are then returned
        unscaled instead of dividing by zero and producing NaN. Empty input
        yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalize; avoid the mean-of-empty-slice NaN/warning.
        return x.astype(np.float64)

    x = x - np.mean(x)
    std = np.std(x)
    # Guard the division: a zero spread would turn every entry into NaN,
    # which then poisons any gradient computed from the result.
    if std > 0:
        x = x / std
    return x

In [10]:
def discount_rewards(rewards, gamma=0.99, reset_on_nonzero=False):
    """Compute normalized discounted returns for one recorded trajectory.

    For each step t the return is ``R_t = rewards[t] + gamma * R_{t+1}``,
    accumulated from the last step backwards. The result is passed through
    `normalize` before being returned.

    Args:
        rewards: Sequence of per-step rewards, oldest first.
        gamma: Discount factor applied per step.
        reset_on_nonzero: When True, the running return is reset to zero at
            every step whose reward is non-zero. That is only meaningful for
            sparse rewards where a non-zero value marks the end of a game.
            This notebook's reward is non-zero on almost every step, so
            resetting there would cancel the discounting entirely; hence the
            default is False.

    Returns:
        The value returned by `normalize` for the array of discounted returns.
    """
    # Work in float64 explicitly: `np.zeros_like` on an all-integer reward
    # list would create an integer array and truncate the discounted values.
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted_rewards = np.zeros_like(rewards)
    R = 0.0
    for t in reversed(range(len(rewards))):
        if reset_on_nonzero and rewards[t] != 0:
            R = 0.0
        # Fold this step's reward into the discounted running total.
        R = R * gamma + rewards[t]
        discounted_rewards[t] = R

    return normalize(discounted_rewards)

In [11]:
def compute_loss(logits, actions, rewards):
    """Reward-weighted cross-entropy loss for a batch of recorded steps.

    Args:
        logits: Network outputs, passed as `logits` to the sparse softmax
            cross-entropy op.
        actions: Chosen action indices, passed as `labels` to the same op.
        rewards: Per-step weights multiplied into the per-step cross-entropy.

    Returns:
        The mean of (cross-entropy * reward) over all elements.
    """
    # Per-step cross-entropy between the policy's logits and the action taken.
    per_step_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits
    )
    weighted = per_step_nll * rewards
    return tf.reduce_mean(weighted)

In [12]:
def train_step(model, optimizer, observations, actions, discounted_rewards):
    """Apply one gradient update to `model` from a batch of recorded steps.

    Args:
        model: Network to update; called on `observations` to get logits.
        optimizer: Object whose `apply_gradients` receives (gradient, variable)
            pairs for the model's trainable variables.
        observations: Batch of observations fed to the model.
        actions: Actions forwarded to `compute_loss`.
        discounted_rewards: Per-step weights forwarded to `compute_loss`.

    Returns:
        None. The model's trainable variables are updated in place.
    """
    # Record the forward pass and the loss so they can be differentiated.
    with tf.GradientTape() as recorder:
        predictions = model(observations)
        objective = compute_loss(predictions, actions, discounted_rewards)

    # Read the variables after the forward pass has run.
    weights = model.trainable_variables
    gradients = recorder.gradient(objective, weights)
    optimizer.apply_gradients(zip(gradients, weights))

In [13]:
def paddle_a_up():
    """Raise the global `paddle_a` by 40 units while its y is below 275."""
    current_y = paddle_a.ycor()
    if current_y < 275:
        paddle_a.sety(current_y + 40)

def paddle_a_down():
    """Lower the global `paddle_a` by 40 units while its y is above -275."""
    current_y = paddle_a.ycor()
    if current_y > -275:
        paddle_a.sety(current_y - 40)

def paddle_b_up():
    """Raise the global `paddle_b` by 40 units while its y is below 275."""
    current_y = paddle_b.ycor()
    if current_y < 275:
        paddle_b.sety(current_y + 40)

def paddle_b_down():
    """Lower the global `paddle_b` by 40 units while its y is above -275.

    The bound is checked against paddle B's own position, so paddle A's
    location has no influence on whether paddle B may move.
    """
    y = paddle_b.ycor()
    if y > -275:
        paddle_b.sety(y - 40)

In [14]:
import turtle
import os
def _make_square_sprite(x, y=0, stretch_wid=None):
    """Create a white square turtle with its pen up, placed at (x, y).

    When `stretch_wid` is given the square is stretched vertically by that
    factor (used to turn the square into a paddle).
    """
    sprite = turtle.Turtle()
    sprite.speed(0)
    sprite.shape("square")
    sprite.color("white")
    if stretch_wid is not None:
        sprite.shapesize(stretch_wid=stretch_wid, stretch_len=1)
    sprite.penup()
    sprite.goto(x, y)
    return sprite


def draw_board():
    """Set up a fresh Pong board and return its screen and sprites.

    Any turtles and drawings left on the screen by an earlier call are deleted
    first. Without that, each call would add three more turtles to the screen
    (resetting a screen keeps its turtles), so they would pile up over many
    episodes.

    Returns:
        A tuple ``(wn, paddle_a, paddle_b, ball)``: the turtle screen, the left
        paddle at (-350, 0), the right paddle at (350, 0), and the ball at
        (0, 0) with velocity attributes ``dx = dy = 15``.
    """
    wn = turtle.Screen()
    # clearscreen() removes all turtles/drawings and restores screen defaults
    # (white background, tracing on), so it must run before the settings below.
    wn.clearscreen()
    wn.title("Pong")
    wn.bgcolor("black")
    wn.setup(width=800, height=600)
    # Disable automatic redraws; the caller repaints with wn.update().
    wn.tracer(0)

    # Paddle A (left) and Paddle B (right)
    paddle_a = _make_square_sprite(-350, stretch_wid=5)
    paddle_b = _make_square_sprite(350, stretch_wid=5)

    # Ball, starting at the centre and moving diagonally
    ball = _make_square_sprite(0)
    ball.dx = 15
    ball.dy = 15

    return wn, paddle_a, paddle_b, ball

In [15]:
# Separate step buffers for the two paddles (A and B).
memoryA, memoryB = Memory(), Memory()

In [16]:
def dist(a, b):
    """Return the Euclidean (L2) distance between points `a` and `b`.

    Args:
        a: Array-like coordinates of the first point.
        b: Array-like coordinates of the second point, same shape as `a`.

    Returns:
        The norm of ``a - b`` as a float.
    """
    # Convert to float arrays so plain lists/tuples work and unsigned integer
    # inputs cannot wrap around when subtracted.
    delta = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.linalg.norm(delta)

In [17]:
from tqdm import tqdm

In [18]:
def reward_func(d, min_distance=1.0):
    """Map a paddle-to-ball distance to a reward of ``800 / d - 1``.

    The reward grows as the distance shrinks and is exactly 0 at d = 800.

    Args:
        d: Distance between paddle and ball.
        min_distance: Lower bound applied to `d` before dividing. Distances
            below it are treated as `min_distance`, which keeps the reward
            finite (no division by zero / inf) when the two coincide.

    Returns:
        The reward as a float.
    """
    return 800 / max(d, min_distance) - 1

In [19]:
# Each network gets its own Adam instance. An optimizer keeps per-variable
# state for the variables it was first applied to, and recent Keras releases
# raise an error when one instance is reused for another model's variables.
optimizerA = optimizer
optimizerB = tf.keras.optimizers.Adam(learning_rate)

for i_episode in tqdm(range(MAX_ITERS)):
    wn, paddle_a, paddle_b, ball = draw_board()
    done = False
    while not done:
        # Repaint manually; draw_board() switched automatic tracing off.
        wn.update()

        # Only the player the ball is travelling toward acts on this frame.
        if ball.dx > 0:
            policy, buffer, paddle = modelB, memoryB, paddle_b
            move_up, move_down = paddle_b_up, paddle_b_down
        elif ball.dx < 0:
            policy, buffer, paddle = modelA, memoryA, paddle_a
            move_up, move_down = paddle_a_up, paddle_a_down
        else:
            policy = None

        if policy is not None:
            observation = np.array([paddle.xcor(), paddle.ycor(),
                                    ball.xcor(), ball.ycor(),
                                    ball.dx, ball.dy])
            action = choose_action(policy, observation)
            if action == 1:
                move_up()
            elif action == 0:
                move_down()

            # Shaped reward: larger the closer the paddle (after its move)
            # is to the ball.
            paddle_pos = np.array([paddle.xcor(), paddle.ycor()])
            ball_pos = np.array([ball.xcor(), ball.ycor()])
            reward = reward_func(dist(paddle_pos, ball_pos))

            # Record the step; without this the buffers stay empty and the
            # training calls after the episode are never reached.
            buffer.add_to_memory(observation, action, reward)

        # Advance the ball by one frame.
        ball.setx(ball.xcor() + ball.dx)
        ball.sety(ball.ycor() + ball.dy)

        # Top and bottom walls: clamp and bounce.
        if ball.ycor() > 290:
            ball.sety(290)
            ball.dy *= -1
        elif ball.ycor() < -290:
            ball.sety(-290)
            ball.dy *= -1

        # Left and right edges: the ball got past a paddle, episode over.
        if ball.xcor() > 350 or ball.xcor() < -350:
            done = True

        # Paddle and ball collisions: reverse the horizontal direction.
        if ball.xcor() < -340 and paddle_a.ycor() - 50 < ball.ycor() < paddle_a.ycor() + 50:
            ball.dx *= -1
        elif ball.xcor() > 340 and paddle_b.ycor() - 50 < ball.ycor() < paddle_b.ycor() + 50:
            ball.dx *= -1

    # Episode finished: update each network from its own recorded steps.
    for policy, buffer, policy_optimizer in (
        (modelA, memoryA, optimizerA),
        (modelB, memoryB, optimizerB),
    ):
        if buffer.observations:
            train_step(
                policy, policy_optimizer,
                # float32 matches the dtype the Dense layers compute in.
                observations=np.vstack(buffer.observations).astype(np.float32),
                actions=np.array(buffer.actions),
                discounted_rewards=np.asarray(discount_rewards(buffer.rewards),
                                              dtype=np.float32),
            )
        buffer.clear()

    turtle.resetscreen()

  4%|▎         | 18/500 [00:27<12:13,  1.52s/it]

TclError: invalid command name ".!canvas"