In [10]:
import pygame
import math as m
import os
import numpy as np
import tensorflow as tf

In [11]:
x ,y = 90,51
vel = 0
rad = 0

In [12]:
def intersection(p1,p2,p3,p4):
    rhs = np.array(p4 - p2)
    lhs = np.array([ p1 - p2, p4 - p3 ])
    lhs = np.transpose(lhs)
    try:
        res = np.linalg.solve(lhs,rhs)
    except:
        res = [ float('inf'), float('inf') ]
    return res


def distancelineshape( p1, p2, shape ):
    for i in range(len(shape)):
        nexti  = ( i + 1 ) % len(shape)
        alphas = intersection(p1,p2,shape[i],shape[nexti])
        if(alphas[0]==float('inf') or alphas[0]>1):
            return float('inf')
        else:
            return np.linalg.norm(p1-p2)*(1-alphas[0])
    return float('inf')
    
def detect_coll(shape1, shape2):
    for i in range(len(shape1)):
        nexti = ( i + 1 ) % len(shape1)
        for j in range(len(shape2)):            
            nextj   = ( j + 1 ) % len(shape2)
            alphas  = intersection(shape1[i],shape1[nexti],shape2[j],shape2[nextj])
            if( max(alphas) <= 1 and min(alphas) >= 0 ):
                return True
    return False


            

In [13]:
class Agent:
    def __init__(self, num_actions, numInp):
        initializer = tf.contrib.layers.xavier_initializer()
        
        self.input_layer = tf.placeholder(dtype=tf.float32, shape=[None, numInp])
        
        
        hidden_layer = tf.layers.dense(self.input_layer, 12, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_2 = tf.layers.dense(hidden_layer, 12, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_3 = tf.layers.dense(hidden_layer_2, 12, activation=tf.nn.relu, kernel_initializer=initializer)
        hidden_layer_4 = tf.layers.dense(hidden_layer_3, 12, activation=tf.nn.relu, kernel_initializer=initializer)
        out = tf.layers.dense(hidden_layer_4, num_actions, activation=None)
        
        self.outputs = tf.nn.softmax(out)
        self.choice = tf.argmax(self.outputs, axis=1)
        
        self.rewards = tf.placeholder(shape=[None, ], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None, ], dtype=tf.int32)
        
        one_hot_actions = tf.one_hot(self.actions, num_actions)
        
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=one_hot_actions)
        
        self.loss = tf.reduce_mean(cross_entropy * self.rewards)
        
        self.gradients = tf.gradients(self.loss, tf.trainable_variables())
        
        self.gradients_to_apply = []
        for index, variable in enumerate(tf.trainable_variables()):
            gradient_placeholder = tf.placeholder(tf.float32)
            self.gradients_to_apply.append(gradient_placeholder)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        self.update_gradients = optimizer.apply_gradients(zip(self.gradients_to_apply, tf.trainable_variables()))
        

In [14]:

def run_game(control):
    done = False
    global x 
    global y
    global rad
    global vel
    global rewards
    global track1
    global track2
    global reward_added
    reward = 0    

    points = []
    points.append([x + m.cos(rad) * 10, y + m.sin(rad)*10])
    points.append([x + m.cos(rad + m.pi * 6/7) * 10, y + m.sin(rad + m.pi*6/7)*10])
    points.append([x + m.cos(rad + m.pi*8/7) * 10, y + m.sin(rad+m.pi*8/7)*10])
    points = np.array(points)
    
    for event in pygame.event.get():
        if(event.type == pygame.QUIT):
            run = False
    keys= pygame.key.get_pressed()
    if(control == 0):
        rad -= 0.2
    if(control == 2):
        rad +=0.2
    if(control == 1):
        vel +=2
    if(keys[pygame.K_q]):
        pygame.quit()
    x +=m.cos(rad) * vel
    y +=m.sin(rad) * vel
    vel *=0.85
    win.fill((255,255,255))
    
    pygame.draw.polygon(win, (255,0,0), points)
    if(detect_coll(points, track1) or detect_coll(points, track2)):
        x ,y = 90,50
        vel = 0
        rad = 0
        done = True
    i = 0
    reward += vel
    while (i<len(rewards)):
        if(detect_coll(points, np.array(rewards[i]))):
            reward += reward_added
            reward_added +=100*vel
            del rewards[i]
            i-=1
        i+=1
            
    
    distances = []
    
    num_dist = 8
    for i in range (num_dist):
        dist1= distancelineshape(np.array([x,y]),np.array([x + 1000*m.cos(rad + m.pi*i/num_dist * 2), y + 1000*m.sin(rad+m.pi*i/num_dist * 2)]),track1)
        dist2= distancelineshape(np.array([x,y]),np.array([x + 1000*m.cos(rad + m.pi*i/num_dist * 2), y + 1000*m.sin(rad+m.pi*i/num_dist * 2)]),track2)
        distances.append(min(dist1,dist2))
        
    pygame.draw.lines(win, (100,100,100), True,track1 ,1)
    pygame.draw.lines(win, (100,100,100), True,track2 ,1)
    pygame.display.update()
    return distances, reward, done
pygame.quit()

In [15]:
discount_rate = 0.95
def discount_normalize_rewards(rewards):
    discounted_rewards = np.zeros_like(rewards)
    total_rewards = 0
    
    for i in reversed(range(len(rewards))):
        total_rewards = total_rewards*discount_rate + rewards[i]
        discounted_rewards[i] = total_rewards
    
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)
    return(discounted_rewards)

In [7]:
# TODO Create the training loop
tf.reset_default_graph()
num_actions = 3
state_size = 8
path = "./CarGame-pg/"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
training_episodes = 1000
max_steps_per_episode = 1000
episode_batch_size = 100

agent = Agent(num_actions,state_size)

init = tf.global_variables_initializer()



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [8]:
#distances,reward,done = run_game(-1)
#with tf.Session(config=config) as sess:
    #pygame.init()
    #sess.run(init)
    #action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [distances]})
    #action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])
    #distances_new, reward, done = run_game(action_choice)

In [9]:


saver = tf.train.Saver(max_to_keep=2)

if not os.path.exists(path):
    os.makedirs(path)

with tf.Session(config=config) as sess:
    sess.run(init)
    
    total_episode_rewards = []
    gradient_buffer = sess.run(tf.trainable_variables())
    for index, gradient in enumerate(gradient_buffer):
        gradient_buffer[index] = gradient * 0
    episode = 0
    while True:
        #pygame.init()
        track1 = np.array([[70, 380],[70, 325],[70, 270],[80, 165],[65, 132],[50, 100],[50,60],[50, 20],[240, 20],[280, 100],[320, 180],[320, 220],[320, 260],[280, 380],[70, 380]])
        track2 = np.array([[120, 330],[120,300],[120, 270],[130, 165],[115, 132],[100, 100],[100, 85],[100,  70],[145, 70],[190,  70],[230, 125],[270, 180],[270, 260],[230, 330],[120, 330]])
        rewards = list(map(lambda i, j:[list(i),list(j)], track1, track2))
        reward_added = 400
        #win = pygame.display.set_mode((500,500))
        #pygame.display.set_caption("Car Game")
        x = 90
        y = 51
        rad = 0
        vel = 1
        distances,reward,done = run_game(-1)
        
        episode_history = []
        episode_rewards = 0
        
        for step in range(max_steps_per_episode):
            
           
            
            action_probabilities = sess.run(agent.outputs, feed_dict={agent.input_layer: [distances]})
            action_choice = np.random.choice(range(num_actions), p=action_probabilities[0])
            distances_new, reward, done = run_game(action_choice)
#            print(distances_new)
            episode_history.append([distances, action_choice, reward, distances_new])
            distances = distances_new
            episode_rewards += reward
            
            if done or step + 1 == max_steps_per_episode:
                total_episode_rewards.append(episode_rewards)
                episode_history = np.array(episode_history)
                ep_gradients = sess.run(agent.gradients, feed_dict={agent.input_layer: np.vstack(episode_history[:, 0]),
                                                                    agent.actions: episode_history[:, 1],
                                                                    agent.rewards: episode_history[:, 2]})
                # add the gradients to the grad buffer:
                for index, gradient in enumerate(ep_gradients):
                    gradient_buffer[index] += gradient
                pygame.quit()
                break
            
        if episode % episode_batch_size == 0:
        
            feed_dict_gradients = dict(zip(agent.gradients_to_apply, gradient_buffer))
            
            sess.run(agent.update_gradients, feed_dict=feed_dict_gradients)
            
            for index, gradient in enumerate(gradient_buffer):
                gradient_buffer[index] = gradient * 0
                
            if episode % 1000 == 0:
                saver.save(sess, path + "pg-checkpoint", episode)
                print("Average reward / 100 eps: " + str(np.mean(total_episode_rewards[-100:])))
        episode+=1
    print("END")

error: video system not initialized

In [17]:
x = 90
y = 51
rad = 0
vel = 1
win = pygame.display.set_mode((500,500))
track1 = np.array([[70, 380],[70, 325],[70, 270],[80, 165],[65, 132],[50, 100],[50,60],[50, 20],[240, 20],[280, 100],[320, 180],[320, 220],[320, 260],[280, 380],[70, 380]])
track2 = np.array([[120, 330],[120,300],[120, 270],[130, 165],[115, 132],[100, 100],[100, 85],[100,  70],[145, 70],[190,  70],[230, 125],[270, 180],[270, 260],[230, 330],[120, 330]])
rewards = list(map(lambda i, j:[list(i),list(j)], track1, track2))
reward_added = 400
reward = 0
while True:
    choice = -1
    pygame.init()
    events = pygame.event.get()
    for event in events:
        if event.type == pygame.KEYDOWN:
            if event.key == pygame.K_LEFT:
                choice = 2
            if event.key == pygame.K_RIGHT:
                choice = 0
            if event.key == pygame.K_UP:
                choice = 1
    x = run_game(choice)
    reward+=x[1]
    
    if(not x[2]):
        print(reward)
        break
        

TypeError: can only concatenate tuple (not "float") to tuple