In [None]:
!pip install pygame



In [6]:
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import random
import pygame, sys
from collections import deque
from IPython.display import clear_output
import time
from pygame.locals import *
from random import randint
from random import choice

In [7]:
class Paddle:
    """A pong paddle that only moves along the y axis.

    Attributes:
        xpos, ypos: current position of the paddle.
        length, width: paddle dimensions.
        speed: distance travelled per call to ``move``.
        number: control scheme identifier that callers pass back into
            ``move`` as ``playernumber``.
    """

    def __init__(self, x, y, l, w, s, n):
        """Store position (x, y), size (l = length, w = width), speed s and number n."""
        self.xpos, self.ypos = x, y
        self.length, self.width = l, w
        self.speed = s
        self.number = n

    def move(self, ypos, action, playernumber):
        """Apply one movement step to the paddle.

        Args:
            ypos: reference position used by the ``playernumber == 1`` bound
                checks only; the update itself is applied to ``self.ypos``.
            action: for ``playernumber == 1`` an object indexed with
                ``pygame.K_UP`` / ``pygame.K_DOWN`` (presumably the pygame
                key-state sequence -- confirm against the caller); for
                ``playernumber == 2`` the integer 0 (decrease ypos) or
                1 (increase ypos). Any other value leaves the paddle in place.
            playernumber: 1 or 2, selecting the control scheme. Other values
                are ignored.
        """
        step = self.speed

        if playernumber == 1:
            # Both keys are checked independently, so both may apply in one call.
            if action[pygame.K_UP]:
                self.ypos = -60 if ypos - step < -60 else self.ypos - step
            if action[pygame.K_DOWN]:
                overshoots = ypos + self.length + step > 780
                self.ypos = 780 - self.length if overshoots else self.ypos + step
        elif playernumber == 2:
            # Discrete control: step first, then clamp into [0, 720].
            if action == 0:
                self.ypos = max(self.ypos - step, 0)
            elif action == 1:
                self.ypos = min(self.ypos + step, 720)

    def continuepos(self):
        """Return the current y position truncated to an int."""
        return int(self.ypos)

In [8]:
class SimpleBall:
    """Pong ball advanced one step at a time.

    The ball bounces off the horizontal walls at y = 0 and y = 720 and off
    the two paddles handed to ``move``; every paddle hit raises the magnitude
    of both velocity components by ``speed_rate``.
    """

    def __init__(self, x, y, xspeed, yspeed, speed_rate, size):
        """Store position (x, y), velocity, per-hit speed increment and size."""
        self.xpos, self.ypos = x, y
        self.xspeed, self.yspeed = xspeed, yspeed
        self.speed_rate = speed_rate
        self.size = size

    def _within_reach(self, paddle):
        """True when ypos lies in [paddle.ypos - size, paddle.ypos + paddle.length]."""
        return paddle.ypos - self.size <= self.ypos <= paddle.ypos + paddle.length

    def _boost_vertical_speed(self):
        """Grow |yspeed| by speed_rate; a zero or negative yspeed is pushed negative."""
        if self.yspeed > 0:
            self.yspeed += self.speed_rate
        else:
            self.yspeed -= self.speed_rate

    def move(self, paddle1, paddle2):
        """Advance the ball by one step.

        Args:
            paddle1: paddle checked when the ball is at the low-x side
                (``xpos < size``); needs ``ypos`` and ``length``.
            paddle2: paddle checked when the ball is at the high-x side
                (``xpos > 740``); needs ``ypos`` and ``length``.

        Returns:
            1 if the ball bounced off a paddle during this step, otherwise 0.
        """
        # Vertical motion: reflect off the walls, otherwise advance.
        next_y = self.ypos + self.yspeed
        if next_y < 0:
            self.ypos = 0
            self.yspeed = -self.yspeed
        elif next_y > 720:
            self.ypos = 720
            self.yspeed = -self.yspeed
        else:
            self.ypos = next_y

        # Horizontal motion: a paddle hit speeds the ball up, reverses it and
        # pins it to the paddle face; otherwise the ball just keeps travelling.
        if self.xpos < self.size and self._within_reach(paddle1):
            self.xspeed = -(self.xspeed - self.speed_rate)
            self.xpos = self.size
            self._boost_vertical_speed()
            return 1

        if self.xpos > 740 and self._within_reach(paddle2):
            self.xspeed = -(self.xspeed + self.speed_rate)
            self.xpos = 740
            self._boost_vertical_speed()
            return 1

        self.xpos += self.xspeed
        return 0

    def continuepos(self):
        """Return the position as an ``(xpos, ypos)`` tuple."""
        return self.xpos, self.ypos

    def continuevel(self):
        """Return the velocity as an ``(xspeed, yspeed)`` tuple."""
        return self.xspeed, self.yspeed

In [9]:
class DiscretePongMDP():
    """A single pong rally exposed as a step-by-step environment.

    ``paddle2`` (x = 20) is driven by the action passed to ``update``;
    ``paddle1`` (x = 740) is driven by a scripted rule that follows the ball.
    The ball starts at the centre with a random direction on each axis.
    """

    def __init__(self, scale):
        """Create both paddles and the ball; ``scale`` multiplies all speeds."""
        self.scale = scale

        paddle_speed = 2 * self.scale
        self.paddle1 = Paddle(740, 300, 120, 20, paddle_speed, 2)
        self.paddle2 = Paddle(20, 300, 120, 20, paddle_speed, 2)

        # Draw the x direction first, then the y direction.
        start_xspeed = choice([-1, 1]) * 2 * self.scale
        start_yspeed = choice([-1, 1]) * 2 * self.scale
        self.pongball = SimpleBall(780 / 2, 720 / 2, start_xspeed, start_yspeed,
                                   0.5 * self.scale, 20)

    def update(self, a):
        """Advance the game by one step.

        Args:
            a: action for ``paddle2``: 0 decreases its ypos, 1 increases it,
                any other value leaves it where it is.

        Returns:
            A tuple ``(state, r, running, hit)`` where ``state`` is
            ``(paddle2 y, ball x, ball y, ball x speed, ball y speed)``,
            ``r`` is -100 when the ball's x drops below 0, +100 when it
            exceeds 780 and 0 otherwise, ``running`` is False once either of
            those happened, and ``hit`` is the value returned by the ball's
            ``move`` (1 on a paddle bounce, else 0).
        """
        controlled = self.paddle2
        scripted = self.paddle1
        ball = self.pongball

        controlled.move(controlled.ypos, a, controlled.number)

        # Scripted paddle: pick action 0 while ypos + 60 is above the ball's
        # y value, otherwise action 1.
        scripted_action = 0 if scripted.ypos + 60 > ball.ypos else 1
        scripted.move(scripted.ypos, scripted_action, scripted.number)

        hit = ball.move(controlled, scripted)

        if ball.xpos < 0:
            reward, running = -100, False
        elif ball.xpos > 780:
            reward, running = 100, False
        else:
            reward, running = 0, True

        xv, yv = ball.continuevel()
        x, y = ball.continuepos()
        p = controlled.continuepos()

        return (p, x, y, xv, yv), reward, running, hit

In [None]:
class DQNAgent:
    """Deep Q-learning agent with an online network, a target network and a
    bounded experience-replay buffer.

    Actions are treated as indices into the network's output vector (see
    ``replay``), so ``actionSpace`` is expected to be ``[0, 1, ..., n-1]``.
    """

    # Maximum number of transitions kept in the replay buffer.
    memoryCapacity = 10000

    def __init__(self, stateSpaceSize, actionSpace):
        """Build both networks and an empty replay buffer.

        Args:
            stateSpaceSize: number of features in a state vector.
            actionSpace: list of available actions.
        """
        self.actionSpace = actionSpace

        # Discount factor applied to the bootstrapped next-state value.
        self.gamma = 0.99

        # Read by the training loop to decide how often to call syncModels().
        self.syncModelsepisodes = 5

        self.model = self.createDqnModel(stateSpaceSize, actionSpace)
        self.targetModel = self.createDqnModel(stateSpaceSize, actionSpace)
        # Start the target network from the online network's weights instead
        # of an unrelated random initialisation.
        self.syncModels()

        # deque(maxlen=...) evicts the oldest transition in O(1) when full.
        self.memory = deque(maxlen=self.memoryCapacity)
        self.batchSize = 32

    def createDqnModel(self, stateSpaceSize, actionSpace):
        """Create and compile a dense network mapping a state to one Q-value per action."""
        inputs = Input(shape=(stateSpaceSize, ))
        x = Dense(128, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        x = Dense(len(actionSpace), activation='linear')(x)
        model = Model(inputs, x)
        model.summary()
        model.compile(loss='mse', optimizer='adam')

        return model

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a random element of ``actionSpace`` is
        returned; otherwise the index of the largest predicted Q-value.
        NOTE: the two branches agree only when ``actionSpace`` is
        ``list(range(n))``, which is how this notebook constructs the agent.
        """
        if np.random.rand() > epsilon:
            qValues = self.model.predict(self.prepState(state), verbose=0)
            return np.argmax(qValues[0])
        return random.choice(self.actionSpace)

    def calcTarget(self, nextState, reward):
        """Return ``reward + gamma * max_a Q_target(nextState, a)`` for one transition."""
        nextNormalState = self.prepState(nextState)
        maxQValue = np.max(self.targetModel.predict(nextNormalState, verbose=0))
        return reward + self.gamma * maxQValue

    def remember(self, state, action, reward, nextState, done):
        """Append a transition to the replay buffer, dropping the oldest one when full."""
        self.memory.append((state, action, reward, nextState, done))

    def prepState(self, state):
        """Scale a 5-element state and return it with shape ``(1, 5)``.

        The elements are divided by 720, 780, 720, 10 and 10 respectively,
        matching the ``(paddle y, ball x, ball y, ball x speed, ball y speed)``
        tuple produced by ``DiscretePongMDP.update``.
        """
        return np.expand_dims(np.asarray([state[0]/720, state[1]/780, state[2]/720, state[3]/10, state[4]/10]), axis=0)

    def replay(self):
        """Fit the online network on one random mini-batch from the buffer.

        Does nothing while the buffer holds fewer than ``batchSize``
        transitions. For each sampled transition, the Q-value of the action
        taken is replaced by ``reward`` (terminal) or
        ``reward + gamma * max Q_target(nextState)`` (non-terminal); the other
        outputs keep the network's current predictions.
        """
        if len(self.memory) < self.batchSize:
            return

        batch = random.sample(self.memory, self.batchSize)

        states = np.concatenate([self.prepState(item[0]) for item in batch], axis=0)
        nextStates = np.concatenate([self.prepState(item[3]) for item in batch], axis=0)
        actions = np.array([item[1] for item in batch], dtype=int)
        rewards = np.array([item[2] for item in batch], dtype=float)
        dones = np.array([item[4] for item in batch], dtype=bool)

        # One forward pass per network for the whole batch rather than two
        # single-row predictions per transition.
        qValues = np.array(self.model.predict(states, verbose=0), dtype=float)
        maxNextQ = np.max(self.targetModel.predict(nextStates, verbose=0), axis=1)

        targets = np.where(dones, rewards, rewards + self.gamma * maxNextQ)
        qValues[np.arange(len(batch)), actions] = targets

        self.model.fit(states,
                       qValues,
                       batch_size=self.batchSize,
                       epochs=1,
                       verbose=0)

    def syncModels(self):
        """Copy the online network's weights into the target network."""
        self.targetModel.set_weights(self.model.get_weights())

def getLearningRate(episode):
    """Return the learning rate for ``episode``.

    The rate starts at 0.8, is multiplied by 0.9992 for every episode and
    never drops below 0.02.
    """
    decayed = 0.8 * np.power(0.9992, episode)
    return np.maximum(decayed, 0.02)

def getExplorationRate(episode):
    """Return the exploration rate (epsilon) for ``episode``.

    Epsilon starts at 1, is multiplied by 0.9995 for every episode and never
    drops below 0.02.
    """
    decayed = np.power(0.9995, episode)
    return np.maximum(decayed, 0.02)

In [15]:
# --- Configuration ----------------------------------------------------------
NumEpisodes = 5000
MaxTimeSteps = 50000  # NOTE(review): not referenced in this cell
tracker = np.zeros((int(NumEpisodes/100)+1,2))  # NOTE(review): never filled in this cell

# Switch TensorFlow to graph (non-eager) mode before the agent's models are created.
tf.compat.v1.disable_eager_execution()

agent = DQNAgent(5, [0, 1])

trainInterval = 10  # NOTE(review): unused; the replay cadence actually applied is ReplayInterval
ReplayInterval = 50       # call agent.replay() every this many game steps
MinReplayMemory = 1e2     # ...but only once the buffer holds more than this many transitions
WinningScore = 21         # an episode ends when either side reaches this many points

gameStep = 0

plotScores = []  # NOTE(review): not referenced in this cell

sumScores = 0
avgScores = []

# --- Training ---------------------------------------------------------------
for episode in range(NumEpisodes):
    # scores[0] counts rallies that ended with reward -100,
    # scores[1] counts rallies that ended with reward +100.
    scores = [0, 0]
    # Epsilon depends only on the episode index, so compute it once per episode.
    explorationRate = getExplorationRate(episode)

    while True:
        # Every rally starts from a freshly constructed environment.
        MDP = DiscretePongMDP(scale=10)
        # Action 2 moves nothing; this step is only used to read the first state.
        s, r, running, temp = MDP.update(2)
        running = True
        while(running):
            a = agent.act(s, explorationRate)
            ns, r, running, hit = MDP.update(a)
            agent.remember(s, a, r, ns, not running)

            s = ns
            if r == 100:
                scores[1] += 1
            elif r == -100:
                scores[0] += 1

            if len(agent.memory) > MinReplayMemory and gameStep % ReplayInterval == 0:
                agent.replay()

            gameStep += 1

        if scores[0] == WinningScore or scores[1] == WinningScore:
            break

    # Running mean of the per-episode score difference.
    sumScores += scores[1] - scores[0]
    avgScores.append(sumScores / (episode + 1))

    if episode % agent.syncModelsepisodes == 0:
        agent.syncModels()

    print('Episode {} result: {} - {} , epsilon: {}'.format(episode, scores[0], scores[1], explorationRate))

# --- Results ----------------------------------------------------------------
sns.set_style("darkgrid")

fig, ax = plt.subplots()
# Size the x axis from the data actually recorded so both columns always match.
df = pd.DataFrame({'Avg Scores': avgScores, 'Episodes': range(len(avgScores))})

# Keep `ax` bound to the Axes; set_title() returns a Text object, not the Axes.
sns.lineplot(x="Episodes", y="Avg Scores", data=df, ax=ax)
ax.set_title("Deep-Q Learning Agent")

plt.show()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 5)]               0         
                                                                 
 dense_32 (Dense)            (None, 128)               768       
                                                                 
 dense_33 (Dense)            (None, 128)               16512     
                                                                 
 dense_34 (Dense)            (None, 64)                8256      
                                                                 
 dense_35 (Dense)            (None, 2)                 130       
                                                                 
Total params: 25,666
Trainable params: 25,666
Non-trainable params: 0
_________________________________________________________________
Model: "model_9"
______________________________________

2022-02-07 19:08:59.144888: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.163301: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.206094: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.236976: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.434805: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.456286: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-07 19:08:59.470022: I tensorflow/core/grappler/optimizers/cust

Episode 0 result: 21 - 3 , epsilon: 1.0
Average Reward	 -1.9758507135016465
Episode 1 result: 21 - 1 , epsilon: 0.9995
Average Reward	 -2.7
Episode 2 result: 21 - 0 , epsilon: 0.9990002500000001
Average Reward	 -3.2
Episode 3 result: 21 - 0 , epsilon: 0.9985007498750001
Average Reward	 -3.3
Episode 4 result: 21 - 1 , epsilon: 0.9980014995000627
Average Reward	 -2.5
Episode 5 result: 21 - 0 , epsilon: 0.9975024987503127
Average Reward	 -2.4
Episode 6 result: 21 - 0 , epsilon: 0.9970037475009377
Average Reward	 -2.9
Episode 7 result: 21 - 0 , epsilon: 0.9965052456271872
Average Reward	 -3.0
Episode 8 result: 21 - 0 , epsilon: 0.9960069930043737
Average Reward	 -3.3
Episode 9 result: 21 - 1 , epsilon: 0.9955089895078716
Average Reward	 -3.0
Episode 10 result: 21 - 0 , epsilon: 0.9950112350131177
Average Reward	 -3.0
Episode 11 result: 21 - 0 , epsilon: 0.9945137293956112
Average Reward	 -2.8
Episode 12 result: 21 - 1 , epsilon: 0.9940164725309134
Average Reward	 -2.8
Episode 13 result: 21