In [None]:
!pip install gym[atari,accept-rom-license]==0.21.0

Collecting gym[accept-rom-license,atari]==0.21.0
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.4 MB/s 
Collecting ale-py~=0.7.1
  Downloading ale_py-0.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 34.9 MB/s 
[?25hCollecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gym, AutoROM.accept-rom-license
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.21.0-py3-none-any.whl size=1616826 sha256=394dd4970f5d5e619e124507811e62e2112d18b5b57aa8521306a27556f75983
  Stored in directory: /ro

In [8]:
import random
import gym
import cv2
import time


# Large integer (10^15) that stands in for "infinity" in comparisons.
INF = 10 ** 15

class Agent:
    """Tabular Q-learning agent for the gym ``Pong-v0`` environment.

    Every raw frame is reduced by :meth:`preProcess` to a small hashable state
    ``(ballPos, paddleRow, ballDirectionCode)`` which, paired with an action,
    keys a plain dictionary used as the Q-table.
    """

    def __init__(self, isLearning=True):
        """Create the environment and initialise the tracking state.

        Args:
            isLearning: When True the policy explores (untried actions first,
                then epsilon-greedy). When False it always acts greedily.
        """
        self.env = gym.make('Pong-v0')

        self.isLearning = isLearning
        self.minEpsilon = 0.01
        self.seen = 0
        self._resetBallTracking()

    def _resetBallTracking(self):
        """Forget the ball's last known position and direction."""
        self.prevBallPos = (40, 40)
        self.prevNon0BallDirec = (0, 0)
        self.prevBallDirec = 0

    def preProcess(self, state):
        """Turn a raw frame into a compact state plus a shaping reward.

        The frame is cropped to rows 35..194, converted to grayscale,
        thresholded to a 0/255 image and subsampled by 2 in both axes.

        Args:
            state: Raw frame as returned by ``env.reset()`` / ``env.step()``.

        Returns:
            A pair ``(stateTuple, reflectionReward)`` where ``stateTuple`` is
            ``(ballPos, paddleRow, direcCode)``:

            * ``ballPos`` - ``(row, col)`` of the first lit pixel found near
              the previous ball position, or the previous position itself
              when none is found;
            * ``paddleRow`` - topmost lit row in column 70 (0 when none);
            * ``direcCode`` - 0..3, encoding the signs of the row/column
              movement since the previous frame (last code when stationary).

            ``reflectionReward`` is 1 when the ball's column movement flipped
            from positive to negative, else 0.

        Side effects:
            Updates ``prevBallPos``, ``prevNon0BallDirec`` and
            ``prevBallDirec``.
        """
        state = state[35:195]
        # NOTE(review): gym presumably delivers RGB frames, so COLOR_RGB2GRAY
        # may be the intended constant - confirm. Left unchanged so the
        # thresholded image stays exactly as before.
        state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
        state = cv2.threshold(state, 110, 255, cv2.THRESH_BINARY)[1]
        state = state[::2, ::2]

        paddleCol = 70
        boardRows = 80

        # Topmost lit pixel in the tracked paddle's column.
        paddleRow = 0
        for row in range(boardRows):
            if state[row][paddleCol] == 255:
                paddleRow = row
                break

        # Look for the ball in a window of offsets -5..4 (rows and columns)
        # around its previous position; columns are restricted to 10..69 so
        # the tracked paddle column is never mistaken for the ball.
        prevRow, prevCol = self.prevBallPos
        ballPos = self.prevBallPos
        ballFound = False
        for dRow in range(-5, 5):
            row = prevRow + dRow
            if not 0 <= row < boardRows:
                continue
            for dCol in range(-5, 5):
                col = prevCol + dCol
                if 9 < col < 70 and state[row][col] == 255:
                    ballPos = (row, col)
                    ballFound = True
                    break
            if ballFound:
                break

        # (row delta, column delta) since the previous frame.
        ballDirec = (ballPos[0] - prevRow, ballPos[1] - prevCol)
        self.prevBallPos = ballPos

        # A reflection is a column-direction flip from positive to negative,
        # compared against the last frame in which the column changed at all.
        didReflect = False
        if ballDirec[1] != 0:
            didReflect = ballDirec[1] < 0 and self.prevNon0BallDirec[1] > 0
            self.prevNon0BallDirec = ballDirec

        if ballDirec == (0, 0):
            # Ball did not move (or was not found): keep the last direction.
            direcCode = self.prevBallDirec
        else:
            if ballDirec[0] > 0:
                direcCode = 2 if ballDirec[1] > 0 else 3
            else:
                direcCode = 0 if ballDirec[1] > 0 else 1
            self.prevBallDirec = direcCode

        return (ballPos, paddleRow, direcCode), 1 if didReflect else 0

    def resetGame(self):
        """Reset the environment and return the initial processed state.

        The ball tracker is reset first so that an episode which ended without
        a final scored point cannot leak stale tracking data into the next one.
        """
        self._resetBallTracking()
        return self.preProcess(self.env.reset())[0]

    def epsilonGreedy(self, Qtable, triedStateActions, state, actions, epsilon):
        """Choose an action for ``state``.

        While learning, an action never tried in this state is preferred
        (picked uniformly at random); otherwise a random action is taken with
        probability ``epsilon``. When ``isLearning`` is False the choice is
        always the greedy one, with no exploration of any kind.

        Args:
            Qtable: Dict mapping ``(state, action)`` to a Q-value.
            triedStateActions: Container of ``(state, action)`` pairs already
                taken.
            state: Hashable processed state.
            actions: Sequence of candidate actions.
            epsilon: Exploration probability.

        Returns:
            The chosen action.
        """
        if self.isLearning:
            notTried = [a for a in actions if (state, a) not in triedStateActions]
            if notTried:
                return random.choice(notTried)
            if random.random() < epsilon:
                return random.choice(actions)

        return self.getBestAction(Qtable, state, actions)

    def getBestAction(self, Qtable, state, actions):
        """Return the action with the highest Q-value in ``state``.

        Missing ``(state, action)`` entries are inserted into ``Qtable`` with
        value 0. Ties go to the earliest action in ``actions``; 0 is returned
        when ``actions`` is empty.
        """
        bestAction = 0
        bestQ = float('-inf')

        for action in actions:
            value = Qtable.setdefault((state, action), 0)
            if value > bestQ:
                bestQ = value
                bestAction = action

        return bestAction

    def getAction(self, Qtable, triedStateActions, state, actions, epsilon):
        """Policy entry point; currently delegates to :meth:`epsilonGreedy`."""
        return self.epsilonGreedy(Qtable, triedStateActions, state, actions, epsilon)

    def Qlearning(self, episodes=20000):
        """Run tabular Q-learning and return what was learned.

        Epsilon decays linearly from 1.0 to ``self.minEpsilon`` over the first
        60% of the episodes. The learning target uses the shaped reward
        ``3 * reward + reflectionReward`` and, for non-terminal transitions
        only, the discounted best Q-value of the next state.

        Args:
            episodes: Number of episodes to play.

        Returns:
            ``(Qtable, scores)``: the learned ``{(state, action): Q}`` dict and
            the list of per-episode summed environment rewards.
        """
        Qtable = {}
        # presumably NOOP / up / down in this environment - TODO confirm
        actions = [0, 2, 3]

        triedStateActions = {}

        epsilon = 1.0
        alpha = 0.1
        gamma = 0.99

        minEpsilon = self.minEpsilon
        endEpsilonDecayEp = episodes * 0.6
        # Guard against a zero-length schedule when no episodes are requested.
        epsilonDecay = (1.0 - minEpsilon) / endEpsilonDecayEp if endEpsilonDecayEp > 0 else 0.0

        scores = []

        for episode in range(episodes):

            state = self.resetGame()
            epsilon = max(epsilon - epsilonDecay, minEpsilon)

            currScore = 0
            done = False

            while not done:
                action = self.getAction(Qtable, triedStateActions, state, actions, epsilon)

                triedStateActions[state, action] = True
                newState, reward, done, _ = self.env.step(action)
                currScore += reward
                nextState, reflectionReward = self.preProcess(newState)

                target = reward * 3 + reflectionReward
                if not done:
                    # Bootstrap only from non-terminal successors: nothing
                    # follows the final transition of an episode.
                    nextStateBestAction = self.getBestAction(Qtable, nextState, actions)
                    target = target + gamma * Qtable[nextState, nextStateBestAction]

                oldQ = Qtable.get((state, action), 0)
                Qtable[state, action] = oldQ + alpha * (target - oldQ)
                state = nextState

                if reward == 1 or reward == -1:
                    # A point was scored: restart the ball tracker.
                    self._resetBallTracking()

            scores.append(currScore)

            print('Episode: {} - Score: {} - Epsilon: {}'.format(episode, currScore, epsilon))

        return Qtable, scores


if __name__ == '__main__':
    # Build an agent in learning mode (the default) and run Q-learning.
    RlAgent = Agent(isLearning=True)
    RlAgent.Qlearning()

Episode: 0 - Score: -21.0 - Epsilon: 0.9999175
Episode: 1 - Score: -21.0 - Epsilon: 0.999835
Episode: 2 - Score: -21.0 - Epsilon: 0.9997525
Episode: 3 - Score: -21.0 - Epsilon: 0.9996700000000001
Episode: 4 - Score: -18.0 - Epsilon: 0.9995875000000001
Episode: 5 - Score: -21.0 - Epsilon: 0.9995050000000001
Episode: 6 - Score: -20.0 - Epsilon: 0.9994225000000001
Episode: 7 - Score: -21.0 - Epsilon: 0.9993400000000001
Episode: 8 - Score: -21.0 - Epsilon: 0.9992575000000001
Episode: 9 - Score: -21.0 - Epsilon: 0.9991750000000001
Episode: 10 - Score: -21.0 - Epsilon: 0.9990925000000002
Episode: 11 - Score: -20.0 - Epsilon: 0.9990100000000002
Episode: 12 - Score: -21.0 - Epsilon: 0.9989275000000002
Episode: 13 - Score: -18.0 - Epsilon: 0.9988450000000002
Episode: 14 - Score: -21.0 - Epsilon: 0.9987625000000002
Episode: 15 - Score: -19.0 - Epsilon: 0.9986800000000002
Episode: 16 - Score: -20.0 - Epsilon: 0.9985975000000002
Episode: 17 - Score: -21.0 - Epsilon: 0.9985150000000003
Episode: 18 