In [None]:
pip install gym



In [None]:
import random
from collections import defaultdict
import gym
import numpy as np
import tensorflow as tf
print(tf.__version__)
import keras

2.3.0


In [None]:
# Single Pong environment instance shared by the notebook: Agent stores it as
# `self.env`, and the training cell closes it once the episode loop finishes.
ENV = gym.make("Pong-v0")

In [None]:
from collections import deque

class Buffer:
    """Fixed-capacity experience replay buffer.

    Transitions are stored as ``(s, a, r, s2, t)`` tuples. Once ``size``
    transitions are held, adding a new one evicts the oldest, so memory use
    stays bounded during long training runs.

    Args:
        size: maximum number of transitions kept (``None`` means unbounded).
    """

    def __init__(self, size=10000):
        self.size = size
        # maxlen is what actually enforces the capacity: a bounded deque
        # silently drops items from the opposite end when it is full.
        self.buffer = deque(maxlen=size)

    def add(self, s, a, r, s2, t):
        """Store one transition.

        Args:
            s: sequence holding (at least) four frames observed before the action;
                the first four are stacked along a new axis 2.
            a: action taken.
            r: reward received.
            s2: sequence holding (at least) four frames observed after the action;
                stacked the same way as ``s``.
            t: terminal flag of the transition.
        """
        s = np.stack((s[0], s[1], s[2], s[3]), axis=2)
        s2 = np.stack((s2[0], s2[1], s2[2], s2[3]), axis=2)
        self.buffer.append((s, a, r, s2, t))

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions drawn uniformly at random.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored.
        """
        return random.sample(self.buffer, batch_size)

# Presumably the raw RGB screen shape (height, width, channels) returned by the
# environment; not referenced in the cells visible here — TODO confirm / remove.
SCREEN_SHAPE = (210, 160, 3)
# Input shape of the Q-network built in DQN.build_model: 84x84 pixels with
# four frames along the last axis.
FOUR_FRAME_SHAPE = (84, 84, 4)

import keras.layers as L

class DQN:
    """Q-network with a separate target network, trained from a replay buffer.

    Args:
        buffer: replay buffer exposing ``.buffer`` (sized container) and
            ``.sample(batch_size)`` returning ``(s, a, r, s2, t)`` tuples.
        batch_size: number of transitions per training step.
        min_buffer: training is skipped until the buffer holds this many transitions.
        gamma: discount factor applied to the bootstrapped next-state value.
        lr: learning rate passed to the Adam optimizer.
        n_actions: width of the output layer (one Q-value per action).
    """

    def __init__(self, buffer, batch_size=32, min_buffer=10000, gamma=0.99, lr=1e-4, n_actions=6):
        self.buffer = buffer
        self.min_buffer = min_buffer
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        self.model = self.build_model(lr)
        self.target_model = self.build_model(lr)
        # Start with both networks holding identical weights.
        self.copy_weights()

    def build_model(self, lr = 0.001):
        """Build and compile the convolutional Q-network.

        Three Conv2D layers followed by a 512-unit dense layer and a linear
        output with ``self.n_actions`` units; compiled with Adam and MSE loss.
        """
        X = L.Input(shape = FOUR_FRAME_SHAPE)
        x = L.Conv2D(32, 8, strides=(4,4), padding="same", activation="relu")(X)
        x = L.Conv2D(64, 4, strides=(2,2), padding="same", activation="relu")(x)
        x = L.Conv2D(64, 3, strides=(1,1), padding="same", activation="relu")(x)
        x = L.Flatten()(x)
        x = L.Dense(512,activation="relu")(x)
        Y = L.Dense(self.n_actions, activation=None)(x)
        model = keras.models.Model(inputs=X, outputs=Y)
        model.compile(optimizer=keras.optimizers.Adam(lr), loss="mse")
        return model

    @staticmethod
    def _drop_singleton_channel(batch):
        """Return ``batch`` without a trailing length-1 axis on a 5-D array.

        Stacking frames of shape (H, W, 1) along axis 2 yields (H, W, 4, 1);
        the network input is declared as (H, W, 4), so the extra axis is
        removed explicitly instead of relying on the framework to squeeze it.
        """
        batch = np.asarray(batch)
        if batch.ndim == 5 and batch.shape[-1] == 1:
            return batch[..., 0]
        return batch

    def train(self):
        """Run one training step on a random minibatch.

        Does nothing until the buffer holds at least ``min_buffer`` transitions.
        The target for the taken action is
        ``r + gamma * max_a' Q_target(s2, a')``, with the bootstrap term zeroed
        for terminal transitions; Q-values of the other actions are left at the
        online network's own predictions so they contribute no error.
        """
        if len(self.buffer.buffer) < self.min_buffer:
            return

        batch = self.buffer.sample(self.batch_size)
        states, actions, rewards, next_states, terminal = map(np.array, zip(*batch))
        states = self._drop_singleton_channel(states)
        next_states = self._drop_singleton_channel(next_states)

        next_state_values = self.target_model.predict(next_states)
        next_state_action_values = np.max(next_state_values, axis=1)

        # Cast before negating: np.invert is a bitwise NOT, which is only a
        # logical "not" for bool arrays (0/1 integers would become -1/-2).
        not_terminal = np.logical_not(terminal.astype(bool))

        targets = self.model.predict(states)
        targets[np.arange(len(actions)), actions] = (
            rewards + self.gamma * next_state_action_values * not_terminal
        )
        self.model.train_on_batch(states, targets)

    def copy_weights(self):
        """Copy the online network's weights into the target network, layer by layer."""
        frm = self.model
        to = self.target_model
        for l_target, l_src in zip(to.layers, frm.layers):
            l_target.set_weights(l_src.get_weights())

    def predict(self, x):
        """Return the online network's Q-values for one state.

        Args:
            x: sequence holding (at least) four frames; the first four are
                stacked along axis 2 to form the network input.

        Returns:
            Whatever ``self.model.predict`` returns for a batch of one state.
        """
        states = np.stack((x[0], x[1], x[2], x[3]), axis=2)
        batch = self._drop_singleton_channel(np.array([states]))
        return self.model.predict(batch)

In [None]:
import cv2

def preprocess(img):
    """Turn an RGB frame into a normalised 84x84 single-channel image.

    The frame is converted to grayscale, divided by 255, resized to
    110 rows x 84 columns, cropped to rows 18..101 and returned with
    shape (84, 84, 1).
    """
    red, green, blue = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    # colorimetric (luma) weights for the R, G and B channels
    gray = red * 0.299 + green * 0.587 + blue * 0.114
    gray = gray / 255.
    # cv2 takes the target size as (width, height)
    resized = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
    cropped = resized[18:102, :]
    return np.reshape(cropped, [84, 84, 1])

# Smoke check for preprocess: feed a uniformly coloured frame
# (R=130, G=160, B=205) and show the output shape and the top-left pixel.
zero_img = np.zeros((210, 105, 3))
zero_img[...] = (130, 160, 205)  # broadcast one colour over every pixel
pp = preprocess(zero_img)
pp.shape, pp[0][0]

((84, 84, 1), array([0.61239219]))

In [None]:
class Agent:
    """Epsilon-greedy Pong player that fills the replay buffer and trains the DQN."""

    def __init__(self):
        self.env = ENV
        self.iter = 0            # environment steps taken across all episodes
        self.epsilon = 1         # exploration probability, decayed every step
        self.eps_step = 1e-6     # amount subtracted from epsilon per step
        self.copy_period = 2000  # steps between target-network syncs
        self.buffer = Buffer(10000)
        self.DQN = DQN(self.buffer, min_buffer=10000)

    def sample_action(self, state):
        """Pick a random action with probability epsilon, otherwise the greedy one."""
        explore = random.random() < self.epsilon
        if explore:
            return self.env.action_space.sample()
        q_values = self.DQN.predict(state)[0]
        return np.argmax(q_values)

    def play_episode(self, num):
        """Play one episode to completion and return its summed reward.

        Every step stores a transition (once four frames of history exist on
        both sides), decays epsilon down to a floor of 0.1, trains every 10th
        step and syncs the target network every ``copy_period`` steps.
        ``num`` is accepted but not used.
        """
        history_len = 4
        frames = deque(maxlen=4)
        frames.append(preprocess(self.env.reset()))
        prev_frames = deque(maxlen=4)
        episode_return = 0
        finished = False

        while not finished:
            # Until four frames are available the network cannot be queried.
            if len(frames) >= history_len:
                action = self.sample_action(frames)
            else:
                action = self.env.action_space.sample()

            # prev_frames trails frames by exactly one environment step.
            if frames:
                prev_frames.append(frames[-1])

            observation, reward, finished, _ = self.env.step(action)
            frames.append(preprocess(observation))

            have_full_history = len(frames) == history_len and len(prev_frames) == history_len
            if have_full_history:
                self.buffer.add(list(prev_frames), action, reward, list(frames), finished)

            episode_return += reward

            self.iter += 1
            if self.iter % 10 == 0:
                self.DQN.train()
            self.epsilon = max(0.1, self.epsilon - self.eps_step)
            if self.iter % self.copy_period == 0:
                self.DQN.copy_weights()

        return episode_return

In [None]:
# Build an agent and print the layer summary of its online Q-network.
agent = Agent()
agent.DQN.model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 84, 84, 4)]       0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 21, 21, 32)        8224      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 11, 11, 64)        32832     
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 11, 11, 64)        36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 7744)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               3965440   
_________________________________________________________________
dense_5 (Dense)              (None, 6)                

In [None]:
# Initial reset; the returned frame is kept in `s` but is not used by the
# code visible in this cell (play_episode resets the environment itself).
s = ENV.reset()

NUM_EPISODES = 100000

# Fresh agent for the training run (replaces any agent built in earlier cells).
agent = Agent()

for i in range(NUM_EPISODES):
    total_reward = agent.play_episode(i)
    print(f"Episode {i} >>>> total reward:", total_reward)
    if i>0 and i % 10 == 0:
        print("Saving the model...")
        # Zero-pad the episode number: a bare width of 6 pads with spaces,
        # which puts blanks inside the checkpoint file name.
        agent.DQN.model.save(f"./drive/My Drive/models/pong/pong.{i:06}.hdf5")

ENV.close()

Episode 0 >>>> total reward: -20.0
Episode 1 >>>> total reward: -21.0
Episode 2 >>>> total reward: -19.0
Episode 3 >>>> total reward: -21.0
Episode 4 >>>> total reward: -20.0
Episode 5 >>>> total reward: -20.0
Episode 6 >>>> total reward: -19.0
Episode 7 >>>> total reward: -21.0
Episode 8 >>>> total reward: -21.0
Episode 9 >>>> total reward: -20.0
Episode 10 >>>> total reward: -21.0
Saving the model...
Episode 11 >>>> total reward: -21.0
Episode 12 >>>> total reward: -21.0
Episode 13 >>>> total reward: -21.0
Episode 14 >>>> total reward: -21.0
Episode 15 >>>> total reward: -21.0
Episode 16 >>>> total reward: -18.0
Episode 17 >>>> total reward: -19.0
Episode 18 >>>> total reward: -18.0
Episode 19 >>>> total reward: -20.0
Episode 20 >>>> total reward: -20.0
Saving the model...
