In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import gym
import pylab
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
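# Optional Google Colab setup: uncomment to mount Google Drive and import the Atari ROMs.
# The score log written during training defaults to a path under
# /content/drive/MyDrive/Optimization_II/, so that path only exists once Drive is mounted.
# from google.colab import drive
# drive.mount('/content/drive')

# !python -m atari_py.import_roms /content/drive/MyDrive/Optimization_II/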

In [None]:
def create_model(input_shape, action_space, lr):
    """Build and compile the actor and critic networks.

    Both models are built on the same input tensor and the same
    Flatten -> Dense(512, elu) layers, so they share those weights; only the
    output heads differ.

    Args:
        input_shape: Shape of one observation, without the batch dimension.
        action_space: Number of discrete actions (size of the softmax head).
        lr: Learning rate given to each model's RMSprop optimizer.

    Returns:
        (Actor, Critic): two compiled Keras models. The actor outputs action
        probabilities and uses categorical cross-entropy; the critic outputs a
        single value and uses mean squared error.
    """
    X_input = Input(input_shape)
    # The Input layer already fixes the shape, so Flatten takes no input_shape argument.
    X = Flatten()(X_input)
    X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)

    # Policy head: one probability per action.
    action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
    # Value head: one unbounded scalar per observation.
    value = Dense(1, kernel_initializer='he_uniform')(X)

    Actor = Model(inputs=X_input, outputs=action)
    Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=lr))

    Critic = Model(inputs=X_input, outputs=value)
    Critic.compile(loss='mse', optimizer=RMSprop(learning_rate=lr))

    return Actor, Critic

In [None]:
# Stand-alone environment and networks, used by the summary cells that follow.
env = gym.make('Pong-v0')
actor, critic = create_model(
    input_shape=(4, 80, 80),
    action_space=env.action_space.n,
    lr=0.000025,
)

In [None]:
# Print the layer-by-layer summary of the actor (policy) network.
actor.summary()

In [None]:
# Print the layer-by-layer summary of the critic (value) network.
critic.summary()

In [None]:
class Pong_Algorithm:
    """Actor-critic agent that learns Atari Pong from stacked, binarised frames.

    One game is played at a time. Every (state, action, reward) step is
    buffered; when the game ends the actor is fitted with
    ``discounted return - critic prediction`` as sample weights and the critic
    is fitted to the discounted returns.
    """

    # Default destination of the score log that save_scores rewrites after every
    # training game. Override on the class or on an instance to log elsewhere.
    Scores_Path = '/content/drive/MyDrive/Optimization_II/Scores_temp.csv'

    def __init__(self, env_name):
        """Create the environment, the replay buffers and the two networks.

        Args:
            env_name: Gym environment id passed to ``gym.make``.
        """
        # Environment and A2C parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.train_games = 5000
        self.test_games = 200
        self.max_average = -21.0
        self.lr = 0.000025

        # Size of one preprocessed frame and number of frames stacked per state.
        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4

        # Per-game training memory and score history
        self.frames, self.actions, self.rewards = [], [], []
        self.scores, self.games, self.average = [], [], []

        self.Save_Path = 'Models'
        self.frame_size = (self.REM_STEP, self.ROWS, self.COLS)
        self.frame_buffer = np.zeros(self.frame_size)

        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = f'{self.env_name}_A2C_{self.lr}'
        self.Model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network model
        self.Actor, self.Critic = create_model(input_shape=self.frame_size, action_space=self.action_size, lr=self.lr)
#         self.Actor = load_model('NN_WinPong_AC2_Actor_1268.tf')
#         self.Critic = load_model('NN_WinPong_AC2_Critic_1268.tf')

    def store_feed(self, feed, action, reward):
        """Append one step (state, one-hot action, reward) to the game memory."""
        self.frames.append(feed)
        action_onehot = np.zeros([self.action_size])
        action_onehot[action] = 1
        self.actions.append(action_onehot)
        self.rewards.append(reward)

    def decide_action(self, feed):
        """Sample an action index from the actor's predicted distribution for ``feed``."""
        # verbose=0 keeps Keras from printing a progress bar on every single step.
        prediction = self.Actor.predict(feed, verbose=0)[0]
        action = np.random.choice(self.action_size, p=prediction)
        return action

    def discount_rewards(self, reward):
        """Compute discounted returns for one game, restarting at every scored point.

        Walking backwards through the game, a positive reward sets the return
        to +1, a negative reward sets it to -1, and a zero reward takes 0.99
        times the return of the following step (0 when there is none).

        Args:
            reward: Sequence (list or array) of per-step rewards.

        Returns:
            1-D float array with one discounted return per step.
        """
        delt = 0.99  # discount factor
        rewards = np.asarray(reward, dtype=np.float64).ravel()
        discounted_r = np.zeros(rewards.shape[0])
        running = 0.0
        for idx in reversed(range(rewards.shape[0])):
            if rewards[idx] > 0:
                running = 1.0
            elif rewards[idx] < 0:
                running = -1.0
            else:
                running = delt * running
            discounted_r[idx] = running
        return discounted_r

    def play_a_game(self):
        """Train actor and critic on the buffered game, save both, clear the buffers.

        Does nothing when no step has been stored.
        """
        if not self.frames:
            return

        # reshape memory to appropriate shape for training
        feed = np.vstack(self.frames)
        actions = np.vstack(self.actions)
        # Compute discounted rewards
        discounted_r = self.discount_rewards(self.rewards)
        # Get Critic network predictions
        predictions = self.Critic.predict(feed, verbose=0)[:, 0]
        # Compute weights (advantages)
        weights = discounted_r - predictions

        # training Actor and Critic networks
        self.Actor.fit(feed, actions, sample_weight=weights, epochs=1, verbose=0)
        self.Critic.fit(feed, discounted_r, epochs=1, verbose=0)
        # save models
        self.Actor.save('NN_WinPong_AC2_Actor_temp.tf')
        self.Critic.save('NN_WinPong_AC2_Critic_temp.tf')
        # reset training memory
        self.frames, self.actions, self.rewards = [], [], []

    def load_actor(self, Actor_name):
        """Replace the actor with the model stored at ``Actor_name``."""
        self.Actor = load_model(Actor_name)

    def save_scores(self, score, game):
        """Record a finished game and rewrite the score log CSV.

        Args:
            score: Total reward of the finished game.
            game: Index of the finished game.

        Returns:
            Mean score over the most recent (up to 50) recorded games.
        """
        self.scores.append(score)
        self.games.append(game)
        recent = self.scores[-50:]
        self.average.append(sum(recent) / len(recent))
        # Use this instance's own history, not a notebook-level global.
        df_scores = pd.DataFrame({'scores': self.scores, 'games': self.games, 'averages': self.average})
        df_scores.to_csv(self.Scores_Path)
        print('Scores Saved!')
        return self.average[-1]

    def prepro(self, frame):
        """Binarise one raw RGB frame and push it onto the frame stack.

        Args:
            frame: Raw frame indexed as (row, column, channel).

        Returns:
            The updated frame stack with a leading batch axis, i.e. shape
            (1, REM_STEP, ROWS, COLS); index 0 of the stack is the newest frame.
        """
        # Keep every second row between 35 and 195 and every second column.
        frame_cropped = frame[35:195:2, ::2, :]
        if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
            # cv2.resize takes the target size as (width, height).
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # Luminance-weighted grayscale, thresholded at 100 into a 0/1 image.
        gray = 0.299*frame_cropped[:, :, 0] + 0.587*frame_cropped[:, :, 1] + 0.114*frame_cropped[:, :, 2]
        new_frame = np.where(gray < 100, 0.0, 1.0).astype(np.float32)

        # np.roll returns a new array, so states returned earlier are not modified.
        self.frame_buffer = np.roll(self.frame_buffer, 1, axis=0)
        self.frame_buffer[0, :, :] = new_frame

        return np.expand_dims(self.frame_buffer, axis=0)

    def reset_environment(self):
        """Reset the environment and fill the whole frame stack with the first frame.

        Returns:
            Initial state of shape (1, REM_STEP, ROWS, COLS).
        """
        raw_frame = self.env.reset()
        # gym >= 0.26 returns (observation, info); older versions return the observation alone.
        if isinstance(raw_frame, tuple):
            raw_frame = raw_frame[0]

        state = np.expand_dims(self.frame_buffer, axis=0)
        for _ in range(self.REM_STEP):
            # Always preprocess the raw frame; the stacked output must not be fed back in.
            state = self.prepro(raw_frame)
        return state

    def take_step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``state`` is the preprocessed frame stack after the step.
        """
        result = self.env.step(action)
        if len(result) == 5:
            # gym >= 0.26: (observation, reward, terminated, truncated, info)
            pix_new, reward, terminated, truncated, info = result
            done = terminated or truncated
        else:
            pix_new, reward, done, info = result
        pix = self.prepro(pix_new)
        return pix, reward, done, info

    def run(self):
        """Play ``train_games`` games, training and logging after each, then close the env."""
        for e in range(self.train_games):
            current_frame = self.reset_environment()
            done = False
            score = 0
            while not done:
                # Actor picks an action
                action = self.decide_action(current_frame)
                # Retrieve new state, reward, and whether the game is done
                future_frame, reward, done, _ = self.take_step(action)
                # Memorize (state, action, reward) for training
                self.store_feed(current_frame, action, reward)
                # Update current state
                current_frame = future_frame
                score += reward
            # The game is over: log the score, then train on the buffered steps.
            self.save_scores(score, e)
            self.play_a_game()
        self.env.close()

    def test(self, Actor_name):
        """Play ``test_games`` games with a saved actor, always taking its top action.

        The per-game scores are printed and kept in ``self.test_scores``.

        Args:
            Actor_name: Path of the saved actor model to load.
        """
        self.load_actor(Actor_name)
        game_scores = []
        for e in range(self.test_games):
            feed = self.reset_environment()
            done = False
            score = 0
            while not done:
                action = np.argmax(self.Actor.predict(feed, verbose=0))
                # Feed the new state back in so the policy sees the game progress.
                feed, reward, done, _ = self.take_step(action)
                score += reward
            game_scores.append(score)
            print(f'Game: {e}/{self.test_games}, Score: {score}')
        self.test_scores = game_scores
        self.env.close()

In [None]:
# Build the agent for Pong-v0 and run its full training loop.
env_name = 'Pong-v0'
ac_pong = Pong_Algorithm(env_name)
ac_pong.run()

In [None]:
# Evaluate a saved actor on the test games.
# NOTE(review): training saves the actor as 'NN_WinPong_AC2_Actor_temp.tf'; the file
# loaded here is not written anywhere in this notebook — confirm it exists before running.
ac_pong.test('NN_WinPong_AC2_Actor.tf')

In [None]:
# Load the training score log, keep only the latest record of each game, and add a
# rolling mean over the last 50 games (fewer at the start of the log).
# The column key is left unchanged; only the displayed legend label is corrected,
# because the window counts games, not days.
df_train_scores = (
    pd.read_csv('Scores_All.csv')
    .drop_duplicates(subset='Game', keep='last')
    .assign(**{'50-day Average Score': lambda scores: scores['Score'].rolling(50, min_periods=1).mean()})
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Scores vs Games Trained')
ax.plot(df_train_scores['Game'], df_train_scores['Score'], 'b', label='Score')
ax.plot(df_train_scores['Game'], df_train_scores['50-day Average Score'], 'r', label='50-game Average')
ax.set_xlabel('Game')
ax.set_ylabel('Score')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Histogram of the per-game scores stored in the test score file.
df_test_scores = pd.read_csv('scores_test.csv')
score_axes = df_test_scores['score'].hist()
score_axes.set_title('Histogram of Test Scores')
plt.show()


In [None]:
# Summary statistics of the test scores.
df_test_scores.describe()['score']

In [None]:
# Number of test games that ended with a positive total score.
int((df_test_scores['score'] > 0).sum())