In [206]:
!pip install numpy
!pip install tensorflow
!pip install keras
!pip install gym
!pip install pygame
!pip install opencv-python
!pip install pillow



In [207]:
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git

Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-pt38if0z
  Running command git clone --filter=blob:none --quiet https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-pt38if0z
  Resolved https://github.com/ntasfi/PyGame-Learning-Environment.git to commit 3dbe79dc0c35559bb441b9359948aabf9bb3d331
  Preparing metadata (setup.py) ... [?25ldone


In [208]:
import os
import urllib.request
from pathlib import Path
import numpy as np
from collections import deque
from tqdm import tqdm
import warnings
import cv2
import random
import gc
import tensorflow as tf

# Deep learning imports
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Input, Concatenate, Conv2D, Lambda, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Lambda

# Game environment
from ple import PLE
from ple.games.flappybird import FlappyBird

# Suppress warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['SDL_AUDIODRIVER'] = 'dummy'

def setup_flappy_bird_environment():
    """Setup Flappy Bird environment and download required assets"""
    import ple
    ple_path = os.path.dirname(ple.__file__)
    asset_path = os.path.join(ple_path, 'games', 'flappybird', 'assets')
    os.makedirs(asset_path, exist_ok=True)
    
    assets = {
        # Basic assets
        'background-day.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/background-day.png',
        'background-night.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/background-night.png',
        'base.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/base.png',
        'pipe-green.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/pipe-green.png',
        'pipe-red.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/pipe-red.png',
        
        # Bird variants
        'bluebird-upflap.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/bluebird-upflap.png',
        'bluebird-midflap.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/bluebird-midflap.png',
        'bluebird-downflap.png': 'https://raw.githubusercontent.com/ntasfi/PyGame-Learning-Environment/master/ple/games/flappybird/assets/bluebird-downflap.png',
    }
    
    for filename, url in assets.items():
        file_path = os.path.join(asset_path, filename)
        if not os.path.exists(file_path):
            print(f"Downloading {filename}...")
            try:
                urllib.request.urlretrieve(url, file_path)
                print(f"Successfully downloaded {filename}")
            except Exception as e:
                print(f"Failed to download {filename}: {e}")
                return None
    
    # Create symbolic links for other bird variants
    bird_colors = ['red', 'yellow']
    flap_types = ['upflap', 'midflap', 'downflap']
    
    for color in bird_colors:
        for flap in flap_types:
            src = os.path.join(asset_path, f"bluebird-{flap}.png")
            dst = os.path.join(asset_path, f"{color}bird-{flap}.png")
            if not os.path.exists(dst):
                os.symlink(src, dst)
    
    return asset_path

In [209]:
# Debug function to verify assets
def verify_assets():
    """
    Verify that all required assets are present in the correct location
    """
    import ple
    ple_path = os.path.dirname(ple.__file__)
    asset_path = os.path.join(ple_path, 'games', 'flappybird', 'assets')
    
    print(f"Asset path: {asset_path}")
    print(f"Asset directory exists: {os.path.exists(asset_path)}")
    
    if os.path.exists(asset_path):
        print("Contents of asset directory:")
        files = os.listdir(asset_path)
        print("\n".join(sorted(files)))
        
        # Check for all required variants
        bird_variants = ['blue', 'red', 'yellow']
        flap_types = ['upflap', 'midflap', 'downflap']
        
        for bird in bird_variants:
            for flap in flap_types:
                filename = f"{bird}bird-{flap}.png"
                exists = filename in files
                print(f"{filename}: {'✓' if exists else '✗'}")

In [210]:
def build_optimized_model(input_shape=(84, 84, 4)):
    """
    Optimized lightweight model architecture
    """
    input_img = Input(shape=input_shape)
    
    # Efficient CNN architecture
    x = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input_img)
    x = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(x)
    x = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
    x = GlobalAveragePooling2D()(x)
    
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(2, activation='linear')(x)
    
    model = Model(inputs=input_img, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0002), loss='huber')
    return model

In [211]:
class OptimizedDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=20000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99
        self.learning_rate = 0.0002
        self.batch_size = 64
        self.min_replay_size = 500  # Wait for more samples before training
        self.model = build_optimized_model(state_size)
        self.target_model = build_optimized_model(state_size)
        self.update_target_model()

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        state = np.expand_dims(state, axis=0)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        
        # Use numpy arrays for better performance
        minibatch = random.sample(self.memory, batch_size)
        states = np.array([exp[0] for exp in minibatch])
        actions = np.array([exp[1] for exp in minibatch])
        rewards = np.array([exp[2] for exp in minibatch])
        next_states = np.array([exp[3] for exp in minibatch])
        dones = np.array([exp[4] for exp in minibatch])
        
        # Batch predictions
        target_values = self.target_model.predict(next_states, batch_size=batch_size, verbose=0)
        targets = rewards + self.gamma * np.max(target_values, axis=1) * (1 - dones)
        
        target_f = self.model.predict(states, batch_size=batch_size, verbose=0)
        for i, action in enumerate(actions):
            target_f[i][action] = targets[i]
        
        self.model.fit(states, target_f, epochs=1, verbose=0, batch_size=batch_size)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [212]:
def process_state_optimized(state):
    """
    Optimized state preprocessing for 84x84 input
    """
    state = cv2.resize(state, (84, 84), interpolation=cv2.INTER_AREA)
    state = state.astype(np.float32) / 255.0
    gray = cv2.cvtColor((state * 255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
    gray = np.expand_dims(gray, axis=-1).astype(np.float32) / 255.0
    return np.concatenate([state, gray], axis=-1)

def train_agent_optimized(n_episodes=500, update_target_freq=2):
    """Optimized training loop for lightweight model"""
    # Environment setup
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env.init()
    
    # Initialize agent
    state_size = (84, 84, 4)
    action_size = len(env.getActionSet())
    agent = OptimizedDQNAgent(state_size, action_size)
    
    # Training metrics
    scores = []
    recent_scores = deque(maxlen=100)
    best_score = float('-inf')
    training_start_time = time.time()
    
    # Progress bar
    progress_bar = tqdm(range(n_episodes), desc='Training')
    
    for episode in progress_bar:
        env.reset_game()
        state = process_state_optimized(env.getScreenRGB())
        total_reward = 0
        steps = 0
        episode_start_time = time.time()
        
        while not env.game_over():
            action = agent.act(state)
            raw_reward = env.act(env.getActionSet()[action])
            next_state = process_state_optimized(env.getScreenRGB())
            done = env.game_over()
            
            # Calculate reward
            reward = calculate_optimized_reward(game.getGameState(), raw_reward, done, steps)
            
            # Store experience
            agent.remember(state, action, reward, next_state, done)
            
            # Train if we have enough samples
            if len(agent.memory) >= agent.min_replay_size:
                agent.replay(agent.batch_size)
            
            state = next_state
            total_reward += reward
            steps += 1
        
        # Update target network
        if episode % update_target_freq == 0:
            agent.update_target_model()
        
        # Update metrics
        scores.append(total_reward)
        recent_scores.append(total_reward)
        avg_score = np.mean(recent_scores)
        
        # Save best model
        if total_reward > best_score:
            best_score = total_reward
            agent.model.save('flappy_bird_best_model.keras')
        
        # Calculate speed metrics
        episode_time = time.time() - episode_start_time
        total_time = time.time() - training_start_time
        avg_episode_time = total_time / (episode + 1)
        
        # Update progress bar
        progress_bar.set_postfix({
            'Score': f'{total_reward:.1f}',
            'Avg': f'{avg_score:.1f}',
            'Best': f'{best_score:.1f}',
            'Steps': steps,
            'ε': f'{agent.epsilon:.2f}',
            'Time/ep': f'{episode_time:.1f}s'
        })
        
        # Save checkpoint every 50 episodes
        if episode % 50 == 0 and episode > 0:
            agent.model.save(f'flappy_bird_checkpoint_{episode}.keras')
           
        gc.collect()
        tf.keras.backend.clear_session()
    
    return agent, scores

def calculate_optimized_reward(game_state, raw_reward, done, steps):
    """Efficient reward calculation"""
    if done:
        return -5.0
    
    reward = 0.1  # Living reward
    
    # Quick calculations using numpy operations
    player_y = game_state['player_y']
    pipe_center_y = (game_state['next_pipe_top_y'] + game_state['next_pipe_bottom_y']) / 2
    vertical_distance = abs(player_y - pipe_center_y)
    pipe_distance = game_state['next_pipe_dist_to_player']
    
    # Vectorized reward calculation
    reward += np.interp(vertical_distance, [0, 25, 50], [2.0, 0.5, 0.0])
    reward += np.interp(pipe_distance, [0, 25, 50], [3.0, 1.0, 0.0])
    
    if raw_reward > 0:  # Passed through pipe
        reward += 10.0
    
    if steps > 100:  # Survival bonus
        reward *= 1.5
    
    return reward


In [213]:
def test_agent(agent, n_episodes=10):
    """Test the trained agent"""
    asset_path = setup_flappy_bird_environment()
    if asset_path is None:
        print("Failed to set up environment for testing. Exiting.")
        return
    
    try:
        game = FlappyBird()
        env = PLE(game, fps=30, display_screen=True)
        env.init()
    except Exception as e:
        print(f"Failed to initialize game for testing: {e}")
        return
    
    agent.epsilon = 0  # Pure exploitation during testing
    test_scores = []
    
    for episode in range(n_episodes):
        env.reset_game()
        state = preprocess_state(env.getScreenRGB())
        total_reward = 0
        done = False
        
        while not done:
            action = agent.act(state)
            reward = env.act(env.getActionSet()[action])
            state = preprocess_state(env.getScreenRGB())
            done = env.game_over()
            total_reward += reward
        
        test_scores.append(total_reward)
        print(f'Test Episode {episode + 1}: Total Reward = {total_reward}')
    
    print(f'Average Test Score: {np.mean(test_scores):.2f}')

In [214]:
# Debug cell - Run this before main execution
import os
from ple import PLE
import ple

# Print asset directory information
ple_path = os.path.dirname(ple.__file__)
asset_path = os.path.join(ple_path, 'games', 'flappybird', 'assets')
print(f"Asset path: {asset_path}")
print(f"Asset directory exists: {os.path.exists(asset_path)}")
if os.path.exists(asset_path):
    print("Contents of asset directory:")
    print(os.listdir(asset_path))

Asset path: /anaconda/envs/azureml_py38/lib/python3.10/site-packages/ple/games/flappybird/assets
Asset directory exists: True
Contents of asset directory:
['redbird-midflap.png', 'bluebird-midflap.png', 'background-day.png', 'redbird-upflap.png', 'base.png', 'yellowbird-downflap.png', 'pipe-red.png', 'background-night.png', 'pipe-green.png', 'bluebird-upflap.png', 'redbird-downflap.png', 'bluebird-downflap.png', 'yellowbird-upflap.png', 'yellowbird-midflap.png']


In [None]:
if __name__ == "__main__":
    trained_agent, training_scores = train_agent(n_episodes=500)
    if trained_agent is not None:
        test_agent(trained_agent, n_episodes=10)

  0%|          | 0/500 [00:00<?, ?it/s]

Episode: 1/500, Score: 19.70, Game Score: -5.0, Avg Score: 19.70, Epsilon: 1.000, Steps: 58, Best Score: 19.70


  0%|          | 2/500 [02:09<10:30:20, 75.95s/it]

Episode: 2/500, Score: 42.10, Game Score: -5.0, Avg Score: 30.90, Epsilon: 0.945, Steps: 62, Best Score: 42.10


  1%|          | 3/500 [04:04<12:56:18, 93.72s/it]

Episode: 3/500, Score: 20.20, Game Score: -5.0, Avg Score: 27.33, Epsilon: 0.896, Steps: 53, Best Score: 42.10


  1%|          | 4/500 [05:55<13:51:46, 100.62s/it]

Episode: 4/500, Score: 11.50, Game Score: -5.0, Avg Score: 23.38, Epsilon: 0.851, Steps: 51, Best Score: 42.10


  1%|          | 5/500 [07:28<13:26:10, 97.72s/it] 

Episode: 5/500, Score: 20.10, Game Score: -5.0, Avg Score: 22.72, Epsilon: 0.816, Steps: 42, Best Score: 42.10


  1%|          | 6/500 [09:34<14:44:24, 107.42s/it]

Episode: 6/500, Score: 10.90, Game Score: -5.0, Avg Score: 20.75, Epsilon: 0.772, Steps: 55, Best Score: 42.10


  1%|▏         | 7/500 [11:51<16:03:48, 117.30s/it]

Episode: 7/500, Score: 20.10, Game Score: -5.0, Avg Score: 20.66, Epsilon: 0.730, Steps: 57, Best Score: 42.10


  2%|▏         | 8/500 [13:39<15:35:30, 114.09s/it]

Episode: 8/500, Score: 18.00, Game Score: -5.0, Avg Score: 20.33, Epsilon: 0.697, Steps: 46, Best Score: 42.10


  2%|▏         | 9/500 [14:51<13:48:07, 101.20s/it]

Episode: 9/500, Score: 13.20, Game Score: -5.0, Avg Score: 19.53, Epsilon: 0.674, Steps: 33, Best Score: 42.10


  2%|▏         | 10/500 [16:34<13:50:46, 101.73s/it]

Episode: 10/500, Score: 22.10, Game Score: -5.0, Avg Score: 19.79, Epsilon: 0.643, Steps: 47, Best Score: 42.10
Episode: 11/500, Score: 17.00, Game Score: -5.0, Avg Score: 19.54, Epsilon: 0.614, Steps: 46, Best Score: 42.10


  2%|▏         | 12/500 [20:10<14:19:08, 105.63s/it]

Episode: 12/500, Score: 23.20, Game Score: -5.0, Avg Score: 19.84, Epsilon: 0.583, Steps: 53, Best Score: 42.10


  3%|▎         | 13/500 [22:06<14:41:41, 108.63s/it]

Episode: 13/500, Score: 19.20, Game Score: -5.0, Avg Score: 19.79, Epsilon: 0.553, Steps: 53, Best Score: 42.10


  3%|▎         | 14/500 [24:39<16:29:17, 122.14s/it]

Episode: 14/500, Score: 84.40, Game Score: -4.0, Avg Score: 24.41, Epsilon: 0.515, Steps: 70, Best Score: 84.40


  3%|▎         | 15/500 [26:41<16:27:22, 122.15s/it]

Episode: 15/500, Score: 23.00, Game Score: -5.0, Avg Score: 24.31, Epsilon: 0.487, Steps: 56, Best Score: 84.40
