In [1]:
import import_ipynb
import stage
import numpy as np
from IPython.display import clear_output
from sty import fg, bg, ef, rs
import random
import matplotlib.pyplot as plt
import copy

importing Jupyter notebook from stage.ipynb


## Play Randomly

In [2]:
class Play_random:
    """Baseline controller that moves every agent with uniformly random actions."""

    def __init__(self,env: stage.Environment):
        self.env = env

    def play(self, limit = 500):
        """Run one episode of random moves and return the rendered frames.

        Agents take turns in round-robin order, one move per loop iteration,
        and one frame is rendered after every move. The loop stops when the
        environment reports it is done or once more than ``limit`` moves have
        been made.

        Args:
            limit: Move budget; the episode is cut off after ``limit + 1`` moves.

        Returns:
            list: Whatever ``env.render()`` returned after each move, in order.
        """
        world = self.env
        world.reset()
        n_agents = len(self.env.agents)

        rendered = []
        moves_made = 0
        while not world.isDone():
            # Round-robin: the move counter picks whose turn it is.
            mover = moves_made % n_agents

            # The returned state is not used for a random policy; the call is
            # kept in case the environment relies on it -- TODO confirm.
            world.getState()

            # randint is inclusive on both ends: five possible actions (0..4).
            choice = random.randint(0, 4)
            moves_made += 1

            world.nextStep(mover, choice)
            rendered.append(world.render())

            if moves_made > limit:
                break
        return rendered

    def show(self):
        """Play one random episode with the default limit and display its frames."""
        stage.print_frames(self.play())
        
    

## Q-table

In [3]:
class Q_table:
    """Lazily populated tabular Q-function.

    Rows are created on first access: every accessor (including ``setVal``)
    makes sure the row for ``state`` exists before touching it, so callers
    never need to register states up front.

    Attributes are plain Python objects:
      * ``table``      -- dict mapping a state to a 1-D numpy array of
                          ``actionSize`` Q-values.
      * ``actionSize`` -- number of discrete actions per state.

    ``state`` is used as a dict key and therefore must be hashable.
    """

    def __init__(self, actionSize: int):
        self.table = {}
        self.actionSize = actionSize

    def add(self, state):
        """Create the row for ``state`` if it does not exist yet.

        New rows get small random values (standard normal scaled by 0.001)
        drawn from numpy's global RNG. Existing rows are left untouched.
        """
        if state not in self.table:
            self.table[state] = np.random.randn(self.actionSize)*0.001

    def _row(self, state):
        """Return the Q-value array for ``state``, creating it on demand."""
        self.add(state)
        return self.table[state]

    def get(self, state, action):
        """Return the Q-value stored for ``(state, action)``."""
        return self._row(state)[action]

    def getMax(self, state):
        """Return the index of the highest-valued action in ``state``."""
        return np.argmax(self._row(state))

    def getMin(self, state):
        """Return the index of the lowest-valued action in ``state``."""
        return np.argmin(self._row(state))

    def maxVal(self, state):
        """Return the highest Q-value available in ``state``."""
        return np.max(self._row(state))

    def setVal(self, state, action, set_as):
        """Overwrite the Q-value for ``(state, action)`` with ``set_as``.

        The row is created first if ``state`` has not been seen, so writing
        to a fresh state no longer raises ``KeyError``.
        """
        self._row(state)[action] = set_as
    

## Q-learning

In [4]:
class ClassicQlearning:
    """Independent tabular Q-learning with one Q-table per agent.

    Each agent learns on its own table, keyed by the value returned from
    that agent's ``getPos()``; agents do not see each other's state.
    """

    def __init__(self,env: stage.Environment):
        self.env = env
        self.agent_n = len(env.agents)
        # One private table per agent, each with 5 discrete actions.
        self.q_tables = [Q_table(5) for _ in range(self.agent_n)]

    def _choose_action(self, q_table, state, epsilon):
        """Epsilon-greedy choice: random action with probability ``epsilon``, else the table's argmax."""
        if random.uniform(0, 1) < epsilon:
            # randint is inclusive on both ends, hence the -1.
            return random.randint(0, q_table.actionSize - 1) # Explore
        return q_table.getMax(state) # Exploit learned values

    def train(self, alpha = 0.1, gamma = 0.6, epsilon = 0.1, epochs = 1_000, cap = 2_000):
        """Train every agent's Q-table for ``epochs`` episodes.

        In each round every agent moves once (epsilon-greedy) and immediately
        applies the one-step Q-learning update to its own table.

        Args:
            alpha: Learning rate of the Q-update.
            gamma: Discount applied to the best next-state value.
            epsilon: Probability of taking a random action.
            epochs: Number of episodes to run (all ``epochs`` of them).
            cap: Maximum number of rounds per episode; a round is one move
                by every agent.

        Prints the average episode length over each window of 100 episodes
        and a final "Training Finished!" line.
        """
        env = self.env
        env.reset()
        report_every = 100
        totalTime = 0
        # Episodes are numbered 1..epochs inclusive so that exactly `epochs`
        # episodes run and the last reporting window is printed as well.
        for i in range(1, epochs + 1):
            env.reset()
            time = 0

            while not env.isDone():

                time += 1

                if time > cap:
                    break

                # Act and learn, one agent at a time.
                for agent in range(self.agent_n):

                    state = env.agents[agent].getPos()
                    q_table = self.q_tables[agent]

                    action = self._choose_action(q_table, state, epsilon)

                    env.nextStep(agent, action)

                    reward = env.reward(agent)

                    next_state = env.agents[agent].getPos()
                    old_value = q_table.get(state, action)

                    next_max = q_table.maxVal(next_state)

                    # Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
                    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * (next_max))

                    q_table.setVal(state, action, new_value)

            totalTime += time

            if i % report_every == 0:
                clear_output(wait=True)
                print(f"Episode: {i}")
                print(f"Time: {totalTime/report_every}")
                totalTime = 0

        print("Training Finished!")

    def play(self, cap = 2_000, epsilon = 0.1):
        """Run one episode with the learned tables and return the rendered frames.

        Every agent moves once per round (epsilon-greedy, no learning) and one
        frame is rendered after each round.

        Args:
            cap: Budget of individual agent moves; the episode is cut off at
                the end of the round in which the count exceeds ``cap``.
            epsilon: Probability of taking a random action.

        Returns:
            list: Whatever ``env.render()`` returned after each round, in order.
        """
        env = self.env
        frames = []
        time = 0
        env.reset()
        while not env.isDone():
            for agent in range(self.agent_n):
                # Count every agent move; without this `cap` could never
                # trigger and a never-finishing episode would loop forever.
                time += 1

                state = env.agents[agent].getPos()
                q_table = self.q_tables[agent]

                action = self._choose_action(q_table, state, epsilon)

                env.nextStep(agent, action)

            frames.append(env.render())

            if time > cap:
                break
        return frames

    def show(self, cap = 2_000, epsilon = 0.1):
        """Play one episode via ``play(cap, epsilon)`` and display its frames."""
        frames = self.play(cap, epsilon)
        stage.print_frames(frames)

## Mean-field RL

In [5]:
class MeanField:
    """Tabular multi-agent learner whose update target is averaged over all agents.

    Each agent owns one Q-table keyed by the value of ``env.getState()``
    (which must therefore be hashable). After every round, all agents are
    updated towards ``reward + gamma * mean``, where ``mean`` is the average,
    over agents, of the best Q-value each agent sees right after its own move.
    """

    def __init__(self,env: stage.Environment):
        self.env = env
        self.agent_n = len(env.agents)
        # One private table per agent, each with 5 discrete actions.
        self.q_tables = [Q_table(5) for _ in range(self.agent_n)]

    def _choose_action(self, q_table, state, epsilon):
        """Epsilon-greedy choice: random action with probability ``epsilon``, else the table's argmax."""
        if random.uniform(0, 1) < epsilon:
            # randint is inclusive on both ends, hence the -1.
            return random.randint(0, q_table.actionSize - 1) # Explore
        return q_table.getMax(state) # Exploit learned values

    def train(self, alpha = 0.1, gamma = 0.6, epsilon = 0.1, epochs = 1_000, cap = 2_000):
        """Train every agent's Q-table for ``epochs`` episodes.

        A round has two phases: first every agent moves once (epsilon-greedy)
        while the best next-state values are accumulated; then every agent's
        table is updated towards its own reward plus the discounted average
        of those values.

        Args:
            alpha: Learning rate of the Q-update.
            gamma: Discount applied to the averaged next-state value.
            epsilon: Probability of taking a random action.
            epochs: Number of episodes to run (all ``epochs`` of them).
            cap: Maximum number of rounds per episode; a round is one move
                by every agent.

        Prints the average episode length over each window of 100 episodes
        and a final "Training Finished!" line.
        """
        env = self.env
        env.reset()
        report_every = 100
        totalTime = 0
        # Episodes are numbered 1..epochs inclusive so that exactly `epochs`
        # episodes run and the last reporting window is printed as well.
        for i in range(1, epochs + 1):
            env.reset()
            time = 0

            while not env.isDone():

                time += 1

                if time > cap:
                    break

                # Act phase: every agent moves once; remember what it did.
                states_action = []
                meanVal = 0
                for agent in range(self.agent_n):

                    state = env.getState()
                    q_table = self.q_tables[agent]

                    action = self._choose_action(q_table, state, epsilon)

                    env.nextStep(agent, action)
                    states_action.append((state, action))

                    # Accumulate this agent's best value in the state seen
                    # right after its own move (before later agents act).
                    next_state = env.getState()
                    meanVal += q_table.maxVal(next_state)

                # Update phase: rewards are read after all agents have moved.
                for agent in range(self.agent_n):

                    q_table = self.q_tables[agent]
                    state, action = states_action[agent]

                    reward = env.reward(agent)

                    old_value = q_table.get(state, action)

                    # Same form as the Q-learning update, but the bootstrap
                    # term is the cross-agent average instead of the agent's own max.
                    new_value = (1 - alpha) * old_value + alpha * (reward + gamma * (meanVal/self.agent_n))

                    q_table.setVal(state, action, new_value)

            totalTime += time

            if i % report_every == 0:
                clear_output(wait=True)
                print(f"Episode: {i}")
                print(f"Time: {totalTime/report_every}")
                totalTime = 0

        print("Training Finished!")

    def play(self, cap = 2_000, epsilon = 0.1):
        """Run one episode with the learned tables and return the rendered frames.

        Every agent moves once per round (epsilon-greedy, no learning) and one
        frame is rendered after each round.

        Args:
            cap: Budget of individual agent moves (not rounds); the episode is
                cut off at the end of the round in which the count exceeds ``cap``.
            epsilon: Probability of taking a random action.

        Returns:
            list: Whatever ``env.render()`` returned after each round, in order.
        """
        env = self.env
        frames = []
        time = 0
        env.reset()
        while not env.isDone():
            for agent in range(self.agent_n):
                # One tick per agent move, so `cap` limits moves rather than rounds.
                time +=1

                state = env.getState()
                q_table = self.q_tables[agent]

                action = self._choose_action(q_table, state, epsilon)

                env.nextStep(agent, action)

            frames.append(env.render())

            if time > cap:
                break
        return frames

    def show(self, cap = 2_000, epsilon = 0.1):
        """Play one episode via ``play(cap, epsilon)`` and display its frames."""
        frames = self.play(cap, epsilon)
        stage.print_frames(frames)

Reference: https://www.ijcai.org/proceedings/2021/0070.pdf