In [1]:
# Note: the SPE strategy was initially added to the base strategies, then removed again.

In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from stable_baselines3 import SAC, PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
import time
import sys
from src.environments import ConPricingGame
import src.globals as gl
import src.classes as cl

In [2]:
class Iter_row:
    """Plain record describing one evaluation episode against a single adversary.

    `training` builds one instance per played episode and, for accepted agents,
    forwards the fields unchanged to `db.insert_new_iteration`.
    """

    def __init__(self, adv, agent_return, adv_return, rewards, adv_rewards, actions, prices, adv_prices, demands, adv_demands):
        # Adversary identifier and the total return of each side.
        self.adv, self.agent_return, self.adv_return = adv, agent_return, adv_return
        # Reward traces of the agent and of the adversary.
        self.rewards, self.adv_rewards = rewards, adv_rewards
        # Agent actions and the prices of both sides.
        self.actions, self.prices, self.adv_prices = actions, prices, adv_prices
        # Demand traces of the agent and of the adversary.
        self.demands, self.adv_demands = demands, adv_demands

In [3]:
def find_base_agent(db,alg, memory, cost, strategies, strategy_probs):
    """Return the name of an already-trained agent to warm-start a new training run.

    The strategies should belong to the same class of agents as the one being
    trained: when training a low-cost agent, pass the low-cost strategies.

    Search order:
      1. `strategies`, visited from the highest to the lowest `strategy_probs`;
         the first sb3-model strategy with the same `memory` whose `model` is
         `alg` is returned.
      2. Otherwise the agents table is queried for the newest row (highest id)
         with the same cost, memory and algorithm. This also finds trained
         agents that were never added to the game.

    Args:
        db: object exposing a `cursor` with `execute` / `fetchone`.
        alg: algorithm class, compared by identity with `strategy.model`.
        memory: memory setting the base agent must have.
        cost: cost value the base agent must have in the agents table.
        strategies: candidate strategies; not modified.
        strategy_probs: probabilities aligned with `strategies`; not modified.

    Returns:
        The agent name, or None when neither search finds a match.
    """
    strats = list(strategies)
    probs = list(strategy_probs)

    # Exchange sort into descending probability. It is kept (instead of a stable
    # `sorted`) on purpose: it decides which of several equally probable
    # strategies is visited first, and callers rely on that historical order.
    for i in range(len(probs) - 1):
        for j in range(i + 1, len(probs)):
            if probs[i] < probs[j]:
                strats[i], strats[j] = strats[j], strats[i]
                probs[i], probs[j] = probs[j], probs[i]

    for st in strats:
        if st.type == cl.StrategyType.sb3_model and memory == st.memory and (alg is st.model):
            return st.name

    # Values are bound as parameters rather than interpolated: str(alg) contains
    # single quotes and the old query depended on double-quoted string literals.
    # Only the table name (a project constant) is formatted into the SQL text.
    # NOTE(review): assumes a qmark-style DB-API cursor such as sqlite3 - confirm.
    query = (f'SELECT name FROM {cl.DataBase.AGENTS_TABLE} '
             'WHERE cost=? and memory=? and alg=? ORDER BY id DESC')
    db.cursor.execute(query, (cost, memory, str(alg)))
    row = db.cursor.fetchone()
    if row is not None:
        return row[0]
    return None

In [4]:
def _episode_record(env):
    """Snapshot the state `env` holds after an episode into an `Iter_row`."""
    return Iter_row(
        adv=env.adversary_strategy.name,
        agent_return=sum(env.profit[0]),
        adv_return=sum(env.profit[1]),
        rewards=str(env.profit[0]),
        adv_rewards=str(env.profit[1]),
        actions=str(env.actions),
        prices=str(env.prices[0]),
        adv_prices=str(env.prices[1]),
        demands=str(env.demand_potential[0]),
        adv_demands=str(env.demand_potential[1]),
    )


def _mean_payoffs_against(model_strategy, env, adversary, records=None):
    """Play `gl.NUM_STOCHASTIC_ITER` episodes against `adversary` and average the payoffs.

    When `records` is a list, one `Iter_row` per episode is appended to it,
    taken from `env` right after the episode has been played.
    Returns the per-player mean; index 0 is the agent, index 1 the adversary.
    """
    payoffs = []
    for _ in range(gl.NUM_STOCHASTIC_ITER):
        payoffs.append(model_strategy.play_against(env=env, adversary=adversary))
        if records is not None:
            records.append(_episode_record(env))
    return np.array(payoffs).mean(axis=0)


def training(db,base_agent, env_class, costs, adv_mixed_strategy, target_payoff, num_procs, alg, lr, memory):
    """Train one agent against a mixed adversary strategy and evaluate it.

    The agent is accepted when its expected payoff against `adv_mixed_strategy`
    is strictly greater than `target_payoff` (the equilibrium payoff the caller
    passes). Every trained agent is recorded through `db.insert_new_agent`;
    per-episode rows and payoffs against zero-probability adversaries are only
    produced for accepted agents.

    Args:
        db: database wrapper providing `AgentRow`, `insert_new_agent` and
            `insert_new_iteration`.
        base_agent: name of a saved model under `gl.MODELS_DIR` to continue
            training from, or None to start a fresh model.
        env_class: environment class; instantiated twice (training / evaluation).
        costs: cost pair passed to the environment; `costs[0]` is stored as the
            agent's cost.
        adv_mixed_strategy: adversary mixed strategy (`strategies`,
            `strategy_probs`, `support_size`).
        target_payoff: payoff the new agent has to beat.
        num_procs: only stored in the database; training uses a single env.
        alg: stable-baselines3 algorithm class (SAC gets `target_entropy=0`).
        lr: learning rate.
        memory: memory setting passed to the environment and the strategy.

    Returns:
        `[acceptable, agent_payoffs, adv_payoffs, model_strategy, expected_payoff]`.
        The payoff arrays have one entry per adversary strategy; entries of
        zero-probability adversaries stay 0 when the agent is not acceptable.

    Note:
        Reads the notebook-level global `job_name` to build the model name.
    """
    # Evaluation environment; kept separate from the one the model trains on.
    pricing_game = env_class(
        tuple_costs=costs, adversary_mixed_strategy=adv_mixed_strategy, memory=memory)

    # A timestamp makes the model name unique within the job.
    timestamp = int(time.time())
    model_name = f"{job_name}-{str(timestamp)}"
    models_dir = f"{gl.MODELS_DIR}/{model_name}"
    log_dir = f"{gl.LOG_DIR}/{model_name}"

    train_env = env_class(tuple_costs=costs, adversary_mixed_strategy=adv_mixed_strategy, memory=memory)

    # NOTE: the model is not seeded; 0 is only the value stored in the agents table.
    seed = 0

    model_kwargs = dict(learning_rate=lr, verbose=0,
                        tensorboard_log=log_dir, gamma=gl.GAMMA)
    if alg is SAC:
        model_kwargs["target_entropy"] = 0

    # The training budget grows with the size of the adversary's support.
    support_factor = 1 + gl.EPISODE_INCREASE_PORTION * (adv_mixed_strategy.support_size-1)
    if base_agent is None:
        number_episodes = gl.N_EPISODES_BASE * support_factor
        model = alg('MlpPolicy', train_env, **model_kwargs)
    else:
        number_episodes = gl.N_EPISODES_LOAD * support_factor
        base_agent_dir = f"{gl.MODELS_DIR}/{base_agent}"
        model = alg.load(base_agent_dir, train_env, **model_kwargs)

    start = time.time()
    model.learn(total_timesteps=number_episodes, tb_log_name=model_name)
    model.save(models_dir)
    running_time = time.time() - start

    strategies = adv_mixed_strategy.strategies
    agent_payoffs = np.zeros(len(strategies))
    adv_payoffs = np.zeros(len(strategies))
    expected_payoff = 0

    model_strategy = cl.Strategy(strategy_type=cl.StrategyType.sb3_model,
                                 model_or_func=alg, name=model_name, action_step=pricing_game.action_step,memory=memory)

    # Evaluate against the adversaries in the support, keeping one record per episode.
    iter_rows = []
    for strategy_index in range(len(strategies)):
        prob = adv_mixed_strategy.strategy_probs[strategy_index]
        if prob > 0:
            mean_payoffs = _mean_payoffs_against(
                model_strategy, pricing_game, strategies[strategy_index], records=iter_rows)
            agent_payoffs[strategy_index] = mean_payoffs[0]
            adv_payoffs[strategy_index] = mean_payoffs[1]
            expected_payoff += agent_payoffs[strategy_index] * prob

    acceptable = (expected_payoff > target_payoff)
    agent_id = db.insert_new_agent(db.AgentRow(model_name, base_agent, number_episodes, costs[0], str(
        adv_mixed_strategy), expected_payoff, target_payoff,  str(alg),lr, memory, acceptable, pricing_game.action_step, seed, num_procs, running_time))

    if acceptable:
        for row in iter_rows:
            db.insert_new_iteration(agent_id, row.adv, row.agent_return, row.adv_return, row.rewards,
                                    row.adv_rewards, row.actions, row.prices, row.adv_prices, row.demands, row.adv_demands)
        # The agent will be added to the payoff matrix, so the payoffs against
        # the adversaries outside the support are needed as well.
        for strategy_index in range(len(strategies)):
            if adv_mixed_strategy.strategy_probs[strategy_index] == 0:
                mean_payoffs = _mean_payoffs_against(
                    model_strategy, pricing_game, strategies[strategy_index])
                agent_payoffs[strategy_index] = mean_payoffs[0]
                adv_payoffs[strategy_index] = mean_payoffs[1]

    # bool(): the comparison can yield a numpy bool; callers get a plain bool.
    return [bool(acceptable), agent_payoffs, adv_payoffs, model_strategy, expected_payoff]


In [None]:
# Driver configuration: environment class, job/database names and the
# hyper-parameter grid that the training loop below iterates over.
env_class = ConPricingGame
gl.initialize()

# Number of outer rounds of the training loop.
num_rounds = 50

# job_name is also read as a global by training() to name the saved models.
job_name = "rnd_Feb5"
db_name = job_name+".db"
db = cl.DataBase(db_name)
# Strategies already recorded as added in the database; they seed the bimatrix game below.
low_strts, high_strts=db.get_list_of_added_strategies()
cl.set_job_name(job_name)
# num_procs = gl.NUM_PROCESS if (len(sys.argv) < 2) else int(sys.argv[1])
# Passed to training(), which only stores it in the agents table.
num_procs = 7



# Hyper-parameter grid: every (alg, lr, memory) combination is trained per equilibrium.
lrs = [0.0003, 0.00016]
memories = [12,18]
# memories_agents=[[None]*len(memories)]*2
algs = [SAC,PPO]

# Equilibria for which at least one new agent was accepted (appended in the training loop).
equilibria = []

cl.create_directories()

# strt1 = cl.Strategy(
#     cl.StrategyType.static, model_or_func=cl.myopic, name="myopic")
# strt2 = cl.Strategy(
#     cl.StrategyType.static, model_or_func=cl.const, name="const", first_price=132)
# strt3 = cl.Strategy(
#     cl.StrategyType.static, model_or_func=cl.guess, name="guess", first_price=132)
# strt4 = cl.Strategy(
#     cl.StrategyType.static, model_or_func=cl.spe, name="spe")

# Create an untrained SAC model (learn() is left commented out) and save it as
# "rnd_start" so it can be loaded by name like any trained agent.
train_env = env_class(tuple_costs=None, adversary_mixed_strategy=None, memory=12)
model_name="rnd_start"
log_dir = f"{gl.LOG_DIR}/{model_name}"
model = SAC('MlpPolicy', train_env,
                        verbose=0, tensorboard_log=log_dir, gamma=gl.GAMMA, target_entropy=0)
# model.learn(total_timesteps=1, tb_log_name=model_name)
model.save(f"{gl.MODELS_DIR}/{model_name}")

# Strategy wrapper around the saved model; memory matches the env it was created with.
strt_rnd= cl.Strategy(strategy_type=cl.StrategyType.sb3_model,
                                 model_or_func=SAC, name=model_name, action_step=None,memory=12)

# The same random-start strategy is the first entry for both players, followed
# by the strategies loaded from the database.
bimatrix_game = cl.BimatrixGame(
    low_cost_strategies=[strt_rnd]+low_strts, high_cost_strategies=[strt_rnd]+high_strts, env_class=env_class)

bimatrix_game.reset_matrix()
bimatrix_game.fill_matrix()



# Log a timestamped separator before the first equilibrium computation.
cl.prt("\n" + time.ctime(time.time())+"\n"+("-"*50)+"\n")

# Equilibria of the initial game; the training loop indexes each entry by keys such as
# "low_cost_probs", "high_cost_probs", "low_cost_payoff" and "high_cost_payoff".
dictionaries = bimatrix_game.compute_equilibria()

# low_cost_probabilities, high_cost_probabilities, low_cost_payoff, high_cost_payoff = bimatrix_game.compute_equilibria()
def _pad_probs(probs, n_added):
    """Append one zero per strategy added since the equilibria were computed.

    The equilibrium probabilities were computed before this round's agents were
    appended to the strategy lists, so they are padded to the current length.
    Returns `probs` itself when nothing was added.
    Assumes `probs` is a plain list (list concatenation) - TODO confirm.
    """
    return (probs + ([0] * n_added)) if n_added > 0 else probs


# Main loop: for every equilibrium of the current game, try to train low-cost
# and high-cost agents that beat the equilibrium payoff, add the accepted ones
# to the game and recompute the equilibria after each productive round.
# (`round_idx` instead of `round` so the builtin is not shadowed in the kernel.)
for round_idx in range(num_rounds):
    cl.prt(f"Round {round_idx} of {num_rounds}")

    # Agents accepted in this round, across all equilibria.
    added_low = 0
    added_high = 0
    for equi in dictionaries:
        # Agents accepted for this particular equilibrium.
        new_equi_low = 0
        new_equi_high = 0
        cl.prt(
            f'equi: {str(equi["low_cost_support"])}, {str(equi["high_cost_support"])}\n payoffs= {equi["low_cost_payoff"]:.2f}, {equi["high_cost_payoff"]:.2f}')

        # ---- low-cost agents, trained against the high-cost equilibrium mix ----
        high_mixed_strat = cl.MixedStrategy(
            strategies_lst=bimatrix_game.high_strategies, probablities_lst=_pad_probs(equi["high_cost_probs"], added_high))

        for alg in algs:
            for lr in lrs:
                for memory in memories:
                    # Padding is recomputed every iteration: added_low grows as agents are accepted.
                    base_agent = find_base_agent(db=db, memory=memory, alg=alg, cost=gl.LOW_COST,
                                                 strategies=bimatrix_game.low_strategies,
                                                 strategy_probs=_pad_probs(equi["low_cost_probs"], added_low))
                    # {lr:g} keeps small rates readable; .4f printed 0.00016 as 0.0002.
                    print(f'training low-cost player with base={base_agent} ,alg={str(alg)}, lr={lr:g}, memory={memory}')

                    [acceptable, agent_payoffs, adv_payoffs, agent_strategy, expected_payoff] = training(
                        db=db, base_agent=base_agent, env_class=env_class, costs=[gl.LOW_COST, gl.HIGH_COST],
                        adv_mixed_strategy=high_mixed_strat, target_payoff=equi["low_cost_payoff"],
                        num_procs=num_procs, alg=alg, lr=lr, memory=memory)
                    if acceptable:
                        new_equi_low += 1
                        added_low += 1

                        bimatrix_game.low_strategies.append(agent_strategy)
                        bimatrix_game.add_low_cost_row(agent_payoffs, adv_payoffs)

                        cl.prt(
                            f'low-cost player {agent_strategy.name} , payoff= {expected_payoff:.2f} added, base={base_agent} ,alg={str(alg)}, lr={lr:g}, memory={memory}')

        # ---- high-cost agents, trained against the low-cost equilibrium mix ----
        # Built after the low-cost phase so the padding covers the agents just added.
        low_mixed_strat = cl.MixedStrategy(
            strategies_lst=bimatrix_game.low_strategies, probablities_lst=_pad_probs(equi["low_cost_probs"], added_low))

        for alg in algs:
            for lr in lrs:
                for memory in memories:
                    base_agent = find_base_agent(db=db, cost=gl.HIGH_COST, memory=memory,
                                                 strategies=bimatrix_game.high_strategies,
                                                 strategy_probs=_pad_probs(equi["high_cost_probs"], added_high),
                                                 alg=alg)
                    print(f'training high-cost player with base={base_agent} ,alg={str(alg)}, lr={lr:g}, memory={memory}')
                    [acceptable, agent_payoffs, adv_payoffs, agent_strategy, expected_payoff] = training(
                        db=db, base_agent=base_agent, env_class=env_class, costs=[gl.HIGH_COST, gl.LOW_COST],
                        adv_mixed_strategy=low_mixed_strat, target_payoff=equi["high_cost_payoff"],
                        num_procs=num_procs, alg=alg, memory=memory, lr=lr)
                    if acceptable:
                        new_equi_high += 1
                        added_high += 1
                        bimatrix_game.high_strategies.append(agent_strategy)
                        # Argument order is (low-cost payoffs, high-cost payoffs): the adversary is the low-cost player here.
                        bimatrix_game.add_high_cost_col(adv_payoffs, agent_payoffs)

                        cl.prt(
                            f'high-cost player {agent_strategy.name} , payoff= {expected_payoff:.2f} added, base={base_agent}, alg={str(alg)}, lr={lr:g}, memory={memory}')

        # Remember the equilibria that produced at least one accepted agent.
        if new_equi_low > 0 or new_equi_high > 0:
            equilibria.append(
                [equi["low_cost_probs"], equi["high_cost_probs"], equi["low_cost_payoff"], equi["high_cost_payoff"]])

    if added_low == 0 and added_high == 0:
        # Nothing was accepted: give the next round a 10% larger training budget.
        gl.N_EPISODES_BASE *= 1.1
        gl.N_EPISODES_LOAD *= 1.1
    else:
        # The game changed: recompute the equilibria for the next round.
        dictionaries = bimatrix_game.compute_equilibria()

training low-cost player with base=rnd_start ,alg=<class 'stable_baselines3.sac.sac.SAC'>, lr=0.0003, memory=12
training low-cost player with base=None ,alg=<class 'stable_baselines3.sac.sac.SAC'>, lr=0.0003, memory=18
training low-cost player with base=rnd_start ,alg=<class 'stable_baselines3.sac.sac.SAC'>, lr=0.0002, memory=12
training low-cost player with base=rnd_Feb5-1707144365 ,alg=<class 'stable_baselines3.sac.sac.SAC'>, lr=0.0002, memory=18
training low-cost player with base=None ,alg=<class 'stable_baselines3.ppo.ppo.PPO'>, lr=0.0003, memory=12
training low-cost player with base=None ,alg=<class 'stable_baselines3.ppo.ppo.PPO'>, lr=0.0003, memory=18
training low-cost player with base=rnd_Feb5-1707205022 ,alg=<class 'stable_baselines3.ppo.ppo.PPO'>, lr=0.0002, memory=12
training low-cost player with base=rnd_Feb5-1707206605 ,alg=<class 'stable_baselines3.ppo.ppo.PPO'>, lr=0.0002, memory=18
training high-cost player with base=rnd_start ,alg=<class 'stable_baselines3.sac.sac.SAC'

In [None]:
# env=ConPricingGame(tuple_costs=[57,71], adversary_mixed_strategy= (cl.Strategy(
#     cl.StrategyType.static, model_or_func=cl.myopic, name="myopic")).to_mixed_strategy(), memory=3)

# policy = (PPO.load("models/"+"NOV24-1700860722", env=env)).predict