In [1]:
import numpy as np

In [12]:
num_states = 3
num_actions_per_player = 2
num_actions = num_actions_per_player**2
num_trans = 3
reward_range = [-1,1]

def generate_random_trans_and_rewards():
    trans_prob_matrices = []
    reward_matrices = []
    for _ in range(num_trans):
        trans_prob_matrix = []
        reward_matrix = []
        for s in range(num_states):
            trans_prob_matrix_for_s = []
            reward_matrix_for_s = []
            for a in range(num_actions):
                rands = np.random.uniform(0,1, num_states)
                rand_probs = list(rands/sum(rands))
                trans_prob_matrix_for_s.append(rand_probs)
                rs = np.random.uniform(*reward_range, num_states)
                reward_matrix_for_s.append(list(rs))
            trans_prob_matrix.append(trans_prob_matrix_for_s)
            reward_matrix.append(reward_matrix_for_s)
        trans_prob_matrices.append(trans_prob_matrix)
        reward_matrices.append(reward_matrix)
    return trans_prob_matrices, reward_matrices

tpmxs, rmxs =  generate_random_trans_and_rewards()  # shape: (trans, state, action, next_state)
print(np.array(tpmxs).shape, np.array(rmxs).shape, rmxs) # shape: (trans, state, action, next_state)


(3, 3, 4, 3) (3, 3, 4, 3) [[[[0.3467567654286501, -0.34091283442462816, -0.5789680111630586], [0.4835608572436221, 0.08290562160040937, 0.5025827055969319], [0.38146084179130724, -0.15388039953402854, 0.10246282709099175], [0.20623516832905553, -0.34476867760623575, 0.3794605769365267]], [[-0.5979937587756445, 0.43640003504477765, 0.2033118157104774], [0.6338579952749988, 0.35350868438074823, -0.9407164825729974], [-0.1315120446226501, -0.45244028123615143, -0.6979882926485159], [-0.3410059200264419, -0.9041320063903293, -0.8310812261136384]], [[-0.289745139773828, -0.6141492486106037, -0.09275754456588636], [-0.0065158426337410535, -0.5322776810312382, 0.5476167508939815], [-0.4227775166883523, 0.3381724281960061, 0.2401423898702051], [0.9242131831849734, -0.8896340899336708, 0.3982982530946184]]], [[[0.08763971914667645, -0.773980333283455, -0.8815137096112955], [0.4013720108808865, 0.4367411122368767, -0.9267067624759577], [-0.9501087549126122, 0.5995285838353452, 0.4173926450987559

In [63]:
import ecos
from scipy.sparse import csr_matrix

def NashEquilibriumECOSSolver(M):
    """
    https://github.com/embotech/ecos-python
    min  c*x
    s.t. A*x = b
         G*x <= h
    https://github.com/embotech/ecos/wiki/Usage-from-MATLAB
    args: 
        c,b,h: numpy.array
        A, G: Scipy sparse matrix
    """
    row, col = M.shape
    c = np.zeros(row+1)
    # max z
    c[-1] = -1  
    
    # x1+x2+...+xn=1
    A = np.ones(row+1)
    A[-1] = 0.
    A = csr_matrix([A])
    b=np.array([1.])
    
    # M.T*x<=z
    G1 = np.ones((col, row+1))
    G1[:col, :row] = -1. * M.T
    # x>=0
    G2 = np.zeros((row, row+1))
    for i in range(row):
        G2[i, i]=-1. 
    # x<=1.
    G3 = np.zeros((row, row+1))
    for i in range(row):
        G3[i, i]=1. 
    G = csr_matrix(np.concatenate((G1, G2, G3)))
    h = np.concatenate((np.zeros(2*row), np.ones(row)))
    
    # specify number of variables
    dims={'l': col+2*row, 'q': []}
                       
    solution = ecos.solve(c,G,h,dims,A,b, verbose=False)

    p1_value = solution['x'][:row]
    p2_value = solution['z'][:col] # z is the dual variable of x
    # There are at least two bad cases with above constrained optimization,
    # where the constraints are not fully satisfied (some numerical issue):
    # 1. the sum of vars is larger than 1.
    # 2. the value of var may be negative.
    abs_p1_value = np.abs(p1_value)
    abs_p2_value = np.abs(p2_value)
    p1_value = abs_p1_value/np.sum(abs_p1_value)
    p2_value = abs_p2_value/np.sum(abs_p2_value)

    return (p1_value, p2_value)

In [75]:
import copy
import pandas

class ArbitraryMDP():
    def __init__(self, num_states=3, num_actions_per_player=2, num_trans=3):
        self.num_states = num_states
        self.num_actions_per_player = num_actions_per_player
        self.num_actions = self.num_actions_per_player**2
        self.num_trans = num_trans
        self.reward_range = [-1,1]
        self.state = None
        self._construct_game()

    def _construct_game(self, ):
        self.trans_prob_matrices, self.reward_matrices = self.generate_random_trans_and_rewards()

    def generate_random_trans_and_rewards(self,):
        trans_prob_matrices = []
        reward_matrices = []
        for _ in range(self.num_trans):
            trans_prob_matrix = []
            reward_matrix = []
            for s in range(self.num_states):
                trans_prob_matrix_for_s = []
                reward_matrix_for_s = []
                for a in range(self.num_actions):
                    rands = np.random.uniform(0,1, self.num_states)
                    rand_probs = list(rands/sum(rands))
                    trans_prob_matrix_for_s.append(rand_probs)
                    rs = np.random.uniform(*self.reward_range, self.num_states)
                    reward_matrix_for_s.append(list(rs))
                trans_prob_matrix.append(trans_prob_matrix_for_s)
                reward_matrix.append(reward_matrix_for_s)
            trans_prob_matrices.append(trans_prob_matrix)
            reward_matrices.append(reward_matrix)

        return trans_prob_matrices, reward_matrices

    def reset(self, ):
        self.state = np.random.randint(0, self.num_states)  # randomly pick one state as initial
        self.trans = 0
        obs = self.state
        return obs

    def step(self, a):
        trans_prob = self.trans_prob_matrices[self.trans][self.state][a]
        next_state = np.random.choice([i for i in range(self.num_states)], p=trans_prob)
        self.state = next_state
        obs = self.state
        reward = self.reward_matrices[self.trans][self.state][a][next_state]
        self.trans += 1
        done = False if self.trans < self.num_trans else True

        return obs, reward, done, None

    def NEsolver(self,):
        self.Nash_v = []
        for tm, rm in zip(self.trans_prob_matrices[::-1], self.reward_matrices[::-1]): # inverse enumerate 
            if len(self.Nash_v) > 0:
                rm = np.array(rm)+np.array(self.Nash_v[-1])  # broadcast sum on rm's last dim, last one in Nash_v is for the next state
            trm = np.einsum("ijk,ijk->ij", tm, rm)  # transition prob * reward for the last dimension in (state, action, next_state)
            trm = trm.reshape(-1, self.num_actions_per_player, self.num_actions_per_player) # action list to matrix
            ne_values = []
            for s_payoff in trm:
                ne = NashEquilibriumECOSSolver(s_payoff)
                ne_value = ne[0]@s_payoff@ne[1].T
                ne_values.append(ne_value)  # each value is a Nash equilibrium value on one state
            self.Nash_v.append(ne_values)  # (trans, state)
        print('Nash values of all states: ', self.Nash_v)

env = ArbitraryMDP()
env.NEsolver()
obs = env.reset()
print(obs)    

for _ in range(env.num_trans+1):
    o,r,d,_ = env.step(1)
    print(o,r,d)
    if d:
        break


0
1
2
                                                   0  \
0  [[0.9492277496511203, -0.7230850668264706, 0.4...   
1  [[0.27079133106918807, -0.06955473452313932, 0...   
2  [[-0.20057271773597818, -0.5008545145233938, 0...   

                                                   1  \
0  [[0.017306001658161874, 0.7543340801951506, 0....   
1  [[0.842845663847265, -0.9504237847419348, -0.7...   
2  [[-0.5490177394287643, 0.7215394473492391, 0.8...   

                                                   2  
0  [[0.09268065392861335, -0.710504797377669, 0.9...  
1  [[0.35157506192187826, -0.24766323322395367, 0...  
2  [[0.06404206096902332, 0.49747060543367594, -0...  
Nash values of all states:  [[0.6299419917107387, -0.09736993972611034, -0.30655330721599117], [0.3141539927602728, -0.36649219268142685, 0.2868228665333546], [-0.012930308050426989, 0.6033907736557464, 0.4710433901943522]]
1
0 0.36818102372533024 False
2 0.9833582438622486 False
2 0.4609506516912787 True


  warn("Converting G to a CSC matrix; may take a while.")
  warn("Converting A to a CSC matrix; may take a while.")


In [69]:
a=np.array([[1,2], [3,4]])
b=np.array([1,1])
a+b

array([[2, 3],
       [4, 5]])

In [22]:
np.arange(2)

array([0, 1])