In [718]:
import numpy as np
import pandas as pd
np.random.seed(42)

In [719]:
def createtransactiondata(shape, number):
    """
    Artificial Transactions Data Creator.

    Builds a (number, shape, shape + 1) numpy array of transition probabilities.
    The first index selects a person, the second the state the person is
    currently in, and the third the state the person moves to. The last
    column (index ``shape``) is the end state.
    Every row is a probability distribution, so each row sums to one.
    For example data[0][0][5] is the probability of person 0 going from
    state 1 to state 6.

    Each row only has non-zero entries in a narrow band around the diagonal.
    The band weights are drawn as ``randint(low, high) + rand()`` percent:
    when they add up to more than 1 they are rescaled to sum to exactly 1,
    when they add up to less than 1 the remainder is assigned to the end state.

    Random numbers are drawn from the global ``np.random`` state in the same
    order for every row, so results are reproducible with ``np.random.seed``.

    Args:
        shape: Number of states. Must be at least 4; for smaller values the
            band would spill into the end-state column or past the array.
        number: Number of persons. Zero (or a negative value) gives an empty
            array with ``number`` axis of length 0.

    Return:
        numpy array (number, shape, shape + 1)

    Raises:
        ValueError: if ``shape`` is smaller than 4.
    """
    if shape < 4:
        raise ValueError("shape must be at least 4, got {}".format(shape))

    # Percent ranges (low, high) of the band weights, per kind of row.
    first_row = [(75, 94), (16, 25), (9, 15)]
    second_row = [(14, 22), (60, 90), (14, 22), (2, 10)]
    next_to_last_row = [(1, 10), (9, 13), (16, 19), (35, 50)]
    last_row = [(4, 13), (13, 19), (30, 40)]
    middle_row = [(1, 7), (7, 23), (60, 94), (7, 23), (1, 7)]

    # Allocate the whole result once instead of stacking person by person.
    transaction_data = np.zeros((max(number, 0), shape, shape + 1))
    for human in transaction_data:
        for k in range(shape):
            if k == 0:
                ranges = first_row
            elif k == 1:
                ranges = second_row
            elif k == shape - 2:
                ranges = next_to_last_row
            elif k == shape - 1:
                ranges = last_row
            else:
                ranges = middle_row

            # randint is evaluated before rand for each pair, which keeps the
            # random stream identical to drawing the weights one at a time.
            weights = np.array([np.random.randint(low, high) + np.random.rand()
                                for low, high in ranges])
            weights /= 100

            total = weights.sum()
            if total > 1:
                # Remove the surplus proportionally so the band sums to 1.
                weights -= weights * (total - 1) / total
            elif total < 1:
                # Whatever probability is left over leads to the end state.
                human[k, shape] = 1 - total

            # Rows 0 and 1 start at column 0; every other row starts two
            # columns left of the diagonal.
            start = 0 if k < 2 else k - 2
            human[k, start:start + len(weights)] = weights
    return transaction_data


For illustration, the NumPy array is displayed as a pandas DataFrame.
The example data is created for 7 playing states and 1 end state, for one person.
Rows and columns represent states. The last column represents the end state, so it does not appear in the index.

In [720]:
# Demo parameters: 7 playing states, a single simulated person.
shape = 7
number = 1
# Show the first person's transition matrix; the extra column is the end state.
pd.DataFrame(
    createtransactiondata(shape=shape, number=number)[0],
    columns=list(range(1, shape + 2)),
    index=list(range(1, shape + 1)),
)

Unnamed: 0,1,2,3,4,5,6,7,8
1,0.707881,0.204227,0.087892,0.0,0.0,0.0,0.0,0.0
2,0.13096,0.641314,0.172268,0.055457,0.0,0.0,0.0,0.0
3,0.043212,0.071591,0.705403,0.144592,0.035202,0.0,0.0,0.0
4,0.0,0.030337,0.134779,0.631895,0.174579,0.02841,0.0,0.0
5,0.0,0.0,0.045924,0.098599,0.661705,0.130133,0.019656,0.043982
6,0.0,0.0,0.0,0.023854,0.100977,0.186833,0.464952,0.223385
7,0.0,0.0,0.0,0.0,0.069093,0.161822,0.333117,0.435967


In [710]:
def createrewarddata(shape,option,number,initial):
    """
    Artificial Rewards Data Creator.

    Builds a (number, shape, option) numpy array of rewards.

    Each person's reward table is designed to decrease from the top left to
    the bottom right: the rewards of the first states are higher than the
    rewards of the last states, and every option column is lower than the
    one before it. Each cell is its upper or left neighbour minus a random
    integer step; interior cells take the smaller of the two candidates.

    Random numbers are drawn from the global ``np.random`` state, so results
    are reproducible with ``np.random.seed``.

    Args:
        shape: Number of states.
        option: Number of choices, i.e. the number of reward columns.
        number: Number of persons. Zero (or a negative value) gives an empty
            array with ``number`` axis of length 0.
        initial: Base of the maximum reward. The top-left reward is
            ``initial`` plus a random integer in
            [-int(initial / 10), int(initial / 10)). Steps are random
            integers in [int(initial / 50), int(initial / 20)), so
            ``initial`` has to be large enough for both ranges to be
            non-empty (``np.random.randint`` raises ValueError otherwise).

    Return:
        numpy array (number, shape, option)
    """
    # Truncate the bounds to integers once, explicitly, instead of handing
    # floats to randint on every draw.
    top_spread = int(initial / 10)
    step_low = int(initial / 50)
    step_high = int(initial / 20)

    def step():
        # Random amount by which a reward drops relative to its neighbour.
        return np.random.randint(step_low, step_high)

    # Allocate the whole result once instead of stacking person by person.
    reward_data = np.zeros((max(number, 0), shape, option))
    for reward in reward_data:
        for i in range(shape):
            for j in range(option):
                if i == 0 and j == 0:
                    reward[i, j] = initial + np.random.randint(-top_spread, top_spread)
                elif j == 0:
                    # First column: only an upper neighbour exists.
                    reward[i, j] = reward[i - 1, j] - step()
                elif i == 0:
                    # First row: only a left neighbour exists.
                    reward[i, j] = reward[i, j - 1] - step()
                else:
                    # The left candidate is drawn first, then the upper one.
                    from_left = reward[i, j - 1] - step()
                    from_above = reward[i - 1, j] - step()
                    reward[i, j] = min(from_left, from_above)
    return reward_data
    

For illustration, the NumPy array is displayed as a pandas DataFrame. Each row holds the rewards of one state; columns represent options.
The example data is created for 10 playing states, so there is no reward for the end state. There are also 3 options, and each option's rewards differ from the others'. Finally, this data is created for one person.


In [711]:
# Demo parameters: 10 playing states, 3 options, one person, rewards around 100.
shape, option, number, initial = 10, 3, 1, 100
# Show the first person's reward table: one row per state, one column per option.
pd.DataFrame(
    createrewarddata(shape=shape, option=option, number=number, initial=initial)[0],
    columns=["option_" + str(k) for k in range(1, option + 1)],
    index=list(range(1, shape + 1)),
)

Unnamed: 0,option_1,option_2,option_3
1,95.0,92.0,89.0
2,92.0,88.0,85.0
3,89.0,85.0,81.0
4,85.0,81.0,78.0
5,83.0,78.0,74.0
6,80.0,76.0,72.0
7,76.0,74.0,68.0
8,74.0,70.0,64.0
9,72.0,66.0,60.0
10,70.0,62.0,58.0


In [712]:
class user():
    """
    Stay-or-go decision problem of a single person, in the form expected by
    ``valueIteration``.

    States are numbered 1..N, where N = total_state + 1 is the end state.

    Args:
        total_state: Number of playing states (the end state is added on top).
        rewards: Indexable of "go" rewards; ``rewards[state - 1]`` is the
            reward for leaving from ``state``.
        transaction: Indexable of rows; ``transaction[state - 1][i]`` is the
            probability of moving from ``state`` to state ``i + 1`` for
            ``i`` in ``range(N)`` (the last entry is the end state).
        stay_reward: Reward collected for every "stay" step. Defaults to 2.5.
        discount_rate: Discount factor applied per round. Defaults to
            0.97 ** (1 / 365) (presumably a yearly factor of 0.97 converted
            to a daily one -- confirm with the author).
    """
    def __init__(self, total_state, rewards, transaction,
                 stay_reward=2.5, discount_rate=0.97**(1/365)):
        self.N = (total_state + 1) #plus one for end state
        self.transaction = transaction #transaction array
        self.rewards = rewards #reward array
        self.stay_reward = stay_reward #reward of a single "stay" step
        self.discount_rate = discount_rate #per-round discount factor
    def isEnd(self,state):
        """
        Checks whether the given state is the end state.
        """
        return state == self.N
    def actions(self,state):
        """
        For this problem the actions are "stay" or "go".

        Both are offered for every state up to and including N; any state
        beyond N has no actions.
        """
        if state <= self.N:
            return ["stay", "go"]
        return []
    def succProbReward(self, action, state):
        """
        Possible outcomes of taking ``action`` in ``state``.

        With "stay" the person collects the stay reward and moves to another
        state (possibly the end state) according to the transition row of
        ``state``. With "go" the person collects the reward of ``state`` and
        the game ends with certainty. Any other action has no outcomes.

        return:
            list of tuple(state, probability, reward)

        """
        if action == "stay":
            row = self.transaction[state-1]
            return [(target+1, row[target], self.stay_reward)
                    for target in range(self.N)]
        if action == "go":
            return [(self.N, 1, self.rewards[state-1])]
        return []
    def discount(self):
        """
        Discount rate for another round.
        """
        return self.discount_rate
    def states(self):
        """
        Range of all states, 1..N (end state included).
        """
        return range(1,self.N + 1)

In [713]:
def valueIteration(mdp):
    """
    Solve an MDP with value iteration.

    Args:
        mdp: Object providing ``states()``, ``isEnd(state)``,
            ``actions(state)``, ``succProbReward(action, state)`` (an
            iterable of (newState, prob, reward) tuples) and ``discount()``.

    Return:
        list of tuple(state, value, action), one per state in the order of
        ``mdp.states()``. End states have value 0 and action 'none'. The
        values are those of the sweep on which the largest change dropped
        below 1e-4, and the actions are the greedy actions for those values.
    """
    states = list(mdp.states())

    # initialize all states to 0
    V = {state: 0 for state in states}

    def Q(state,action):
        # returns right hand side of the algorithm.
        return sum(prob*(reward+mdp.discount()*V[newState])
                   for newState,prob,reward in mdp.succProbReward(action,state))

    while True:
        newV = {}
        for state in states:
            if mdp.isEnd(state):
                newV[state] = 0 #value of an end state is 0
            else:
                #best expected value over the actions available in this state
                newV[state] = max(Q(state,action) for action in mdp.actions(state))
        if max(abs(V[state]-newV[state]) for state in states) < 1e-4: #checks convergence
            break
        V = newV #not converged yet: continue from the new estimate

    # Extract the greedy policy once, after the loop. Doing it here instead
    # of on every sweep gives the same policy (it is derived from the same V)
    # and also works when the very first sweep already converges.
    pi = {}
    for state in states:
        if mdp.isEnd(state):
            pi[state] = 'none'
        else:
            #ties on the value are broken by the larger action name
            pi[state] = max((Q(state,action),action) for action in mdp.actions(state))[1]

    return [(state,V[state],pi[state]) for state in states]


In [714]:
def main(shape,number,option,initial):
    """
    Simulate persons and solve the stay/go problem for each reward option.

    Creates artificial transition and reward data, runs value iteration for
    every (person, option) pair and collects the results in one table.

    Args:
        shape: Number of playing states.
        number: Number of persons.
        option: Number of reward options.
        initial: Base of the maximum reward (passed to createrewarddata).

    Return:
        pandas DataFrame indexed by (user_id, state). Column ``option_k``
        holds the best action in that state under option k and
        ``option_k_exp_life`` the corresponding value.
    """
    transaction_data = createtransactiondata(shape=shape, number=number)
    reward_data = createrewarddata(shape=shape,option=option,number=number,initial=initial)
    option_names = ["option_"+str(i+1) for i in range(option)]
    columns = option_names + ["user_id","state"] + [name+"_exp_life" for name in option_names]

    records = []
    for num_user in range(number):
        # The solution does not depend on the state being reported, so each
        # option is solved once per person and then read off state by state.
        solutions = []
        for num_option in range(option):
            mdp = user(total_state=shape,
                       rewards=reward_data[num_user].T[num_option],
                       transaction=transaction_data[num_user])
            solutions.append(valueIteration(mdp))

        for num_state in range(shape):
            record = {"user_id": num_user, "state": num_state+1}
            for name, solution in zip(option_names, solutions):
                # solution entries are (state, value, action)
                record[name] = solution[num_state][2]
                record[name+"_exp_life"] = solution[num_state][1]
            records.append(record)

    # Build the frame in one go: DataFrame.append no longer exists in
    # pandas 2.x and growing a frame row by row is quadratic.
    outcome = pd.DataFrame(records, columns=columns)
    return outcome.set_index(["user_id","state"])

As a result, under the given conditions, the person whose user_id is 0 should stay when in state 2 under option 3 in order to obtain the maximum value, whereas under option 4 he/she should go.

In [717]:
# Solve the stay/go problem for 2 simulated persons, 10 states and 4 reward options.
main(
    shape=10,
    number=2,
    option=4,
    initial=3120,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,option_1,option_2,option_3,option_4,option_1_exp_life,option_2_exp_life,option_3_exp_life,option_4_exp_life
user_id,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,go,go,go,go,3226.0,3071.0,2961.0,2831.0
0,2,go,go,stay,go,3103.0,2960.0,2825.144675,2736.0
0,3,stay,go,stay,stay,3002.672174,2870.0,2730.782042,2619.526051
0,4,stay,stay,go,go,2881.106368,2733.269126,2615.0,2494.0
0,5,go,go,go,go,2745.0,2589.0,2471.0,2332.0
0,6,go,go,go,go,2622.0,2439.0,2321.0,2231.0
0,7,go,go,go,go,2489.0,2364.0,2209.0,2110.0
0,8,stay,go,stay,stay,2370.957521,2278.0,2110.196821,1992.380458
0,9,go,go,go,go,2253.0,2153.0,2015.0,1875.0
0,10,go,go,go,go,2141.0,2008.0,1883.0,1781.0
