In [335]:
import numpy as np
import pickle
import random
import pandas as pd
from tqdm import tqdm

corr = None
ifTest = False
Ztr = {}
Zte = {}


def returnNext(state, action):
    # Returns (priceChange, nextState) as a tuple
    #nextState is actually a 2-tuple representing the state
    a,b = state
    L = []
    if ifTest:
        L = Zte[(a,b)]
    else:
        L = Ztr[(a,b)]
    r = L[random.randint(0,len(L)-1)]
    pricc = None
    if action==-1:
        if r[0]==0:
            pricc =  r[4]
        elif r[0]==1:
            pricc =  r[3]
        else:
            pricc =  -r[3]
    elif action==1:
        if r[2]==0:
            pricc =  r[4]
        elif r[2]==1:
            pricc =  r[3]
        else:
            pricc =  -r[3]
    else:
        if r[1]==0:
            pricc =  r[4]
        elif r[1]==1:
            pricc =  r[3]
        else:
            pricc =  -r[3]
    num = 3*(10*a + b) + action + 1

    F = corr[num,:]
    ch = np.random.choice(np.arange(0,100),p = list(F))
    ca = ch // 10
    cb = ch % 10
    return (pricc,(ca,cb))

In [336]:

# ftr = input('Test : Filename to read from?')
# fte = input('Train : Filename to read?')

fte ="./test.pic"
ftr ="./train.pic"


with open(ftr,'rb') as f:
    Ztr = pickle.load(f)

with open(fte,'rb') as f:
    Zte = pickle.load(f)

with open('correlations.npy','rb') as f:
    corr = np.load(f)

In [337]:
# total number of States and Actions
n_states =100
n_actions = 3
# np.random.seed(0)

# Q-Learning
Q-Learning is an Off-Policy algorithm for Temporal Difference learning. It can be proven that given sufficient training under any $\epsilon$-soft policy, the algorithm converges with probability 1 to a close approximation of the action-value function for an arbitrary target policy. Q-Learning learns the optimal policy even when actions are selected according to a more exploratory or even random policy.

So, We are going to use Q-Learning to train our Markov Chain to take best possible action $\textbf{a}$ in the given state $\textbf{s}$. We start off by first declaring a matrix Q, of dimension $n_{states} \times n_{actions}$, which we are going to train to take decisions for us in a given state $\textbf{s}$

$$lr~ (\alpha)=0.1$$
$$gamma ~(\gamma) =0.99$$
$$ epsilon ~(\epsilon) =0.9 $$
$$epsilon\_decay =0.01$$
$$epsilon\_final = 0.001$$

In [338]:
# initial Q-table
Q = np.zeros([n_states, n_actions])
epsilons = []
cum_rewards=[]

# learning rate
alpha = 0.1

In [339]:
# discount factor
gamma = 0.99
# e-greedy exploitation
epsilon = 0.9
epsilon_decay = 0.01
epsilon_final = 0.001

In [340]:
# training parameters
n_episodes = 2000
n_steps = 100

In [341]:
def getState(state, epsilon):
    """this function returns state based on the value of random number generated"""
    global Q
    p = np.random.uniform(0,1)
    
    action =None
    if p>epsilon:
        rand_values = Q[state]
        action = np.argmax(rand_values)-1
    else:
        action = np.random.randint(n_actions)-1
        
    return action

In [342]:
def getMinIndex_Value(l):
    min_value = min(l)
    min_index = l.index(min_value)
    
    return min_value, min_index

In [345]:
# Train your Markov Decision Process, make use of the returnNext function to model and act on the priceChanges dependence on the states and actions
def Train():
    
    global n_episodes, n_steps, epsilon, epsilon_decay, epsilon_final, gamma, alpha, epsilons, Q
    
    for i in tqdm(range(n_episodes)):
        # randomly generaating states between 0 and 100
        state = random.randint(0,99)
        # initial cumulative reward set
        cum_reward =0
        # initial price of bond
        inventory=[]
        price =100
        balance = 0
        bondbal = 0
        networth = 0
        
        for j in range(n_steps):
            # performing n_steps within each iteration
            p = np.random.rand()
            
            action = None
            if p>epsilon:
                rand_values = Q[state]
                action = np.argmax(rand_values)-1
            else:
                action = np.random.randint(n_actions)-1
            
            picc, nxt_state = returnNext((state//10,state%10), action)
            
            new_state =nxt_state[0]*10+nxt_state[1]
            
            old_networth = networth
            
            reward =0
            price+=picc
            if action==1:
                inventory.append(price)
                balance-=10*price
                bondbal+=10
            elif action ==-1 and len(inventory)>0:
                balance+=10*price
                bondbal-=10
            
            networth = balance + bondbal*price
            delta = networth - old_networth
            reward =delta
#             reward = max(delta,0)
            
            Q[state][action+1] = ((1-alpha)*Q[state][action+1]) + alpha*(reward + gamma*np.max(Q[new_state]))
            
            cum_reward+=reward
            state = new_state
        
        
        if epsilon > epsilon_final:
                epsilon*=(1-epsilon_decay)
                epsilons.append(epsilon)
        cum_rewards.append(cum_reward)
        
ifTest  = False
Train()

100%|██████████| 2000/2000 [00:13<00:00, 147.54it/s]


In [346]:
for i in range(20):
    print((i+1)*100,": mean espiode reward: ",\
           np.mean(cum_rewards[100*i:100*(i+1)]))

100 : mean espiode reward:  97.17470198639934
200 : mean espiode reward:  815.2496620388232
300 : mean espiode reward:  753.8025774131397
400 : mean espiode reward:  694.6271299080213
500 : mean espiode reward:  1087.6620075890353
600 : mean espiode reward:  1054.6013208550444
700 : mean espiode reward:  968.1742399904266
800 : mean espiode reward:  991.8479593327571
900 : mean espiode reward:  944.3620987445007
1000 : mean espiode reward:  882.4573485274982
1100 : mean espiode reward:  984.9932932868599
1200 : mean espiode reward:  911.8143534715718
1300 : mean espiode reward:  958.8562667462883
1400 : mean espiode reward:  1025.3360312764216
1500 : mean espiode reward:  976.8046605383477
1600 : mean espiode reward:  941.6191266940415
1700 : mean espiode reward:  927.3681896534796
1800 : mean espiode reward:  999.0413030269707
1900 : mean espiode reward:  1080.0506252533223
2000 : mean espiode reward:  991.5221802846122


In [347]:
    
# This you need to write after training your MDP, this should just take in the state and return the Action you would perform
# Please do not make use of the returnNext function inside here, that would defeat the purpose of training the model, as it would be known to you!

def Run(state):
    global epsilon
    s = state[0]*10+ state[1]%10
    action =getState(s, epsilon)
    return action
    

In [377]:
# This is the main function, you don't need to tamper with it!
def mainRun(iter = 1000):
    initstate = (random.randint(0,9),random.randint(0,9))
    i = 0
    initprice = 100.00
    price = initprice
    balance = 0
    bondbal = 0
    networth = 0
    st = initstate
    while i < iter:
        act = Run(st)
        pricCh, ns = returnNext(st,act)
        price += pricCh
        if act==1:
            balance -= 10*price
            bondbal += 10
        elif act==-1:
            balance += 10*price
            bondbal -= 10
        
        networth = balance + bondbal*price
        st = ns
        if i%(iter//10) ==0:
            print('Your Networth has went from 0 to ',networth)
        i+=1

In [378]:
ifTest =True
mainRun(100000)

Your Networth has went from 0 to  0.0
Your Networth has went from 0 to  -16.641611909344647
Your Networth has went from 0 to  600.5418212118711
Your Networth has went from 0 to  2986.6999072877043
Your Networth has went from 0 to  3985.29936897476
Your Networth has went from 0 to  6658.352418057904
Your Networth has went from 0 to  10831.746360472389
Your Networth has went from 0 to  15414.767571776538
Your Networth has went from 0 to  20267.80550587458
Your Networth has went from 0 to  19545.625301151114


As you can see in the output above, for **iter=100000**, the networth keeps on increasing in long term

In [363]:
np.save('Q3.npy',Q)

In [46]:
Q = np.load('Q.npy')