In [40]:
import pandas as pd
import numpy as np
import yfinance as yf

Function to randomly sample an action

In [41]:
def sample_action(policy, action , state):
    return np.random.choice(len(action), p=policy[state])

Function to generate a random episode

In [42]:
def generate_episode(policy, action, state):
    episode = [sample_action(policy, action, state[s]) for s in range(len(state))]
    
    return episode

Function for epsilon-greedy policy improvement

In [44]:
def epsilon_greedy_policy_improve(Q_value, nS, nA, epsilon, new_policy):
    for s in range(nS):
        #for each state, store the argmax(Q_value) in a list
        possible_action = np.argwhere(Q_value[s] == np.amax(Q_value[s])).flatten().tolist()
        
        #for each (s,a), if there are multiple actions that maximize Q(s), then update the greedy-epsilon policy 
        for a in range(nA):
            if a in possible_action:
                new_policy[s][a] = epsilon/nA + (1 - epsilon)/len(possible_action)
                
            else:
                new_policy[s][a] = epsilon/nA
    
    return new_policy

Function that calculates the average value of the holdings in the portfolio

In [45]:
def average_price(price, holdings, holdings_list, price_list):
    holdings_list.append(holdings)
    price_list.append(price)
    
    init_price = price_list[0]
    init_holdings = holdings_list[0]
    
    delta_holdings = []

    
    if len(price_list) == 1:
        return price_list[0], holdings_list, price_list
    
    else:
        for x, y in zip(holdings_list, holdings_list[1:]):
            delta_holdings.append(y - x)
    
        avg_price =  ( np.dot(price_list[1:], delta_holdings) + init_price*init_holdings ) / (init_holdings + np.sum(delta_holdings))
        return avg_price, holdings_list, price_list

Function that determines total portfolio value

In [46]:
def portfolio_value(holdings, cash, price, holdings_list, price_list, action):
    if action == 0:
        cash -= price
        holdings += 1
  
    if action == 1:
        cash += price
        holdings -= 1
        
    if action == 2:
        pass
    
    avg_price, holdings_list, price_list = average_price(price, holdings, holdings_list, price_list)
    
    return avg_price, holdings_list, price_list, holdings, cash

Function that determines the reward. It outputs a list that contains the reward at each decision epoch in the episode.

In [47]:
def generate_returns(episode, price, cash_s1, holdings_s1, price_list, holdings_list, gamma):
    p_v = []
    avg_price = price_list[0]
    
    for s in range(len(episode)):
        
        avg_price, holdings_list, price_list, holdings_s1, cash_s1 = portfolio_value(holdings_s1, cash_s1, price[s], holdings_list, price_list, episode[s])
        
        #store portfolio values in list at each time step
        p_v.append(cash_s1 + price[s]*holdings_s1)
    
    rew = []
    for x,y in zip(p_v, p_v[1:]):
        rew.append(y - x)
    rew.append(0)
        
    gamma_list = np.array([gamma**i for i in range(len(episode))])

    return np.dot(gamma_list,rew), rew

Helper function that calculates total returns of the portfolio

In [48]:
def pf_returns(episode, price, cash_s1, holdings_s1, price_list, holdings_list, gamma):
    p_v = []
    avg_price = price_list[0]
    
    for s in range(len(episode)):
        
        avg_price, holdings_list, price_list, holdings_s1, cash_s1 = portfolio_value(holdings_s1, cash_s1, price[s], holdings_list, price_list, episode[s])
     
        #store portfolio values in list at each time step
        p_v.append(cash_s1 + price[s]*holdings_s1)
        
    return p_v

function that executes MC Policy Evaluation

In [49]:
def mc_policy_evaluation(policy, Q_value, price, cash_s1, holdings_s1, state, action, gamma=0.9):
    #generate one episode
    episode = generate_episode(policy, action, state)
    
    #initialize holdings list, with 1000 stock ownership in the beginning
    holdings_list = [1000]
    
    price_list = [price[0]]
  
    #extract the discounted future rewards, and the immediate rewards
    G, rw = generate_returns(episode, price, cash_s1, holdings_s1, price_list, holdings_list, gamma=0.9)
    
    num_visits = np.zeros((len(state),len(action)))
    
    visits = []

    for v in range(len(episode)):
        s = state[v]
        a = episode[v]
        r = rw[v]

        if (s,a) not in visits:
            num_visits[s][a] += 1
            Q_value[s][a] += (1/num_visits[s][a])*(G - Q_value[s][a])
            
    return Q_value, holdings_list

Function that implements Policy Improvement

In [50]:
def mc_glie(iterations, gamma, price, cash_s1, holdings_s1, state, action):
    nS = len(state)
    nA = len(action)
    Q_value = np.zeros((nS, nA))
    policy = np.ones((nS,nA))/nA  # initially all actions are equally likely
    epsilon = 1
    
    eps_final = 0.001
    decay_rate = (epsilon - eps_final)/iterations 

    for k in range(1,iterations+1):   
        Q_value, holdings_list = mc_policy_evaluation(policy, Q_value, price, cash_s1, holdings_s1, state, action, gamma=0.9)
        policy = epsilon_greedy_policy_improve(Q_value, nS, nA, epsilon,policy)

        epsilon -= decay_rate

    det_policy = np.argmax(Q_value, axis=1)

    return det_policy, holdings_list, Q_value

extract stock price data from yfinance library, and setting model parameters.

In [51]:
iterations = 1000
gamma = 0.90

price = list(np.round(list(aapl["Close"]),2)[::10])
action = [0,1,2]
state = np.arange(len(price))

nA = len(action)
nS = len(price)
holdings_list = [1000]
holdings_s1 = holdings_list[0]
price_list = [price[0]]

cash_s1 = 10000
opt_pol, holdings_list, Q_v = mc_glie(iterations, gamma, price, cash_s1, holdings_s1, state, action)

Printing the total returns

In [52]:
returns = pf_returns(opt_pol, price, cash_s1, holdings_s1, [price[0]], [1000], gamma)[-1]

print("The total returns is $",returns)

The total returns is $ 82087.98000000001
