## PPO 

### Imports

In [114]:
import os
import numpy as np
import pandas as pd
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal 
import torch
from gym import Env
from gym.spaces import Discrete, Box

import cvxpy as cp

import warnings
warnings.filterwarnings("ignore")

### PPO Memory Class

In [121]:
#Code inspired by https://www.youtube.com/watch?v=K2qjAixgLqk
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards =[]
        self.dones =[]

        self.batch_size = batch_size
    
    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype= np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i + self.batch_size] for i in batch_start][0]

        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs),\
                np.array(self.vals),\
                np.array(self.rewards),\
                np.array(self.dones),\
                batches
    def store_memory(self, state, action, prob, val, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.vals.append(val)
        self.rewards.append(reward)
        self.dones.append(done)
    
    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions =[]
        self.rewards = []
        self.dones = []
        self.vals =[]

class ActorNetwork(nn.Module):
    def __init__(self, n_actions, input_dims, alpha, fc1_dims=256, fc2_dims=256, chkpt_dir='checkpoint_directory/'):
        super(ActorNetwork, self).__init__()

        self.checkpoint_file = os.path.join(chkpt_dir, 'actor_torch_ppo')
        self.input_layer = nn.Linear(input_dims, fc1_dims)
        self.hidden_1 = nn.Linear(fc1_dims,fc2_dims)
        self.mean = nn.Linear(fc2_dims,n_actions)
        self.std = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    
    def forward(self, state):
        input = self.input_layer(state)
        output = nn.functional.relu(input)
        h1 = self.hidden_1(output)
        h1_output = nn.functional.relu(h1)
        mean = self.mean(h1_output)
        # std = nn.functional.softplus(T.exp(self.std(h1_output)))  #ensures that the std is positive
        std = nn.functional.softplus(self.std(h1_output))
        return mean.flatten(), std.flatten()

    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)
    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))

class CriticNetwork(nn.Module):
    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256, chkpt_dir = 'checkpoint_directory/'):
        super(CriticNetwork, self).__init__()
        self.checkpoint_file = os.path.join(chkpt_dir, 'critic_torch_ppo')
        
        self.critic = nn.Sequential(
                                    nn.Linear(input_dims, fc1_dims),
                                    nn.ReLU(),
                                    nn.Linear(fc1_dims, fc2_dims),
                                    nn.ReLU(),
                                    nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr = alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.cpu().to(self.device)

    def forward(self, state):
        value = self.critic(state)

        return value
    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)
        
    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))

class Agent:
    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95, 
                 policy_clip=0.1, batch_size=64, N=2048, n_epochs =10):
        
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)
    
    def remember(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)
    
    def save_models(self):
        # print('.....saving models.....')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        print('.....loading models.....')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
    
    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        mean_actions, std_actions = self.actor(state)
        value = self.critic(state)

        dist = Normal(mean_actions, std_actions)
        sampled_actions = dist.sample()
        probs = dist.log_prob(sampled_actions)
        probs = probs.cpu().detach().numpy()
    
        action = sampled_actions.cpu().numpy()

        value = T.squeeze(value).item()

        return action, probs, value, #mean_actions, std_actions
    
    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_probs_arr, vals_arr, reward_arr, done_arr, batches = self.memory.generate_batches()

            values = vals_arr

            advantage = np.zeros(len(reward_arr), dtype = np.float32)

            for t in range(len(reward_arr)-1):
                discount = 1
                a_t=0
                for k in range(t, len(reward_arr)-1):
                    a_t += discount*(reward_arr[k] + self.gamma*values[k+1]* (1-int(done_arr[k])) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t
            
            advantage = T.tensor(advantage).to(self.actor.device)
            values = T.tensor(values).to(self.actor.device)
            for batch in batches:
                states = T.tensor(state_arr[batch], dtype = T.float).to(self.actor.device)
                old_probs = T.tensor(old_probs_arr[batch]).to(self.actor.device)
                actions = T.tensor(action_arr[batch]).to(self.actor.device)  
                # print('states', states)
                mean_actions, std_actions = self.actor(states)
                # print('mean_actions', mean_actions)
                # print('std_actions',std_actions)
                dist = Normal(mean_actions, std_actions)
                new_probs = dist.log_prob(actions)
                prob_ratio = new_probs.exp() / old_probs.exp()

                critic_value = self.critic(states)
                critic_value = T.squeeze(critic_value)

                weighted_probs = advantage[batch] *prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                returns = advantage[batch] + values[batch]
                critic_loss = (returns-critic_value)**2
                critic_loss = critic_loss.mean()

                total_loss = actor_loss + 0.5*critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()
        self.memory.clear_memory()

In [122]:
class Environment(Env):
    def __init__(self):
        self.observation_space = Box(low=-np.inf, high=np.inf)
        self.action_space = Box(np.array([-np.inf, -np.inf]), np.array([np.inf, np.inf]))
        self.asset_length = 0
        self.cur_mu =[]
        self.cur_var = []
        self.portfolio_index=[]
        self.portfolio_mu=[]
        self.portfolio_Q =[]
        
    def step(self, action, stock_vector, df_Q, old_observation, mvo_gamma, MVO, derivative, find_stock):
        self.cur_mu.append(action[0])
        self.cur_var.append(action[1])
        self.portfolio_index, self.portfolio_mu, self.portfolio_Q = find_stock(self.cur_mu[-1],self.cur_var[-1], stock_vector, df_Q, self.portfolio_index, self.portfolio_mu, self.portfolio_Q)

        w, self.state = MVO(np.array(self.portfolio_mu), self.portfolio_Q, mvo_gamma) 
        
        # self.transition = derivative(self.portfolio_mu,self.portfolio_Q, mvo_gamma, w)
        # reward = self.transition.item()

        self.transition = np.exp(self.state[0] - old_observation[0])
        reward = self.transition

        self.asset_length += 1
        self.done = (self.asset_length >= 50)

        return self.state, reward, self.done, self.portfolio_index 
    
    def reset(self):
        self.state = np.array([0])
        self.asset_length = 0
        self.cur_mu =[]
        self.cur_var = []
        self.portfolio_index=[]
        self.portfolio_mu=[]
        self.portfolio_Q =[]
        return self.state

In [123]:
def find_stock(mu,var, stock_vector, df_Q, portfolio_index, portfolio_mu, portfolio_Q):

    # combine predicted mu and var into 2-D array
    stock = np.hstack((mu,var))

    # initialize distance
    smallest_distance = np.inf

    # for all the 2-D stocks in the 1000 synthetic dataset,
    # calculate the euclidean distance from predicted stock
    for i,s in enumerate(stock_vector):
        distance = np.linalg.norm(stock - s)

        # choose stock that is closest distance to predicted stock
        # do not choose a stock that has already been chosen
        if (distance < smallest_distance) and (i not in portfolio_index):
            smallest_distance = distance
            stock_index = i
    
    # find chosen stock mu and append
    stock_mu = stock_vector[i][0]
    portfolio_mu.append(stock_mu)

    # append chosen stock index to portfolio index
    portfolio_index.append(stock_index)

    # apped chosen stock to current portfolio Q matrix
    portfolio_Q = df_Q[portfolio_index].loc[portfolio_index]
    portfolio_Q = np.array(portfolio_Q)

    return portfolio_index, portfolio_mu, portfolio_Q

In [124]:
def MVO(mu,Q,gamma=1):

    # number of stocks 
    n = len(mu)

    # if there is only one stock in the portfolio,
    # the matrix multiplication is different as Q is not a matrix but a 1-D vector
    if n == 1:

      # define weights
      w = cp.Variable(n)

      # define constraints
      constraints = [
          cp.sum(w) == 1, # Sum to 1
          w>=0 # Disallow Short Sales
      ]

      # objective function
      risk = w*Q
      targetRet=gamma*mu@w
      prob = cp.Problem(cp.Maximize(targetRet - risk), constraints=constraints)
      prob.solve()
      
      return w.value,targetRet.value-risk.value
    
    # if there are multiple stocks in the portfolio,
    # the matrix multiplication is quad_form as Q is a (n,n) matrix

    else:
      # define weights
      w = cp.Variable(n)

      # define constraints
      constraints = [
          cp.sum(w) == 1, # Sum to 1
          w>=0 # Disallow Short Sales
      ]

      # objective Function
      risk = cp.quad_form(w, Q)
      targetRet=gamma*mu.T@w
      prob = cp.Problem(cp.Maximize(targetRet - risk), constraints=constraints)
      prob.solve()

      return w.value,[targetRet.value-risk.value]
       

In [52]:
def derivative(mu,Q,gamma,w):

  # initialize risk
  risk=0

  # if portfolio has one stock,
  # derivative reduces to 1-D vector multiplication
  if len(mu)<=1:
     return (mu*gamma) - Q*w


  # if portfolio has more than one stock,
  # perform element-wise multiplication of Q-matrix with weights
  else:
    for i in range(len(mu)):
        risk+=Q[i][-1]*w[i]
    Ret=mu[-1]*gamma
        # Ret+=mu[-1]*gamma
    return Ret-risk

## Load Data

In [125]:
path_to_data = 'Data\synthetic1000'
date = '2016-02-07'
df_mu = np.loadtxt('{}/{}/mu'.format(path_to_data, date))
df_Q = np.loadtxt('{}/{}/Q'.format(path_to_data, date))
df_Q = pd.DataFrame(df_Q)


# get the variance for each stock from the Q-matrix
# variance is the diagonal of the Q-matrix
df_var = []
for i in range(df_Q.shape[0]):
    for j in range(df_Q.shape[1]):
        if i==j:
            df_var.append(df_Q[i][j])
df_var = np.array(df_var)

# combine the mu and variance of each stock
# used to calculate the euclidean distance of predicted stock
stock_vector = []
for i in range(len(df_mu)):
    temp = np.hstack((df_mu[i],df_var[i]))
    stock_vector.append(temp)
stock_vector = np.array(stock_vector)


## Constants

In [126]:
# define Environment object
env = Environment()

# learning interval
N = 500

# learning parameters
batch_size = 64
n_epochs = 10
alpha = 0.0003
gamma = 0.99
gae_lambda = 0.95
mvo_gamma =1 

# define number of actions for Actor
n_actions = env.action_space.shape[0]
# define input dimensions for Actor and Critic
input_dims = env.observation_space.shape[0]

# define Agent object
agent = Agent(n_actions, input_dims, gamma=gamma, alpha=alpha, gae_lambda=gae_lambda, 
                 policy_clip=0.1, batch_size=batch_size, N=N, n_epochs = n_epochs)

# training loop parameters
n_games = 800
best_score = -np.inf
score_history = []
learn_iters = 0
avg_score = 0
n_steps = 0
best_50_MVO = -np.inf
best_50_index = []

In [127]:
path_to_data = 'Data/real/asset_'+'weekly'+'.pkl'
path_to_pxs = 'Data/real/prices_'+'weekly'+'.pkl'
returns = pd.read_pickle(path_to_data).reset_index()
prices = pd.read_pickle(path_to_pxs).reset_index()
returns.head()

Unnamed: 0,date,AMZN,TSLA,HD,BABA,TM,MCD,NKE,LOW,SBUX,...,CMS,AES,EBR,ATO,AGR,BIP,EVRG,LNT,WTRG,NI
0,2014-01-03,-0.005893,-0.005777,-0.005465,0.0,-0.011565,-0.00505,-0.007757,-0.012109,-0.01837,...,-0.017931,-0.021365,-0.027027,-0.014971,0.0,-0.007904,0.0,-0.017636,-0.01738,-0.012774
1,2014-01-10,0.003077,-0.025676,0.001465,0.0,0.002489,-0.007665,-0.014225,0.014913,0.009357,...,0.031191,0.023239,-0.003968,0.023916,0.0,0.000257,0.0,0.023279,-0.00604,0.037277
2,2014-01-17,0.004904,0.16669,-0.012316,0.0,-0.013409,-0.009081,-0.045892,-0.038043,-0.035664,...,-0.025083,-0.015829,-0.059761,0.009168,0.0,-0.029034,0.0,-0.011182,0.021267,0.006534
3,2014-01-24,-0.030054,0.026998,-0.022716,0.0,-0.010991,-0.005267,-0.023709,0.004621,0.001068,...,0.004162,-0.040559,-0.076271,0.014276,0.0,-0.019582,0.0,-0.010528,-0.002975,-0.008852
4,2014-01-31,-0.074587,0.039003,-0.029181,0.0,-0.026468,-0.002753,0.016748,-0.032197,-0.05148,...,0.047099,0.028454,-0.027523,0.023886,0.0,-0.007018,0.0,0.034127,0.020887,0.023221


In [79]:
from scipy.stats import gmean
lookback = 30

## Main

In [128]:
def train_for_date(date, env, agent):
    # training loop parameters
    n_games = 50
    best_score = -np.inf
    score_history = []
    learn_iters = 0
    avg_score = 0
    n_steps = 0
    best_50_MVO = -np.inf
    best_50_index = []

    ret_today = returns[(returns['date'] < date)].tail(lookback)
    ret_today = ret_today.reset_index().drop('date', axis=1)
    AssetReturns_np = ret_today.to_numpy()
    df_Q = np.cov(AssetReturns_np, rowvar=False)
    df_mu = 1 - (gmean(1+AssetReturns_np))
    df_Q = pd.DataFrame(df_Q)

    # get the variance for each stock from the Q-matrix
    # variance is the diagonal of the Q-matrix
    df_var = []
    for i in range(df_Q.shape[0]):
        for j in range(df_Q.shape[1]):
            if i==j:
                df_var.append(df_Q[i][j])
    df_var = np.array(df_var)

    # combine the mu and variance of each stock
    # used to calculate the euclidean distance of predicted stock
    stock_vector = []
    for i in range(len(df_mu)):
        temp = np.hstack((df_mu[i],df_var[i]))
        stock_vector.append(temp)
    stock_vector = np.array(stock_vector)

    # Training Loop for RL
    for i in range(n_games):
        try:
            # reset the environment
            observation = env.reset()
            done = False

            # score tracks the total accumulated reward
            score = 0
            # stock counter to indicate when 50 stocks are reached
            stock_counter=0

            # start Agent loop
            while not done:
                # observe the environment and let Agent choose an action
                action, probs, value = agent.choose_action(observation)
                
                # step the environment by updating new state space, reward recieved, and updating done
                observation_, reward, done, portfolio_index = env.step(action, stock_vector, df_Q, observation, mvo_gamma, MVO, derivative, find_stock)

                # increment n_steps
                n_steps+=1
                #increment reward total
                score+= reward
                
                # store (S,A,R,S') sequence in memory
                agent.remember(observation, action, probs, value, reward, done)

                # if total training steps can be divided by learning interval
                # perform gradient ascent in Agent
                if n_steps % N == 0:
                    agent.learn()
                    learn_iters+=1
                
                # store new observation as current observation
                observation = observation_

                # accumulate total rewards
                score_history.append(score)
                
                # get the average of the last 100 rewards
                avg_score=np.mean(score_history[-100:])

                # update the stock counter
                stock_counter+=1

                # when the RL is predicting the 50th stock in the portfolio
                # and the predicted MVO value is the highest yet
                if (stock_counter==49) and (observation[0] > best_50_MVO):
                    best_50_index = portfolio_index
                    best_50_MVO = observation

                # if the average score is better than the best
                # average score, than save the agent models
                if avg_score > best_score:
                    best_score = avg_score
                    agent.save_models()
                
            # print('best_50_MVO', best_50_MVO)
            # print('episode', i, 'score%.1f' % score, 'avg score%.1f' % avg_score,
            #           'time_steps', n_steps, 'learning_steps', learn_iters)
            print('episode', i, 'score', score, 'avg score', avg_score,
                    'time_steps', n_steps, 'learning_steps', learn_iters)
        except Exception as e:
            print(e)
            best_portfolio_mu = df_mu[best_50_index]
            best_portfolio_Q = df_Q[best_50_index].loc[best_50_index]
            w, ret = MVO(best_portfolio_mu,best_portfolio_Q,gamma=1)

            return w, ret
        
    best_portfolio_mu = df_mu[best_50_index]
    best_portfolio_Q = df_Q[best_50_index].loc[best_50_index]
    w, ret = MVO(best_portfolio_mu,best_portfolio_Q,gamma=1)
    weights_full = np.zeros(288)
    weights_full[best_50_index] = w
    return weights_full, ret

In [129]:
from datetime import datetime

def generate_date_list(data, start, end):
    start = datetime.fromisoformat(start)
    end = datetime.fromisoformat(end)

    # Train model from start_date to date
    mask = (data['date'] >= start) & (data['date'] <= end)

    data = data.loc[mask]
    return data.date.apply(lambda x: x.date()).unique().tolist()

In [130]:
# define Environment object
env = Environment()

# learning interval
N = 35

# learning parameters
batch_size = 64
n_epochs = 10
alpha = 0.0003
gamma = 0.99
gae_lambda = 0.95
mvo_gamma =1 

# define number of actions for Actor
n_actions = env.action_space.shape[0]
# define input dimensions for Actor and Critic
input_dims = env.observation_space.shape[0]

# define Agent object
agent = Agent(n_actions, input_dims, gamma=gamma, alpha=alpha, gae_lambda=gae_lambda, 
                policy_clip=0.1, batch_size=batch_size, N=N, n_epochs = n_epochs)

assets = [x for x in list(prices.columns) if not x=='date']
holdings = pd.DataFrame(columns=['date']+assets)
portVal = pd.DataFrame(columns=['date', 'Wealth'])
first = True
InitialValue = 100000

TRAIN_DATE_START = '2015-01-01'
TRAIN_DATE_END = '2018-01-01'
dates = generate_date_list(returns, TRAIN_DATE_START, TRAIN_DATE_END)
noShares = None

for date in dates:
    currentPrices = (prices[prices['date']==str(date)]
    .drop('date',axis=1)
    .values
    .flatten())

    # Update Portfolio Value
    if first:
        portVal.loc[len(portVal)] = [date] + [InitialValue]
        CurrentPortfolioValue = InitialValue
        first = False
    else:     
        CurrentPortfolioValue = np.dot(currentPrices,noShares)
        portVal.loc[len(portVal)] = [date] + [CurrentPortfolioValue]
    date = str(date)
    x, mvo_val = train_for_date(date, env, agent)

    # Update shares held
    # 50% of 100k = 50k. If price is 100 we have 50,000/100=50 shares
    print("x: {}. CurrentPortfolioValue: {}. currentPrices: {}".format(x, CurrentPortfolioValue, currentPrices))
    noShares = np.divide(x*CurrentPortfolioValue, currentPrices)
    print('Done {}'.format(date))

episode 0 score 49.994011199712716 avg score 25.493738753083022 time_steps 50 learning_steps 1
episode 1 score 49.99417752407752 avg score 25.4938036188208 time_steps 100 learning_steps 2
episode 2 score 49.99409435120445 avg score 25.493837585647448 time_steps 150 learning_steps 4
episode 3 score 49.99408430210361 avg score 25.493895142543547 time_steps 200 learning_steps 5
episode 4 score 49.99415944238252 avg score 25.49388637637476 time_steps 250 learning_steps 7
episode 5 score 49.99414841439601 avg score 25.493754193238377 time_steps 300 learning_steps 8
episode 6 score 49.993947367774005 avg score 25.493691111699903 time_steps 350 learning_steps 10
episode 7 score 49.99408416344497 avg score 25.49382970804179 time_steps 400 learning_steps 11
episode 8 score 49.99409510424102 avg score 25.493915107534058 time_steps 450 learning_steps 12
episode 9 score 49.993980492013186 avg score 25.493758018172898 time_steps 500 learning_steps 14
episode 10 score 49.99409584621951 avg score 25.

ValueError: shapes (0,) and (288,) not aligned: 0 (dim 0) != 288 (dim 0)

In [150]:
def agent_inference(date):
    ret_today = returns[(returns['date'] < date)].tail(lookback)
    ret_today = ret_today.reset_index().drop('date', axis=1)
    AssetReturns_np = ret_today.to_numpy()
    df_Q = np.cov(AssetReturns_np, rowvar=False)
    df_mu = 1 - (gmean(1+AssetReturns_np))
    df_Q = pd.DataFrame(df_Q)

    # get the variance for each stock from the Q-matrix
    # variance is the diagonal of the Q-matrix
    df_var = []
    for i in range(df_Q.shape[0]):
        for j in range(df_Q.shape[1]):
            if i==j:
                df_var.append(df_Q[i][j])
    df_var = np.array(df_var)

    # combine the mu and variance of each stock
    # used to calculate the euclidean distance of predicted stock
    stock_vector = []
    for i in range(len(df_mu)):
        temp = np.hstack((df_mu[i],df_var[i]))
        stock_vector.append(temp)
    stock_vector = np.array(stock_vector)

    observation = env.reset()

    done = False
    while not done:
        # observe the environment and let Agent choose an action
        action, probs, value = agent.choose_action(observation)
        
        # step the environment by updating new state space, reward recieved, and updating done
        observation_, reward, done, portfolio_index = env.step(action, stock_vector, df_Q, observation, mvo_gamma, MVO, derivative, find_stock)
        
        # store (S,A,R,S') sequence in memory
        agent.remember(observation, action, probs, value, reward, done)
        
        # store new observation as current observation
        observation = observation_
    
    best_50_index = portfolio_index
    best_portfolio_mu = df_mu[best_50_index]
    best_portfolio_Q = df_Q[best_50_index].loc[best_50_index]
    w, ret = MVO(best_portfolio_mu,best_portfolio_Q,gamma=1)
    weights_full = np.zeros(288)
    weights_full[best_50_index] = w

    return weights_full

In [152]:
w = agent_inference('2019-01-01')


In [None]:
first = True
InitialValue = 100000

TRAIN_DATE_START = '2019-01-01'
TRAIN_DATE_END = '2022-12-31'
dates = generate_date_list(returns, TRAIN_DATE_START, TRAIN_DATE_END)
noShares = None

for date in dates:
    currentPrices = (prices[prices['date']==str(date)]
    .drop('date',axis=1)
    .values
    .flatten())

    # Update Portfolio Value
    if first:
        portVal.loc[len(portVal)] = [date] + [InitialValue]
        CurrentPortfolioValue = InitialValue
        first = False
    else:     
        CurrentPortfolioValue = np.dot(currentPrices,noShares)
        portVal.loc[len(portVal)] = [date] + [CurrentPortfolioValue]
    date = str(date)

    # Update shares held
    # 50% of 100k = 50k. If price is 100 we have 50,000/100=50 shares
    print("x: {}. CurrentPortfolioValue: {}. currentPrices: {}".format(x, CurrentPortfolioValue, currentPrices))
    noShares = np.divide(x*CurrentPortfolioValue, currentPrices)
    print('Done {}'.format(date))

In [120]:
noShares

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       9.91702296e-08,            nan, 5.76703897e-07, 0.00000000e+00,
       0.00000000e+00,            nan, 0.00000000e+00, 1.25154759e-07,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.14291660e-08,
                  nan, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.30874630e-06, 3.92928171e-07, 0.00000000e+00, 2.45558062e-07,
                  nan, 0.00000000e+00,            nan, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00,            nan,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.04282088e-08,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.64312210e-07,
       0.00000000e+00,            nan, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.88168992e-03, 2.26101783e-07,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.54676642e-07,
      

## Best Predicted Portfolio

In [61]:
best_portfolio_indices = best_50_index

best_portfolio_mu = df_mu[best_portfolio_indices]
best_portfolio_Q = df_Q[best_portfolio_indices].loc[best_portfolio_indices]

w, ret = MVO(best_portfolio_mu,best_portfolio_Q,gamma=1)

print('Best Predicted MVO Value',ret)

Best Predicted MVO Value [-19.045725812384593]


## Random Search Portfolio

In [481]:
returns = []
for _ in range(2000):
    rand_portfolio_indices = list(np.random.choice(1000, 50, replace=False))

    rand_portfolio_mu = df_mu[rand_portfolio_indices]
    rand_portfolio_Q = df_Q[rand_portfolio_indices].loc[rand_portfolio_indices]

    w, ret = MVO(rand_portfolio_mu,rand_portfolio_Q,gamma=1)

    # print(ret)

    returns.append(ret)

returns = np.array(returns)
print('Best Random Portfolio MVO Vlaue', returns.max())

Best Random Portfolio MVO Vlaue -19.574344604269527


## To-Do List


0) exponential reward
1) add more to state space
1.1) sharpe ratio
2) add spatial pooling to state space
3) change neural networks
4) parameter tweeking
5) save actor model and call it
6) look at the minimum reward of the 50 portfolios at end of each game
