# Algorithm Implement

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
time_period = 15
class Q_Network(nn.Module):
    '''
    The input of this network should have shape (num_frame, 80, 80)
    '''

    def __init__(self, num_frame, num_action):
        super(Q_Network, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=num_frame, out_channels=32, kernel_size=(2,1), stride=1, padding=2)  # 16, 20, 20
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2,1), stride=1)  # 32, 9, 9
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(2,2), stride=1)  # 32, 9, 9
        self.pool = nn.AvgPool2d(kernel_size=(2,1))
        #self.fc1 = nn.Linear(correct_num_features, 256)
        self.fc1 = nn.Linear(640, 256)
        self.fc2 = nn.Linear(256, num_action)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, image):
        #print("Input Image Shape:", image.shape)
        x = F.relu(self.pool(self.conv1(image)))
        #print("Shape after conv1:", x.shape)
        x = F.relu(self.pool(self.conv2(x)))
        #print("Shape after conv2:", x.shape)
        x = F.relu(self.pool(self.conv3(x)))
        #print("Shape after conv3:", x.shape)
        num_features = x.shape[1] * x.shape[2] * x.shape[3]
        #x = x.view(-1, num_features)
        #print("Number of features to be flattened:", num_features)

        ### i removed the following:
        #2656, hex out the following
        x = x.view(-1, 640)
        #x = x.view(-1, 672)
        x = F.relu(self.fc1(x))
        x1 = self.fc2(x)
        x1 = x1 - torch.max(x1, dim=1, keepdim=True)[0]
        x2 = self.fc3(x)
        return x1 + x2

In [2]:
from torchsummary import summary
from google.colab import drive
drive.mount("/content/drive")



Mounted at /content/drive


In [3]:
!pip install finta

Collecting finta
  Downloading finta-1.3-py3-none-any.whl (29 kB)
Installing collected packages: finta
Successfully installed finta-1.3


In [4]:
from finta import TA
import pandas as pd
import pickle
import pandas as pd
import pickle


# Data Loading

In [5]:
file = open('/content/drive/MyDrive/Model/stock_data_w_all_features_novelty_lag_sentiment', 'rb')
data = pickle.load(file)
file.close()
data = data.reset_index()

### No need this cos date is already in proper format, use below if it is in this 'YYYY-MM-DD HH:MM:SS' format
##data['Date'] = [x[:10] for x in data['Date']]
from datetime import datetime
# Convert string to datetime.date
start_date = datetime.strptime('2022-01-01', '%Y-%m-%d').date()
end_date = datetime.strptime('2023-09-30', '%Y-%m-%d').date()

data = data[(data['Date'] >= start_date) & (data['Date'] < end_date)]

In [6]:
data.columns

Index(['Date', 'index', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
       'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
       'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
       'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
       'returnsOpenNextMktres10', 'next_10_day_relative_return',
       'next_day_return', 'next_day_relative_return', 'ticker', 'assetCode',
       'returnsClosePrevMktres10_lag_3_mean',
       'returnsClosePrevMktres10_lag_3_max',
       'returnsClosePrevMktres10_lag_3_min',
       'returnsClosePrevMktres10_lag_7_mean',
       'returnsClosePrevMktres10_lag_7_max',
       'returnsClosePrevMktres10_lag_7_min',
       'returnsClosePrevMktres10_lag_14_mean',
       'returnsClosePrevMktres10_lag_14_max',
       'returnsClosePrevMktres10_lag_14_min',
       'returnsClosePrevRaw10_lag_3_mean', 'returnsClosePrevRaw10_lag_3_max',
       'returnsClosePrevRaw10_lag_3_min', 

In [9]:
columns_to_drop = [
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
       'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
       'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
       'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
       'returnsOpenNextMktres10', 'next_10_day_relative_return',
       'next_day_return', 'next_day_relative_return',
       'returnsClosePrevMktres10_lag_3_mean',
       'returnsClosePrevMktres10_lag_3_max',
       'returnsClosePrevMktres10_lag_3_min',
       'returnsClosePrevMktres10_lag_7_mean',
       'returnsClosePrevMktres10_lag_7_max',
       'returnsClosePrevMktres10_lag_7_min',
       'returnsClosePrevMktres10_lag_14_mean',
       'returnsClosePrevMktres10_lag_14_max',
       'returnsClosePrevMktres10_lag_14_min',
       'returnsClosePrevRaw10_lag_3_mean', 'returnsClosePrevRaw10_lag_3_max',
       'returnsClosePrevRaw10_lag_3_min', 'returnsClosePrevRaw10_lag_7_mean',
       'returnsClosePrevRaw10_lag_7_max', 'returnsClosePrevRaw10_lag_7_min',
       'returnsClosePrevRaw10_lag_14_mean', 'returnsClosePrevRaw10_lag_14_max',
       'returnsClosePrevRaw10_lag_14_min', 'Open_lag_3_mean', 'Open_lag_3_max',
       'Open_lag_3_min', 'Open_lag_7_mean', 'Open_lag_7_max', 'Open_lag_7_min',
       'Open_lag_14_mean', 'Open_lag_14_max', 'Open_lag_14_min',
       'Close_lag_3_mean', 'Close_lag_3_max', 'Close_lag_3_min',
       'Close_lag_7_mean', 'Close_lag_7_max', 'Close_lag_7_min',
       'Close_lag_14_mean', 'Close_lag_14_max', 'Close_lag_14_min', 'length',
       'Neutral', 'Bullish', 'To the Moon!!', 'Nay', 'Bearish', 'Novelty_0.5',
       'Volume_0.5', 'Novelty_1', 'Volume_1', 'Novelty_3', 'Volume_3',
       'Novelty_5', 'Volume_5', 'Novelty_7', 'Volume_7'

]

data = data.drop(columns=columns_to_drop)

### selecting only AAPL
stock_df = data[data['ticker']=='AAPL']

In [10]:
stock_df

Unnamed: 0,Date,index,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,assetCode,Neutral,Bullish,To the Moon!!,Nay,Bearish,Novelty_0.5
0,2022-01-18,0.0,169.572495,170.590858,167.496227,167.881821,90956700.0,0.0,0.0,AAPL,AAPL,5590.994629,1403.873047,-1986.760132,-906.276489,-2661.511963,0.028156
13,2022-01-19,1.0,168.079558,169.147359,164.065425,164.352142,94815000.0,0.0,0.0,AAPL,AAPL,5929.884766,1903.575806,-2041.737061,-1189.204224,-3073.873047,0.021503
15,2022-01-20,2.0,165.093679,167.763175,162.325307,162.651581,91420500.0,0.0,0.0,AAPL,AAPL,17295.279297,3655.196533,-6478.432129,-2176.565186,-7700.579590,0.013233
24,2022-01-21,3.0,162.562606,164.451033,160.466559,160.575317,122848900.0,0.0,0.0,AAPL,AAPL,19163.902344,4726.145508,-7055.808105,-2776.794434,-9011.958008,0.011420
33,2022-01-24,4.0,158.212319,160.466561,152.952410,159.794235,162294600.0,0.0,0.0,AAPL,AAPL,11122.288086,2142.950195,-3645.181396,-1911.665527,-5043.228516,0.019466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,2023-09-22,422.0,174.440176,176.847008,173.820996,174.560013,56725400.0,0.0,0.0,AAPL,AAPL,2931.233154,756.235840,-802.068298,-437.442932,-1699.409424,0.041174
2966,2023-09-25,423.0,173.970796,176.737156,173.920859,175.848328,46172700.0,0.0,0.0,AAPL,AAPL,2393.090820,858.375305,-595.124573,-361.992950,-1645.980835,0.041710
2972,2023-09-26,424.0,174.589987,174.969477,171.434141,171.733749,64588900.0,0.0,0.0,AAPL,AAPL,2775.072754,872.416138,-786.493103,-209.343750,-1805.105469,0.034391
2978,2023-09-27,425.0,172.392870,172.812316,168.827576,170.205750,66921800.0,0.0,0.0,AAPL,AAPL,4291.693359,1311.664673,-1226.913696,-859.468506,-2490.690918,0.026180


# Technical Indicators

In [11]:
stock_df['SMA42'] = TA.SMA(stock_df, 42)
stock_df['SMA5'] = TA.SMA(stock_df, 5)
stock_df['SMA15'] = TA.SMA(stock_df, 15)
stock_df['AO'] = TA.AO(stock_df)
stock_df['OVB'] = TA.OBV(stock_df)
stock_df[['VW_MACD','MACD_SIGNAL']] = TA.VW_MACD(stock_df)
stock_df['RSI'] = TA.RSI(stock_df)
stock_df['CMO'] = TA.CMO(stock_df)
stock_df = stock_df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA42'] = TA.SMA(stock_df, 42)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA5'] = TA.SMA(stock_df, 5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_df['SMA15'] = TA.SMA(stock_df, 15)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

In [12]:
stock_df

Unnamed: 0,Date,index,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,...,Novelty_0.5,SMA42,SMA5,SMA15,AO,OVB,VW_MACD,MACD_SIGNAL,RSI,CMO
288,2022-03-17,41.0,157.018023,159.384034,156.047863,159.007843,75615400.0,0.0,0.0,AAPL,...,0.028569,163.824657,154.562921,158.921404,-10.070054,-1.074049e+09,-2.501124,-2.314608,47.462891,-1.584176
295,2022-03-18,42.0,158.898979,162.829134,158.156507,162.334152,123511700.0,0.0,0.0,AAPL,...,0.030468,163.692570,156.394354,158.863986,-8.911010,-9.505369e+08,-1.795699,-2.210819,51.843215,9.439337
301,2022-03-21,43.0,161.868852,164.680358,161.373870,163.720093,95811400.0,0.0,0.0,AAPL,...,0.043247,163.677521,159.316724,158.881147,-6.208071,-8.547255e+08,-1.265492,-2.021744,53.579868,13.771305
314,2022-03-22,44.0,163.848774,167.719533,163.254806,167.125565,81532000.0,0.0,0.0,AAPL,...,0.030108,163.784045,162.035168,159.252053,-3.218346,-7.731935e+08,-0.662270,-1.749837,57.623725,23.731767
318,2022-03-23,45.0,166.303898,170.907220,165.967299,168.501617,98062700.0,0.0,0.0,AAPL,...,0.029496,163.972766,164.137854,159.492945,-0.537880,-6.751308e+08,0.040476,-1.391762,59.171457,27.492117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2956,2023-09-22,422.0,174.440176,176.847008,173.820996,174.560013,56725400.0,0.0,0.0,AAPL,...,0.041174,181.227135,176.018103,178.410286,-2.118759,-1.688732e+08,-1.758145,-1.503953,41.791109,-18.388096
2966,2023-09-25,423.0,173.970796,176.737156,173.920859,175.848328,46172700.0,0.0,0.0,AAPL,...,0.041710,180.795377,175.640601,177.519459,-2.430850,-1.227005e+08,-1.706661,-1.544495,44.442558,-10.691717
2972,2023-09-26,424.0,174.589987,174.969477,171.434141,171.733749,64588900.0,0.0,0.0,AAPL,...,0.034391,180.296048,174.220471,176.338348,-3.265232,-1.872894e+08,-1.966439,-1.628884,38.422897,-27.432676
2978,2023-09-27,425.0,172.392870,172.812316,168.827576,170.205750,66921800.0,0.0,0.0,AAPL,...,0.026180,179.698361,173.209799,175.507442,-4.338828,-2.542112e+08,-2.275615,-1.758230,36.448496,-32.642578


# Replay Buffer

In [54]:
class SumTree:

    def __init__(self, capacity):

        self.capacity = capacity
        # the first capacity-1 positions are not leaves
        self.vals = [0 for _ in range(2*capacity - 1)] # think about why if you are not familiar with this

    def retrive(self, num):
        '''
        This function find the first index whose cumsum is no smaller than num
        '''
        ind = 0 # search from root
        while ind < self.capacity-1: # not a leaf
            left = 2*ind + 1
            right = left + 1
            if num > self.vals[left]: # the sum of the whole left tree is not large enouth
                num -= self.vals[left] # think about why?
                ind = right
            else: # search in the left tree
                ind = left
        return ind - self.capacity + 1

    def update(self, delta, ind):
        '''
        Change the value at ind by delta, and update the tree
        Notice that this ind should be the index in real memory part, instead of the ind in self.vals
        '''
        ind += self.capacity - 1
        while True:
            self.vals[ind] += delta
            if ind == 0:
                break
            ind -= 1
            ind //= 2

In [55]:
# from collections import deque

# test = deque(maxlen=5)
# for i in range(10):
#     test.append(i)
#     print(test)

import numpy as np
import random
import bisect
import torch

ALPHA = 0.5
EPSILON = 0.05
TD_INIT = 1

class Replay_Buffer:
    '''
    Vanilla replay buffer
    '''

    def __init__(self, capacity=int(1e6), batch_size=None):

        self.capacity = capacity
        self.memory = [None for _ in range(capacity)] # save tuples (state, action, reward, next_state, done)
        self.ind_max = 0 # how many transitions have been stored

    def remember(self, state, action, reward, next_state, done):

        ind = self.ind_max % self.capacity
        self.memory[ind] = (state, action, reward, next_state, done)
        self.ind_max += 1

    def sample(self, k):
        '''
        return sampled transitions. Make sure that there are at least k transitions stored before calling this method
        '''
        index_set = random.sample(list(range(len(self))), k)
        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return min(self.ind_max, self.capacity)

class Rank_Replay_Buffer:
    '''
    Rank-based replay buffer
    '''

    def __init__(self, capacity=int(1e6), batch_size=64):
        self.capacity = capacity
        self.batch_size = batch_size
        self.alpha = ALPHA
        self.memory = [None for _ in range(capacity)]
        self.segments = [-1] + [None for _ in range(batch_size)] # the ith index will be in [segments[i-1]+1, segments[i]]

        self.errors = [] # saves (-TD_error, index of transition), sorted
        self.memory_to_rank = [None for _ in range(capacity)]

        self.ind_max = 0 # how many transitions have been stored
        self.total_weights = 0 # sum of p_i
        self.cumulated_weights = []

    def remember(self, state, action, reward, next_state, done):
        index = self.ind_max % self.capacity
        if self.ind_max >= self.capacity: # memory is full, need to pop
            self.pop(index)
        else: # memory is not full, need to adjust weights and find segment points
            self.total_weights += (1/(1+self.ind_max))**self.alpha # memory is not full, calculate new weights
            self.cumulated_weights.append(self.total_weights)
            self.update_segments()

        max_error = -self.errors[0][0] if self.errors else 0
        self.insert(max_error, index)
        self.memory[index] = (state, action, reward, next_state, done)
        self.ind_max += 1

    def sample(self, batch_size=None): # notive that batch_size is not used. It's just to unify the calling form
        index_set = [random.randint(self.segments[i]+1, self.segments[i+1]) for i in range(self.batch_size)]
        probs = torch.from_numpy(np.vstack([(1/(1+ind))**self.alpha/self.total_weights for ind in index_set])).float()

        index_set = [self.errors[ind][1] for ind in index_set]
        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()
        for ind in index_set:
            self.pop(ind)

        return index_set, states, actions, rewards, next_states, dones, probs

    def insert(self, error, index):
        '''
        Input :
            error : the TD-error of this transition
            index : the location of this transition
        insert error into self.errors, update self.memory_to_rank and self.rank_to_memory accordingly
        '''
        ind = bisect.bisect(self.errors, (-error, index))
        self.memory_to_rank[index] = ind
        self.errors.insert(ind, (-error, index))
        for i in range(ind+1, len(self.errors)):
            self.memory_to_rank[self.errors[i][1]] += 1

    def pop(self, index):
        '''
        Input :
            index : the location of a transition
        remove this transition, update self.memory_to_rank and self.rank_to_memory accordingly
        '''
        ind = self.memory_to_rank[index]
        self.memory_to_rank[index] = None
        self.errors.pop(ind)
        for i in range(ind, len(self.errors)):
            self.memory_to_rank[self.errors[i][1]] -= 1

    def update_segments(self):
        '''
        Update the segment points.
        '''
        if self.ind_max+1 < self.batch_size: # if there is no enough transitions
            return None
        for i in range(self.batch_size):
            ind = bisect.bisect_left(self.cumulated_weights, self.total_weights*((i+1)/self.batch_size))
            self.segments[i+1] = max(ind, self.segments[i]+1)

    def __len__(self):
        return min(self.capacity, self.ind_max)


class Proportion_Replay_Buffer:
    '''
    Proportion-based replay buffer
    '''

    def __init__(self, capacity=int(1e6), batch_size=None):
        self.capacity = capacity
        self.alpha = ALPHA
        self.memory = [None for _ in range(capacity)]
        self.weights = SumTree(self.capacity)
        self.default = TD_INIT
        self.ind_max = 0

    def remember(self, state, action, reward, next_state, done):
        index = self.ind_max % self.capacity
        self.memory[index] = (state, action, reward, next_state, done)
        delta = self.default+EPSILON - self.weights.vals[index+self.capacity-1]
        self.weights.update(delta, index)
        self.ind_max += 1

    def sample(self, batch_size):
        index_set = [self.weights.retrive(self.weights.vals[0]*random.random()) for _ in range(batch_size)]
        #print(index_set)
        probs = torch.from_numpy(np.vstack([self.weights.vals[ind+self.capacity-1]/self.weights.vals[0] for ind in index_set])).float()

        states = torch.from_numpy(np.vstack([self.memory[ind][0] for ind in index_set])).float()
        actions = torch.from_numpy(np.vstack([self.memory[ind][1] for ind in index_set])).long()
        rewards = torch.from_numpy(np.vstack([self.memory[ind][2] for ind in index_set])).float()
        next_states = torch.from_numpy(np.vstack([self.memory[ind][3] for ind in index_set])).float()
        dones = torch.from_numpy(np.vstack([self.memory[ind][4] for ind in index_set]).astype(np.uint8)).float()

        return index_set, states, actions, rewards, next_states, dones, probs

    def insert(self, error, index):
        delta = error+EPSILON - self.weights.vals[index+self.capacity-1]
        self.weights.update(delta, index)

    def __len__(self):
        return min(self.capacity, self.ind_max)

In [56]:
tst = None
import random
from collections import deque
import torch
import torch.optim as optim
import numpy as np

# from networks import *

class Agent:

    def __init__(self, state_size, action_size, bs, lr, tau, gamma, device, visual=False):
        '''
        When dealing with visual inputs, state_size should work as num_of_frame
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.bs = bs
        self.lr = lr
        self.tau = tau
        self.gamma = gamma
        self.device = device
        self.Q_local = Q_Network(self.state_size, self.action_size).to(device)
        self.Q_target = Q_Network(self.state_size, self.action_size).to(device)
        self.soft_update(1)
        self.optimizer = optim.Adam(self.Q_local.parameters(), self.lr)
        self.memory = Proportion_Replay_Buffer(int(1e5), bs)
        self.tst = None

    def act(self, state, eps=0):
        if random.random() > eps:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            with torch.no_grad():
                action_values = self.Q_local(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        index_set, states, actions, rewards, next_states, dones, probs = self.memory.sample(self.bs)
        w = 1/len(self.memory)/probs
        w = w/torch.max(w)
        w = w.to(self.device)
        states = states.to(self.device)
        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device)
        Q_values = self.Q_local(states)
        Q_values = torch.gather(input=Q_values, dim=-1, index=actions)
        with torch.no_grad():
            Q_targets = self.Q_target(next_states)
            Q_targets, _ = torch.max(input=Q_targets, dim=-1, keepdim=True)
            Q_targets = rewards + self.gamma * (1 - dones) * Q_targets

        deltas = Q_values - Q_targets
        loss = (w*deltas.pow(2)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        deltas = np.abs(deltas.detach().cpu().numpy().reshape(-1))
        for i in range(self.bs):
            self.memory.insert(deltas[i], index_set[i])
    def soft_update(self, tau):
        for target_param, local_param in zip(self.Q_target.parameters(), self.Q_local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [57]:
Drop_list = ['Date', 'index', 'ticker', 'assetCode']

indicators = [col for col in stock_df.columns if col not in Drop_list]
indicators

['Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Dividends',
 'Stock Splits',
 'SMA42',
 'SMA5',
 'SMA15',
 'AO',
 'OVB',
 'VW_MACD',
 'MACD_SIGNAL',
 'RSI',
 'CMO']

In [58]:
class Stock_Env:
    def __init__(self, initial_asset, data, cost):
        self.asset = initial_asset
        self.cash = initial_asset
        self.stock = 0
        self.data = data
        self.time = data.iloc[time_period]['Date']
        self.cost = cost
        self.history=[]
        self.total_cost = 0
        self.initial_asset = initial_asset
        self.rowid = time_period
        self.action_space = np.array(list(range(11)))

    def reset(self):
        self.asset = self.initial_asset
        self.cash = self.initial_asset
        self.stock = 0
        self.time = self.data.iloc[100]['Date']
        self.history=[]
        self.total_cost = 0
        self.rowid = time_period
        return self.data[:time_period][indicators].values

    def step(self, action):
        done = False
        states = self.data.iloc[self.rowid]
        self.rowid +=1
        if self.rowid == len(self.data)-1:
            done = True
        next_state = self.data.iloc[self.rowid]
        last_asset = self.asset
        price = next_state['Open']
        old_asset = self.cash + self.stock*price
        self.asset = old_asset
        target_value = action*0.1*self.asset
        distance = target_value - self.stock*price
        stock_distance = int(distance/(price*(1+self.cost)))
        self.stock += stock_distance
        self.cash = self.cash - distance - np.abs(stock_distance*self.cost*price)
        self.asset = self.cash+self.stock*price
        market_value = self.stock * next_state['Close']
        self.asset = market_value + self.cash
        reward = self.asset - last_asset
        self.time = next_state['Date']
        # self.stock = stock
        return (self.data[self.rowid-time_period:self.rowid][indicators].values, reward, done)

In [59]:
#env = gym.make()
env = Stock_Env(1000000, stock_df, 0.002)
num_episode = 5
max_t = 1000
reward_log = []

for _ in range(num_episode):

    # initialize
    env.reset()
    t = 0
    episodic_reward = 0

    for t in range(max_t):

        #env.render()
        action = np.random.randint(11) # random action
        _, reward, done = env.step(action)
        episodic_reward += reward
        if done:
            break

    reward_log.append(episodic_reward)

In [60]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = Agent(1, len(env.action_space), 64, 0.001, 0.001, 0.99, device, True)

In [42]:
import warnings
warnings.filterwarnings('ignore')
#env = gym.make()
num_episode = 20000
max_t = 1000
reward_log = []
average_log = [] # monitor training process
eps = 1
eps_decay = 0.995
eps_min = 0.01
C = 4 # update weights every C steps

def train(env, agent, num_episode, eps_init, eps_decay, eps_min, max_t, num_frame=1, constant=0):
    rewards_log = []
    average_log = []
    eps = eps_init

    for i in range(1, 1 + num_episode):

        episodic_reward = 0
        done = False
        frame = env.reset()
        state_deque = deque(maxlen=num_frame)
        for _ in range(num_frame):
            state_deque.append(frame)
        state = np.stack(state_deque, axis=0)
        state = np.expand_dims(state, axis=0)
        t = 0

        while not done and t < max_t:

            t += 1
            action = agent.act(state, eps)
            frame, reward, done = env.step(action)
            state_deque.append(frame)
            next_state = np.stack(state_deque, axis=0)
            next_state = np.expand_dims(next_state, axis=0)
            agent.memory.memory.append((state, action, reward, next_state, done))

            if t % 5 == 0 and len(agent.memory) >= agent.bs:
                agent.learn()
                agent.soft_update(agent.tau)

            state = next_state.copy()
            episodic_reward += reward

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))
        print('\rEpisode {}, Reward {:.3f}, Average Reward {:.3f}'.format(i, episodic_reward, average_log[-1]), end='')
        if i % 100 == 0:
            print()

        eps = max(eps * eps_decay, eps_min)

    return rewards_log

In [65]:
def save_model(agent, rewards_log, filename="/content/drive/MyDrive/Model/RL_model.pth"):
    checkpoint = {
        'model_state_dict': agent.Q_local.state_dict(),
        'optimizer_state_dict': agent.optimizer.state_dict(),
        'rewards_log': rewards_log,
        # Include any additional information you want to save
    }
    torch.save(checkpoint, filename)

In [66]:
# Set the number of episodes
#num_episode = 100

#train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

# Train the agent
num_episode = 100
train_rewards = train(env, agent, num_episode, eps, eps_decay, eps_min, max_t, num_frame=1, constant=C)

# Save the trained model and training rewards
save_model(agent, train_rewards, filename="/content/drive/MyDrive/Model/RL_model.pth")

Episode 100, Reward -257792.946, Average Reward -222183.388


In [71]:
num_frame = 1  # Match this with the value used during the training of the saved model
state_size = num_frame
action_size = 11
bs = 64
lr = 0.001
tau = 0.1
gamma = 0.99
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

agent = Agent(state_size, action_size, bs, lr, tau, gamma, device)
optimizer = optim.Adam(agent.Q_local.parameters(), lr)

# Load the model
train_rewards = load_model(agent, optimizer, filename="/content/drive/MyDrive/Model/RL_model.pth")


In [72]:
train_rewards


[-290017.66139712033,
 -230977.517697579,
 -121526.00664517679,
 -221921.13481553667,
 -151168.90448217024,
 -254582.88484255958,
 -277390.30765663297,
 -145794.82501396816,
 -157459.90073491202,
 -384649.2566688503,
 -152392.15135436645,
 -150166.18951182626,
 -207951.62970033556,
 -183937.29692797374,
 -255911.01785304106,
 -319077.41401900747,
 -89127.86259903247,
 -119904.73281544656,
 -286630.52859607176,
 -201866.2328369593,
 -198622.43555255048,
 -41422.11218211707,
 -317039.04619508213,
 -204303.90572456783,
 -289350.07444728375,
 -288516.8781613563,
 -226228.1692878888,
 -109326.68913380266,
 -236850.98162640724,
 -282912.4715875725,
 -330888.0140607646,
 -140050.86489548674,
 -39123.34437197365,
 -325241.77331606345,
 -295974.9704124561,
 -279927.255039362,
 -91487.00277730636,
 -240187.71357533487,
 -142777.1521373986,
 -261766.60107315192,
 -247031.59639508824,
 -267172.6309043609,
 -135621.56173999095,
 -317093.5307353041,
 -238261.02646954672,
 -99182.09979610238,
 -28397

In [78]:
def validate(env, agent, num_episodes, max_t):
    rewards_log = []

    for i in range(1, num_episodes + 1):
        episodic_reward = 0
        done = False
        state = env.reset()

        # Add a batch dimension and adjust channel if necessary
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        if len(state.shape) == 3:  # If only a single channel, adjust to have channel dimension
            state = state.unsqueeze(1)  # Adds a channel dimension

        for _ in range(max_t):
            action = agent.act(state.to(agent.device), eps=0)  # Ensure state is on the same device as the model
            next_state, reward, done = env.step(action)  # Adjusted to match the expected return values

            next_state = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
            if len(next_state.shape) == 3:
                next_state = next_state.unsqueeze(1)

            state = next_state
            episodic_reward += reward

            if done:
                break

        rewards_log.append(episodic_reward)
        print('Validation Episode {}: Reward {:.3f}'.format(i, episodic_reward))

    average_reward = np.mean(rewards_log)
    print('Average Reward over {} episodes: {:.3f}'.format(num_episodes, average_reward))
    return rewards_log


In [79]:
# ... Validate the agent ...
# Define the number of episodes for validation
num_validation_episodes = 100  # or any other number of episodes you want to validate on

# Define the maximum number of timesteps per episode
max_t = 1000  # Adjust this based on your environment's requirement

# ... Validate your agent ...
validation_rewards = validate(env, agent, num_validation_episodes, max_t)


Validation Episode 1: Reward 43839.700
Validation Episode 2: Reward 43839.700
Validation Episode 3: Reward 43839.700
Validation Episode 4: Reward 43839.700
Validation Episode 5: Reward 43839.700
Validation Episode 6: Reward 43839.700
Validation Episode 7: Reward 43839.700
Validation Episode 8: Reward 43839.700
Validation Episode 9: Reward 43839.700
Validation Episode 10: Reward 43839.700
Validation Episode 11: Reward 43839.700
Validation Episode 12: Reward 43839.700
Validation Episode 13: Reward 43839.700
Validation Episode 14: Reward 43839.700
Validation Episode 15: Reward 43839.700
Validation Episode 16: Reward 43839.700
Validation Episode 17: Reward 43839.700
Validation Episode 18: Reward 43839.700
Validation Episode 19: Reward 43839.700
Validation Episode 20: Reward 43839.700
Validation Episode 21: Reward 43839.700
Validation Episode 22: Reward 43839.700
Validation Episode 23: Reward 43839.700
Validation Episode 24: Reward 43839.700
Validation Episode 25: Reward 43839.700
Validatio