Creating a gym RL environment for the agent to learn within

Visualizing the environment and the agent's movements and actions

Training the agent to learn the trading strategy to make high gains and profits

In [None]:
import pandas as pd
import gym
import numpy as np
from gym import spaces
from sklearn import preprocessing

In [None]:
def importingDataset():
  !wget -O Binance_BTCUSDT_1h.csv http://www.cryptodatadownload.com/cdd/Binance_BTCUSDT_1h.csv
  df = pd.read_csv('/content/Binance_BTCUSDT_1h.csv')
  df.reset_index(inplace=True)
  df.columns = ['Timestamp','Date','Symbol','Open','High','Low','Close', 'Volume_(BTC)','Volume_(USDT)']
  df = df.drop(index=0, axis=0)
  return df

In [None]:
# Download the candle data once and preview the first rows of the frame.
df = importingDataset()
df.head()

--2020-11-12 16:38:25--  http://www.cryptodatadownload.com/cdd/Binance_BTCUSDT_1h.csv
Resolving www.cryptodatadownload.com (www.cryptodatadownload.com)... 35.173.69.207
Connecting to www.cryptodatadownload.com (www.cryptodatadownload.com)|35.173.69.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2491154 (2.4M) [text/csv]
Saving to: ‘Binance_BTCUSDT_1h.csv’


2020-11-12 16:38:25 (25.7 MB/s) - ‘Binance_BTCUSDT_1h.csv’ saved [2491154/2491154]



Unnamed: 0,Timestamp,Date,Symbol,Open,High,Low,Close,Volume_(BTC),Volume_(USDT)
1,1605164400.0,2020-11-12 07-AM,BTCUSDT,15791.17,15870.0,15762.0,15822.07,1488.23,23551242.78
2,1605160800.0,2020-11-12 06-AM,BTCUSDT,15747.94,15850.0,15727.71,15791.17,3064.98,48440072.0
3,1605157200.0,2020-11-12 05-AM,BTCUSDT,15661.38,15777.21,15649.01,15747.94,2415.54,37993604.9
4,1605153600.0,2020-11-12 04-AM,BTCUSDT,15628.14,15674.0,15591.54,15661.38,1299.9,20326220.74
5,1605150000.0,2020-11-12 03-AM,BTCUSDT,15616.49,15676.42,15601.85,15628.14,1751.85,27398870.97


In [None]:
#Agent Environment
class gymEnvironment(gym.Env):
    """Bitcoin trading environment exposing the classic ``gym.Env`` interface.

    Each step the agent picks ``[action_type, amount]``: ``action_type`` 0 buys,
    1 sells and 2 holds, and ``amount / 10`` is the fraction of the cash balance
    (buy) or of the BTC holdings (sell) to trade. Observations stack a
    ``lookback_window + 1`` wide window of Open/High/Low/Close/Volume rows on
    top of five account-history rows (net worth, BTC bought, cost, BTC sold,
    sales). The reward returned by ``step`` is the current net worth.

    Non-serial sessions read the module-level constant ``MAX_TRADING_SESSION``
    (exclusive upper bound on the random session length), which must be defined
    before ``reset()`` is called.

    NOTE(review): the OHLCV rows are appended unscaled although
    ``observation_space`` declares values in [0, 1], and the MinMaxScaler
    normalises the account history per time-step column rather than per
    quantity. Both are left unchanged here - confirm whether that is intended.
    """
    # GYM environment setup
    metadata = {'render.modes': ['live', 'file', 'none']}
    scaler = preprocessing.MinMaxScaler()
    viewer = None

    def __init__(self, df, init_balance=10000, init_btc_balance=0, btc_current_price=None, lookback_window=50, commission=0.0025,serial=False):
        """Store the configuration and declare the action/observation spaces.

        Args:
            df: DataFrame providing the 'Open', 'High', 'Low', 'Close' and
                'Volume_(BTC)' columns.
            init_balance: cash balance restored on every ``reset()``.
            init_btc_balance: BTC balance restored on every ``reset()``.
            btc_current_price: stored on the instance; not read by the
                environment logic in this class.
            lookback_window: number of past time steps included in each
                observation in addition to the current one.
            commission: proportional fee applied to every buy and sell.
            serial: if True the frame is traversed from its start in one
                session; otherwise random slices are used.
        """
        super(gymEnvironment, self).__init__()

        self.df = df # Dataframe
        self.init_balance = init_balance #initial account balance
        self.balance = init_balance #initial account balance
        self.init_btc_balance = init_btc_balance #initial BTC balance
        self.btc_current_price = btc_current_price #BTC current price
        self.lookback_window = lookback_window #time steps in the past the agent will observe at each step
        self.commission = commission #flat commission from bitbns 0.25%
        self.serial = serial #data frame will be traversed in random slices by default
        self.current_step_btc = 0 # total number of step() calls made on this instance
        # Action space: [type, amount] with type in {0: buy, 1: sell, 2: hold}
        # and amount in 0..9, interpreted as the fraction amount/10.
        self.action_space = spaces.MultiDiscrete([3,10])

        # Observing OHLCV values plus the account history (net worth and trades)
        self.observation_space = spaces.Box(low=0, high=1, shape=(10, lookback_window+1), dtype=np.float16)

    def _reset_session(self):
        """Start a new trading session and select the data slice it runs over.

        Raises:
            ValueError: if the frame has fewer than ``lookback_window + 2`` rows,
                which is too short to hold one observation window and one step.
        """
        self.current_step = 0

        usable_rows = len(self.df) - self.lookback_window
        if usable_rows < 2:
            raise ValueError(
                "DataFrame has %d rows but at least %d are required for lookback_window=%d"
                % (len(self.df), self.lookback_window + 2, self.lookback_window))

        if self.serial:
            self.steps_left = usable_rows - 1
            self.frame_start = self.lookback_window
        else:
            # Clamp the session length so the random frame start below always
            # has a non-empty range; when the frame is longer than
            # MAX_TRADING_SESSION the draws are the same as without the clamp.
            max_steps = min(MAX_TRADING_SESSION, usable_rows)
            self.steps_left = np.random.randint(1, max_steps)
            self.frame_start = np.random.randint(self.lookback_window, len(self.df) - self.steps_left)

        # The slice starts lookback_window rows early so that the very first
        # observation already has a full window.
        self.df_subset = self.df[self.frame_start - self.lookback_window:self.frame_start + self.steps_left]

    def _next_observation(self):
        """Build the observation for the current step.

        Returns:
            Array with 10 rows and ``lookback_window + 1`` columns: the five
            OHLCV rows followed by the five scaled account-history rows.
        """
        window = self.lookback_window + 1
        end = self.current_step + window

        obsrv = np.array([
            self.df_subset['Open'].values[self.current_step:end],
            self.df_subset['High'].values[self.current_step:end],
            self.df_subset['Low'].values[self.current_step:end],
            self.df_subset['Close'].values[self.current_step:end],
            self.df_subset['Volume_(BTC)'].values[self.current_step:end],
          ])

        # MinMaxScaler scales every column independently, so scaling only the
        # last `window` columns gives the same values as scaling the whole
        # (ever-growing) history and slicing afterwards, without the per-step
        # cost growing with the episode length.
        scaled_history = self.scaler.fit_transform(self.ac_history[:, -window:])
        obsrv = np.append(obsrv, scaled_history, axis=0)

        return obsrv

    def _take_action(self, action, current_price):
        """Execute one buy/sell/hold decision and record its effect.

        Args:
            action: pair ``[action_type, amount]``; ``action_type`` 0 buys,
                1 sells, anything else holds. ``amount / 10`` is the fraction
                of the balance (buy) or BTC holdings (sell) to trade.
            current_price: price used for the trade and the net-worth update.
        """
        action_type = action[0]
        amount = action[1] / 10

        btc_bought = 0
        btc_sold = 0
        cost = 0
        sales = 0

        if action_type < 1:
            # Buy: spend `amount` of the cash balance, commission on top.
            btc_bought = self.balance / current_price * amount
            cost = btc_bought * current_price * (1 + self.commission)
            self.btc_balance += btc_bought
            self.balance -= cost

        elif action_type < 2:
            # Sell: liquidate `amount` of the BTC holdings, commission deducted.
            btc_sold = self.btc_balance * amount
            sales = btc_sold * current_price  * (1 - self.commission)
            self.btc_balance -= btc_sold
            self.balance += sales

        # Only log steps on which something was actually traded.
        if btc_sold > 0 or btc_bought > 0:
            self.trades.append({
              'step': self.frame_start+self.current_step,
              'amount': btc_sold if btc_sold > 0 else btc_bought,
              'total': sales if btc_sold > 0 else cost,
              'type': "sell" if btc_sold > 0 else "buy"
            })

        self.net_worth = self.balance + self.btc_balance * current_price
        # Append one column per step: net worth, BTC bought, cost, BTC sold, sales.
        self.ac_history = np.append(self.ac_history, [
            [self.net_worth],
            [btc_bought],
            [cost],
            [btc_sold],
            [sales]
            ], axis=1)

    def step(self, action):
        """Apply ``action`` and advance the session by one time step.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``reward`` is the current
            net worth, ``done`` is True once the net worth is not positive and
            ``info`` is an empty dict.
        """
        # Trade at the close of the newest row of the window the agent was
        # shown (row current_step + lookback_window of the slice, i.e.
        # frame_start + current_step in the full frame, matching the step
        # stored in self.trades). Using row current_step instead would trade
        # at the first row of the window, whose following rows - the prices of
        # the upcoming trades - are already part of the observation.
        price_row = self.current_step + self.lookback_window
        # The 0.01 offset is kept from the original code.
        # NOTE(review): presumably guards against a zero price - confirm.
        current_price = float(self.df_subset['Close'].iloc[price_row]) + 0.01
        self._take_action(action, current_price)
        self.steps_left -= 1
        self.current_step += 1
        self.current_step_btc += 1

        if self.steps_left == 0:
            # Session exhausted: convert the BTC holdings to cash at the last
            # price and continue on a fresh data slice.
            self.balance += self.btc_balance * current_price
            self.btc_balance = 0
            self._reset_session()

        obs = self._next_observation()
        reward = self.net_worth
        done = self.net_worth <= 0

        return obs, reward, done, {}

    def reset(self):
        """Restore the initial balances, start a new session and return the first observation."""
        self.net_worth = self.init_balance
        self.balance = self.init_balance
        self.btc_balance = self.init_btc_balance

        self._reset_session()

        # Pre-fill a full window of history (initial net worth, no trades) so
        # the first observation already has lookback_window + 1 columns.
        self.ac_history = np.repeat([[self.net_worth],[0],[0],[0],[0]], self.lookback_window+1, axis=1)

        self.trades = []

        return self._next_observation()

    def render(self, mode='human', **kwargs):
        """Return the current net worth and the list of recorded trades."""
        return self.net_worth, self.trades

In [None]:
!pip install stable_baselines3



In [None]:
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

In [None]:
# NOTE(review): the df.head() output above lists the most recent candle first,
# so with the frame used as-is train_df holds the newest 70% of the rows and
# test_df the oldest 30%, and sessions walk the rows in that stored order.
# Confirm whether chronological ordering was intended before training.

# Exclusive upper bound on the length (in steps) of a random training session.
# The data are hourly candles, so 10000 steps is roughly 416 days.
MAX_TRADING_SESSION = 10000
# 70/30 split by row position.
slice_point = int(len(df)*0.7)
train_df = df[:slice_point]
test_df = df[slice_point:]

# Starting cash balance for both environments.
init_balance = 1000

# The algorithms require a vectorized environment to run
# Training samples random slices of train_df (serial=False).
train_env = DummyVecEnv([lambda: gymEnvironment(train_df, init_balance=init_balance,
                                                serial=False)])

# Evaluation walks test_df from its start in a single session (serial=True).
test_env = DummyVecEnv([lambda: gymEnvironment(test_df, init_balance=init_balance,
                                                serial=True)])

# Employing Proximal Policy Optimization with an MLP policy; training metrics
# are written to ./tensorboard/.
model = PPO('MlpPolicy',
             train_env,
             verbose=1, 
             tensorboard_log="./tensorboard/")

model.learn(total_timesteps=60000)

Using cpu device
Logging to ./tensorboard/PPO_2
-----------------------------
| time/              |      |
|    fps             | 722  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 531          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0006284745 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -3.4         |
|    explained_variance   | -5.46e+08    |
|    learning_rate        | 0.0003       |
|    loss                 | 5.46e+07     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000813    |
|    value_loss           | 1.09e+08     |
-----------

<stable_baselines3.ppo.ppo.PPO at 0x7fdccb27ccc0>

In [None]:
# Save the trained PPO model to disk.
# NOTE(review): the absolute /content path presumably targets a Colab
# runtime - confirm before running elsewhere.
model.save("/content/models/ppo_RL_trader")



In [None]:
# Initial observations for the two roll-outs (obsTest is consumed by the
# test-data cell).
obsTrain = train_env.reset()
obsTest = test_env.reset()

# Predicting and aligning on the Train Data
for i in range(1000):
    action, _states = model.predict(obsTrain)
    obs, rewards, done, info = train_env.step(action)
    # Feed the observation returned by step() into the next predict() call;
    # without this the policy would be queried with the reset observation on
    # every iteration.
    obsTrain = obs
    print('Actions: ',action)
    netW, trades = train_env.render()
    print('Initial Balance: ', init_balance)
    print('Current Balance: ', netW)

Actions:  [[1 4]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 0]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[2 2]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[2 7]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[2 0]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[2 7]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 2]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 7]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 3]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 6]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 2]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 1]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 4]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[1 1]]
Initial Balance:  1000
Current Balance:  1000.0
Actions:  [[0 4]]
Initial Balance:  1000
Current Balance:  999.0
Actions:  [

In [None]:
# Predicting the Test Data
for i in range(2000):
    action, _states = model.predict(obsTest)
    obs, rewards, done, info = test_env.step(action)
    # Carry the latest observation forward so each prediction is made on the
    # current market window rather than on the initial reset observation.
    obsTest = obs
    print('Actions: ',action)
    netW, trades = test_env.render()
    print('Initial Balance: ', init_balance)
    print('Current Balance: ', netW)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Initial Balance:  1000
Current Balance:  949.1374024740716
Actions:  [[1 0]]
Initial Balance:  1000
Current Balance:  948.3667430503162
Actions:  [[2 2]]
Initial Balance:  1000
Current Balance:  947.783303085411
Actions:  [[0 2]]
Initial Balance:  1000
Current Balance:  947.1190429386104
Actions:  [[2 1]]
Initial Balance:  1000
Current Balance:  947.3987062554265
Actions:  [[1 0]]
Initial Balance:  1000
Current Balance:  946.6653668913309
Actions:  [[2 3]]
Initial Balance:  1000
Current Balance:  946.5478787042663
Actions:  [[0 1]]
Initial Balance:  1000
Current Balance:  946.6344157804926
Actions:  [[1 0]]
Initial Balance:  1000
Current Balance:  947.8798430388291
Actions:  [[2 0]]
Initial Balance:  1000
Current Balance:  945.781319494264
Actions:  [[1 3]]
Initial Balance:  1000
Current Balance:  947.3803400960221
Actions:  [[2 1]]
Initial Balance:  1000
Current Balance:  947.0754962088739
Actions:  [[2 3]]
Initial Balan