<a href="https://colab.research.google.com/github/sanchitvohra/crypto-bot/blob/main/Crypto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; a later cell copies the dataset
# from /content/drive/MyDrive/crypto-bot/ into the cloned repository.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone https://github.com/sanchitvohra/crypto-bot.git
!git pull
!mkdir /content/crypto-bot/data
!cp /content/drive/MyDrive/crypto-bot/crypto_data.npy /content/crypto-bot/data/
!mkdir /content/crypto-bot/checkpoints
%cd /content/crypto-bot/

Cloning into 'crypto-bot'...
remote: Enumerating objects: 140, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 140 (delta 53), reused 115 (delta 30), pack-reused 0[K
Receiving objects: 100% (140/140), 28.56 KiB | 835.00 KiB/s, done.
Resolving deltas: 100% (53/53), done.
fatal: not a git repository (or any of the parent directories): .git
/content/crypto-bot


In [None]:
%load_ext autoreload
%autoreload 2
import logging
import os
import math

import preprocessing
import environments
import models
import agents

In [None]:
# Configure logging: every record is rendered as "[LEVEL] message".
FORMAT = '[%(levelname)s] %(message)s'
logging.basicConfig(format=FORMAT)

logger = logging.getLogger('common')
logger.setLevel(logging.INFO)

logger.info("Training loop starting...")


# --- training configuration ---
training_steps = 32                 # number of collect/update iterations of the training loop
K_epochs = 100                      # model optimization epochs (passed to the agent)
ep_len = 10000                      # environment steps collected per env in each iteration
action_std = 0.1                    # starting std for action distribution (Multivariate Normal)
action_std_decay_rate = 0.005       # linearly decay action_std (action_std = action_std - action_std_decay_rate)
min_action_std = 0.005              # minimum action_std (stop decay after action_std <= min_action_std)
action_std_decay_freq = 1           # action_std decay frequency (in num training steps)


# --- environment configuration ---
starting_balance = 1000000.0 # starting portfolio amount in dollars
max_trade = 100000.0         # max number of $ amount for buy/sell
trading_fee = 0.01           # trading fee during buy
history = 4                  # number of stacks in state
reward_scaling = 10 ** -4    # scale the reward signal down

# --- data loading ---
data = preprocessing.load_data()

# --- training environments: num_envs independent CryptoEnv instances over the same data ---
num_envs = 32
envs = [
    environments.CryptoEnv(data, starting_balance, max_trade, trading_fee, history)
    for _ in range(num_envs)
]
# A flattened sample state is only needed to read off the state dimension below.
state = envs[0].get_state(flatten=True)

# --- validation environment (separate instance, same configuration) ---
venv = environments.CryptoEnv(data, starting_balance, max_trade, trading_fee, history)
validate_freq = 2            # validate every `validate_freq` training steps

state_dim = state.shape[0]
action_dim = 5

# --- device selection: prefer the first CUDA device, fall back to CPU ---
device = torch.device('cpu')

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    print("Device set to : cpu")

# --- checkpointing ---
pretrained = False
pretrained_path = None
model_save_path = "checkpoints/"
model_save_freq = 4          # save a checkpoint every `model_save_freq` training steps

# --- log the full configuration so each run is self-describing ---
logger.info(f'Training steps: {training_steps}')
logger.info(f'Model Optimization epochs: {K_epochs}')
logger.info(f'Episode length: {ep_len}')
logger.info(f'Action std init: {action_std}')
logger.info(f'Action std decay: {action_std_decay_rate}')
logger.info(f'Min action std: {min_action_std}')
logger.info(f'Action std decay freq: {action_std_decay_freq}')

logger.info(f'Starting balance: {starting_balance}')
logger.info(f'Maximum trade action: {max_trade}')
logger.info(f'Trading fee: {trading_fee}')
logger.info(f'State History: {history}')
logger.info(f'Reward Scaling: {reward_scaling}')

logger.info(f'State dimension: {state_dim}')
logger.info(f'Action dimension: {action_dim}')

# Typos in the two messages below ("Pytoch", "frequecy") are corrected.
logger.info(f'Pytorch device: {device}')
logger.info(f'Pretrained: {pretrained}')
if pretrained:
    logger.info(f'Pretrained model path: {pretrained_path}')
logger.info(f'Model save path: {model_save_path}')
logger.info(f'Model save frequency: {model_save_freq}')

[INFO] Training loop starting...
[INFO] Training steps: 32
[INFO] Model Optimization epochs: 100
[INFO] Episode length: 10000
[INFO] Action std init: 0.1
[INFO] Action std decay: 0.005
[INFO] Min action std: 0.005
[INFO] Action std decay freq: 1
[INFO] Starting balance: 1000000.0
[INFO] Maximum trade action: 100000.0
[INFO] Trading fee: 0.01
[INFO] State History: 4
[INFO] Reward Scaling: 0.0001
[INFO] State dimension: 231
[INFO] Action dimension: 5
[INFO] Pytoch device: cuda:0
[INFO] Pretrained: False
[INFO] Model save path: checkpoints/
[INFO] Model save frequecy: 4


Device set to : Tesla K80


In [None]:
# Build the actor and critic networks (same hidden-layer widths for both).
actor = models.ActorNN(state_dim, action_dim, [1024, 512, 256, 256, 256], device)
critic = models.CriticNN(state_dim, action_dim, [1024, 512, 256, 256, 256], device)
lr_actor = 1e-6      # learning rate for actor network
lr_critic = 1e-6     # learning rate for critic network

logger.info('Actor: ')
logger.info(actor)
logger.info(f'Actor LR: {lr_actor}')
logger.info('Critic: ')
logger.info(critic)
logger.info(f'Critic LR: {lr_critic}')

# Select and configure the training agent.
agent_name = 'PPO'
# PPO settings
eps_clip = 0.2          # clip parameter for PPO
gamma = 0.99            # discount factor

logger.info(f'Agent Policy: {agent_name}')
logger.info(f'Epsilon clip: {eps_clip}')
logger.info(f'Gamma: {gamma}')

if agent_name == 'PPO':
    agent = agents.PPO(state_dim, action_dim, actor, critic, lr_actor, lr_critic,
                       num_envs, gamma, K_epochs, eps_clip, action_std, device)
else:
    # Fail fast: leaving `agent = None` would only surface later as an opaque
    # AttributeError inside the training loop.
    raise ValueError(f'Unsupported agent_name: {agent_name!r} (only \'PPO\' is available)')

# Optionally resume from a saved checkpoint.
if pretrained:
    if pretrained_path is None:
        raise ValueError('pretrained is True but pretrained_path is None')
    agent.load(pretrained_path)
    logger.info(f'Loaded saved model: {pretrained_path}')

[INFO] Actor: 
[INFO] ActorNN(
  (model): Sequential(
    (0): Linear(in_features=231, out_features=1024, bias=True)
    (1): Tanh()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): Tanh()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): Tanh()
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): Tanh()
    (8): Linear(in_features=256, out_features=256, bias=True)
    (9): Tanh()
    (10): Linear(in_features=256, out_features=5, bias=True)
    (11): Tanh()
  )
)
[INFO] Actor LR: 1e-06
[INFO] Critic: 
[INFO] CriticNN(
  (model): Sequential(
    (0): Linear(in_features=231, out_features=1024, bias=True)
    (1): Tanh()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): Tanh()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): Tanh()
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): Tanh()
    (8): Linear(in_features=256, out_features=256, bias=True)
    (9): Tanh()

In [None]:
# Main training loop: repeatedly collect rollouts from every training
# environment, update the agent, decay the exploration std, and periodically
# checkpoint and validate.
traj_step = 0   # completed collect/update iterations
time_step = 0   # total environment steps taken, summed over all envs
# Start at -inf so the first validation result always becomes the best so far.
# Validation returns can be negative; with a baseline of 0 the best-model
# checkpoint ("model.pth") would never be written in that case.
max_validation_reward = float('-inf')

# Strict `<` runs exactly `training_steps` iterations (`<=` ran one extra
# iteration whose result was neither validated nor checkpointed).
while traj_step < training_steps:

    # Reset every environment and gather its flattened starting state.
    states = []
    for env in envs:
        env.reset()
        states.append(env.get_state(flatten=True))
    states = np.array(states)

    # Collect ep_len transitions from each environment.
    average_return = 0
    for _ in range(ep_len):
        actions = agent.select_action(states)
        states = []
        for i, env in enumerate(envs):
            reward = env.step(actions[i])
            # Track the unscaled reward for reporting; the agent's buffer
            # receives the scaled-down reward.
            average_return += reward
            agent.buffer.rewards[i].append(reward * reward_scaling)
            states.append(env.get_state(flatten=True))
            time_step += 1

        states = np.array(states)

    # increment step counter
    traj_step += 1
    average_return = average_return / num_envs

    # Update the agent from the collected data.
    # mean_loss is indexed 0..3 below; the "A/C/E" label suggests
    # actor/critic/entropy components — TODO confirm against agents.PPO.update.
    mean_loss = agent.update()

    # Log the std that was in effect during this iteration (before decay).
    agent_std = agent.action_std

    logger.info(f'Time Steps: {time_step}')
    logger.info(f'Average Reward: {average_return:15.3f}')
    logger.info(f'Mean Loss: {mean_loss[0]:10.4f}, A/C/E: {mean_loss[1]:10.4f},{mean_loss[2]:10.4f},{mean_loss[3]:10.4f}')
    logger.info(f'Action Std: {agent_std:10.9f}')

    # Linearly decay the action std from its initial value, clamped at the minimum.
    if traj_step % action_std_decay_freq == 0:
        agent.action_std = max(action_std - traj_step * action_std_decay_rate, min_action_std)
        agent.set_action_std(agent.action_std)

    # Periodic checkpoint, named after the number of environment steps so far.
    if traj_step % model_save_freq == 0 and model_save_path is not None:
        agent.save(checkpoint_path=os.path.join(model_save_path, "model" + str(time_step).zfill(10) + ".pth"))

    # Periodic validation on the dedicated validation environment.
    if traj_step % validate_freq == 0:
        venv.validate()
        state = venv.get_state(flatten=True)
        validation_return = 0

        mean_val_action = np.zeros(action_dim, dtype=np.float32)
        # No gradients are needed while evaluating the policy.
        with torch.no_grad():
            for _ in range(ep_len):
                state = torch.FloatTensor(state).to(device)
                action = agent.policy.validate(state)
                mean_val_action += action
                reward = venv.step(action)
                validation_return += reward
                state = venv.get_state(flatten=True)

        mean_val_action /= ep_len
        mean_val_action_text = str(list(mean_val_action))
        logger.info(f'Model Validation: {validation_return}')
        logger.info(f'Mean Val action:  {mean_val_action_text}')

        # Keep the best-validating model under a fixed file name.
        if validation_return > max_validation_reward:
            max_validation_reward = validation_return
            if model_save_path is not None:
                agent.save(checkpoint_path=os.path.join(model_save_path, "model.pth"))

[INFO] Time Steps: 320000
[INFO] Average Reward:     -436831.642
[INFO] Mean Loss:   211.7520, A/C/E:     0.1592,    6.4097,    0.0000
[INFO] Action Std: 0.095000000
[INFO] Time Steps: 640000
[INFO] Average Reward:     -368646.484
[INFO] Mean Loss:   204.7524, A/C/E:     0.2012,    7.3488,    0.0000
[INFO] Action Std: 0.095000000
[INFO] Model Validation: -138587.75
[INFO] Mean Val action:  [0.04774256, 0.0527654, -0.029198347, 0.027469758, 0.037212502]
[INFO] Time Steps: 960000
[INFO] Average Reward:      -42541.545
[INFO] Mean Loss:   551.1674, A/C/E:     0.0347,    6.7624,    0.0000
[INFO] Action Std: 0.090000000
[INFO] Time Steps: 1280000
[INFO] Average Reward:     -193233.716
[INFO] Mean Loss:   193.2311, A/C/E:     0.0468,    6.2768,    0.0000
[INFO] Action Std: 0.085000000
[INFO] Model Validation: -141756.25
[INFO] Mean Val action:  [0.040110897, 0.046191715, -0.033505034, 0.02256198, 0.037222072]
[INFO] Time Steps: 1600000
[INFO] Average Reward:     -179289.426
[INFO] Mean Loss: