In [1]:
import logging
import multiprocessing as mp
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
import torch.optim as optim
from scipy import stats

from uniswapv3_simulator.pool import Uniswapv3Pool
from uniswapv3_simulator.optimization.environments import OneStepEnvironment, ScaleWrapper
from uniswapv3_simulator.optimization.ddpg.ddpg import (
    DDPG,
    DDPGTrainer,
    DeepActorModel,
    TrainArgs
)
from uniswapv3_simulator.optimization.ddpg.exploration_noise import GaussianProcess
from uniswapv3_simulator.optimization.ddpg.schedulers import ExponentialScheduler

# Directory that receives one log file per notebook run.
LOG_DIR = Path('./logs')
# logging.basicConfig opens the target file immediately, so a missing
# directory would raise FileNotFoundError on a fresh checkout.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Second-resolution timestamp (yymmddHHMMSS) keeps log files from separate
# runs apart.
timestamp = datetime.now().strftime('%y%m%d%H%M%S')
logging.basicConfig(
    level=logging.INFO,
    filename=LOG_DIR / f'rl_test_{timestamp}.log'
)
# Raise verbosity for the logger named 'optimization' only.
# NOTE(review): presumably this is the logger name used by the
# uniswapv3_simulator.optimization modules -- confirm; if they call
# logging.getLogger(__name__) the effective name would be
# 'uniswapv3_simulator.optimization' and this line would have no effect.
logging.getLogger('optimization').setLevel(logging.DEBUG)

In [2]:
# Master seed. Eight child seeds are derived from it and handed out as:
#   seeds[0..4] -> the five parameter distributions below
#   seeds[5]    -> the environment
#   seeds[6]    -> torch's global RNG
#   seeds[7]    -> the DDPG trainer
SEED = 1234
seed_seq = np.random.SeedSequence(entropy=SEED)
seeds = seed_seq.generate_state(8)

init_price = 100
# Bin edges for liquidity placement; the agent's action has one entry per
# interval, i.e. len(liquidity_bins) - 1 entries.
liquidity_bins = [70, 80, 90, 100, 110, 120, 130]

# Wider sampling ranges, currently disabled in favour of the fixed values
# below.
# fees = stats.uniform(1e-4, 0.01 - 1e-4)
# mu = stats.uniform(-0.05, 0.1)
# sigma = stats.uniform(1e-4, 0.1 - 1e-4)
# alpha = stats.randint(1, 100 + 1)
# beta = stats.randint(100, 1000 + 1)

# Zero-scale / single-value distributions pin every parameter to a constant.
# Side effect: scipy reports nan for .mean()/.std() of the zero-scale
# uniform distributions.
fees = stats.uniform(0.01, 0.0)
mu = stats.uniform(0.0, 0.0)
sigma = stats.uniform(0.05, 0.0)
alpha = stats.randint(50, 50 + 1)
beta = stats.randint(500, 500 + 1)

fees.random_state = seeds[0]
mu.random_state = seeds[1]
sigma.random_state = seeds[2]
alpha.random_state = seeds[3]
beta.random_state = seeds[4]

n_sims_per_step = 500
# Leave one core free, but never request fewer than one worker:
# cpu_count() - 1 would be 0 on a single-core machine.
n_jobs = max(1, mp.cpu_count() - 1)

# Build the environment from the initial price, the bin edges and the five
# frozen parameter distributions (positional, in this order); simulation
# count, worker count and seed are passed by keyword.
env = OneStepEnvironment(
    init_price,
    liquidity_bins,
    fees,
    mu,
    sigma,
    alpha,
    beta,
    n_sims_per_step=n_sims_per_step,
    n_jobs=n_jobs,
    seed=seeds[5],
)

In [3]:
# Summarize each parameter distribution as "<label>mean=..., std=...".
# Each row is (padded label, frozen distribution, float format spec).
# Note: the zero-scale uniform distributions print nan for both statistics.
print('Random Variables')
summary_rows = (
    ('fees:  ', fees, ',.4f'),
    ('mu:    ', mu, ',.4f'),
    ('sigma: ', sigma, ',.4f'),
    ('alpha: ', alpha, ',.2f'),
    ('beta:  ', beta, ',.2f'),
)
for label, rv, spec in summary_rows:
    print(f'{label}mean={rv.mean():{spec}}, std={rv.std():{spec}}')

Random Variables
fees:  mean=nan, std=nan
mu:    mean=nan, std=nan
sigma: mean=nan, std=nan
alpha: mean=50.00, std=0.00
beta:  mean=500.00, std=0.00


  g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)


In [4]:
def obs_scale_fn(obs):
    """Standardize a 5-element observation as ``(obs - offset) / scale``.

    The offset is zero and the scale is a fixed per-component constant, so
    this is a pure element-wise division.
    NOTE(review): the five components presumably correspond to
    (fees, mu, sigma, alpha, beta) -- confirm against the environment.
    """
    # Reference statistics kept from an earlier configuration; they appear to
    # be the mean/std of the wider, currently commented-out distributions:
    #     mean = [0.0051, 0.0000, 0.0501, 50.50, 550.00]
    #     std  = [0.0029, 0.0289, 0.0288, 28.87, 260.10]
    offset = np.zeros(5)
    scale = np.array([0.01, 1.0, 0.05, 50, 500])

    centered = obs - offset
    return centered / scale

def action_scale_fn(action):
    """Multiply a raw action by the constant factor 5e+4."""
    scale_factor = 5e+4
    return action * scale_factor

def reward_scale_fn(reward):
    """Identity reward scaling: return ``reward`` unchanged."""
    return reward

# Wrap the environment with the observation / action / reward scaling
# functions defined above.
# NOTE(review): this rebinds `env` to a wrapper around itself, so running
# this cell more than once without rebuilding `env` wraps it again.
env = ScaleWrapper(
    env,
    obs_scale_fn,
    action_scale_fn,
    reward_scale_fn,
)

In [5]:
# Seed torch's global RNG before anything that may draw from it is built
# (e.g. parameter initialization of the model).
torch.manual_seed(seeds[6])

# Observation has 5 entries; the action has one entry per liquidity interval.
obs_size = 5
action_size = len(liquidity_bins) - 1
hidden_sizes = (128, 64)
model = DeepActorModel(obs_size, action_size, hidden_sizes, hidden_sizes)

# Separate Adam settings for actor and critic: the critic uses a 10x larger
# learning rate, both share the same weight decay.
actor_opt_kwargs = {'lr': 1e-4, 'weight_decay': 1e-5}
critic_opt_kwargs = {'lr': 1e-3, 'weight_decay': 1e-5}
agent = DDPG(
    model=model,
    gamma=0.99,
    tau=1e-3,
    optimizer=optim.Adam,
    actor_optimizer_kwargs=actor_opt_kwargs,
    critic_optimizer_kwargs=critic_opt_kwargs,
    clip_gradients=5.0,
)

# Schedule for the exploration-noise standard deviation.
# NOTE(review): arguments are presumably (initial value, floor, decay rate)
# -- confirm against ExponentialScheduler.
noise_std_schedule = ExponentialScheduler(0.2, 0.01, 0.997)
exploration_kwargs = {
    'size': (action_size, ),
    'std': noise_std_schedule,
}
train_args = TrainArgs(
    train_steps=1000,
    batch_size=64,
    memory_size=100000,
    exploration_noise=GaussianProcess,
    noise_kwargs=exploration_kwargs,
    update_start=50,
    update_freq=4,
    # Lower bound is strictly positive; there is no upper bound.
    clip_actions=(1e-6, np.inf),
    seed=seeds[7],
)
trainer = DDPGTrainer(agent, env, train_args)

In [6]:
# Display the module repr to inspect the critic and actor layer stacks.
model

DeepActorModel(
  (critic_layers): Sequential(
    (0): Linear(in_features=11, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
  (actor_layers): Sequential(
    (0): Linear(in_features=5, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=6, bias=True)
  )
)

In [7]:
%%time
# Run the full training loop and keep its return value in `rewards`.
# Long-running: the saved output of this cell reports a wall time of
# several hours, so avoid re-running it casually.
rewards = trainer.train()

  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta
  return (-alpha + np.sqrt((alpha + beta * q0) ** 2 + 2 * beta * dy)) / beta


Episode:    50 | Mean Score: -0.006227
Episode:   100 | Mean Score: -0.004511
Episode:   150 | Mean Score: -0.004774
Episode:   200 | Mean Score: -0.004841
Episode:   250 | Mean Score: -0.004872
Episode:   300 | Mean Score: -0.004709
Episode:   350 | Mean Score: -0.004488
Episode:   400 | Mean Score: -0.004648
Episode:   450 | Mean Score: -0.004553
Episode:   500 | Mean Score: -0.004572
Episode:   550 | Mean Score: -0.004180
Episode:   600 | Mean Score: -0.004212
Episode:   650 | Mean Score: -0.004023
Episode:   700 | Mean Score: -0.003922
Episode:   750 | Mean Score: -0.003728
Episode:   800 | Mean Score: -0.003821
Episode:   850 | Mean Score: -0.003608
Episode:   900 | Mean Score: -0.003582
Episode:   950 | Mean Score: -0.003508
Episode: 1,000 | Mean Score: -0.003379
CPU times: user 30.2 s, sys: 45.9 s, total: 1min 16s
Wall time: 5h 42min 15s
