In [1]:
import pandas as pd
import numpy as np
import gym
import random

from importlib import reload
import sys

from scipy.stats import poisson, beta, norm
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("dark")

In [16]:
# Shell escape: ask the `jt` command-line tool to list its available themes.
# This only affects notebook appearance and is not part of the experiments.
!jt -l

Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [17]:
#jt -t oceans16

### Plan

Experiments on MountainCar, CartPole, and Acrobot (environments with discrete action spaces) using a small DQN-QR network
- Methods: DQN-QR (baseline), Parallel DQN-QR, Parallel DQN-QR with Biased Risk Profile (DDQN-QR-B), DDQN-QR-B with normalizing bias
- Implement n-step returns and compare performance

1. Experiment 1: Performance and learning on 5k time steps repeated 10 times for each method
    - Plot avg learning curve with 50% std deviation
    - Evaluation performance box plot 
2. Experiment 2: Priority replay experiments on 5k time steps, repeated 10 times for each method. Run on the best method from the prior experiment
    - Hypothesized ideal formulation: resource allocation based on softmax over UCB. Then priority sampling 
    - Compare Uniform, Priority, Thread-Based Priority, Thread-Based Priority with Exploration Bonus
    - Learning curves with 50% std deviation

### Test and Improve Baselines

In [2]:
def get_df(results):
    """Reshape list-valued results into a long DataFrame with a 2-level index.

    Parameters
    ----------
    results : data accepted by ``pd.DataFrame``
        Must yield exactly two rows, which are labelled 'Average Reward' and
        '% Optimal Action'. Every cell is expected to hold a list-like, and
        all cells in a row must have the same length so the columns can be
        exploded together.

    Returns
    -------
    pd.DataFrame
        One row per list element. The index is a two-level MultiIndex of
        (statistic label, position of the element within that statistic).
    """
    row_labels = ['Average Reward', '% Optimal Action']
    wide = pd.DataFrame(results, index=row_labels)

    # Unpack the list-valued cells of every column in lockstep.
    long = wide.explode(column=list(wide.columns))

    # After exploding, each statistic label repeats once per element; number
    # the repeats so (label, position) identifies a row uniquely.
    position = long.groupby(long.index).cumcount()
    long.index = pd.MultiIndex.from_arrays([long.index, position])

    return long

In [49]:
# Project-local modules, imported as module objects so they can be reloaded.
import parallel_qr, dqn_qr, rl_utils

# Re-execute the modules so edits to their source files are picked up
# without restarting the kernel.
# NOTE(review): rl_utils is reloaded first, presumably because the other two
# modules import from it — confirm before reordering.
reload(rl_utils)
reload(dqn_qr)
reload(parallel_qr)

# Bind the names after the reloads so they refer to the refreshed definitions.
from dqn_qr import train_qr, evaluate, QR_Network
from parallel_qr import Biased_QR_Network, train_parallel_qr
from rl_utils import ParallelReplayMemory, TPReplayMemory

In [76]:
%%time

#test sequential DQN-QR
env_name = 'MountainCar-v0'
env = gym.make(env_name)

N = 16
alpha = 1e-3
eps_start, eps_end, eps_dec = 0.9, 0.05, 500
eps = lambda steps: eps_end + (eps_start - eps_end) * np.exp(-1. * steps / eps_dec)
eps = lambda steps: .05

qr = QR_Network(state_dims=env.observation_space.shape[0], num_actions=env.action_space.n, n=N, alpha=alpha)
qr_target = QR_Network(state_dims=env.observation_space.shape[0], num_actions=env.action_space.n, n=N, alpha=alpha)

qr, G = train_qr(env, qr, qr_target, gamma=1., num_episodes=1000, batch_size=64, eps=eps)

Episode 0 Return -200.0
Episode 50 Return -193.96
Episode 100 Return -175.03
Episode 150 Return -167.28
Episode 200 Return -156.44
Episode 250 Return -148.58
Episode 300 Return -146.44
Episode 350 Return -144.72
Episode 400 Return -143.04
Episode 450 Return -140.98
Episode 500 Return -138.53
Episode 550 Return -137.2
Episode 600 Return -135.6
Episode 650 Return -133.88
Episode 700 Return -132.82
Episode 750 Return -131.64
Episode 800 Return -131.39
Episode 850 Return -130.88
Episode 900 Return -130.17
Episode 950 Return -129.85
CPU times: user 2min 26s, sys: 8min 29s, total: 10min 56s
Wall time: 1min 53s


In [77]:
# Evaluate the network trained above for 100 episodes and summarise the
# result. Keyword arguments are used so the call matches the other
# `evaluate` calls in this notebook and does not depend on parameter order.
r = evaluate(env, qr, gamma=1, num_episodes=100)
r.describe()

count    100.000000
mean    -100.910000
std        4.841101
min     -104.000000
25%     -103.000000
50%     -103.000000
75%     -101.750000
max      -84.000000
dtype: float64

In [41]:
# Shared configuration for the parallel training runs below.
num_processes = 5

eps = .1

# Constructor arguments unpacked into Biased_QR_Network(**params).
params = {
    'state_dims': env.observation_space.shape[0],
    'num_actions': env.action_space.n,
    'n': N,
    'alpha': alpha,
    'decay': 0,
}

# Keyword arguments unpacked into train_parallel_qr(**training_params).
# Later cells add or overwrite entries before each run.
training_params = {
    'env': env,
    'gamma': 1.,
    'iterations': 20000,
    'batch_size': 128,
    'epsilon': eps,
    'num_processes': num_processes,
    'refresh_rate': 50,
}

In [50]:
# Test biased DDQN-QR.
qr_input = Biased_QR_Network(**params)

# NOTE(review): these two buffers are constructed here but not referenced
# again in this cell; training receives a memory class and size instead.
p_replay = ParallelReplayMemory(20000, num_processes)
tp_replay = TPReplayMemory(20000, num_processes)

# Overlay the run-specific settings onto the shared training configuration.
# The dictionary is mutated in place, so these entries persist for later cells.
training_params.update({
    'filename': 'model_scripted.pt',
    'decay_rates': np.array([-.2, -.1, .0, .1, .2]),
    'qr': qr_input,
    'memory_type': ParallelReplayMemory,
    'memory_size': 20000,
})
qr_output = train_parallel_qr(**training_params)

Process Process-45:
Process Process-46:
Process Process-44:
Process Process-42:
Process Process-47:
Process Process-43:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/sonesh/Desktop/Thesis/Classic Control/parallel_qr.py", line 135, in process_data
    if shared_memory.size() < batch_size: continue
       ^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 2, in size
  File "/opt/anaconda3/lib/python3.11/multiprocessing/managers.py", line 822, in _callmethod
    kind, result = conn.recv()
                   ^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/multiprocessing/connection.py",

KeyboardInterrupt: 

In [34]:
# Evaluate for 100 episodes and summarise the returned values.
# NOTE(review): this evaluates `qr`, not the `qr_output` assigned by
# train_parallel_qr in the preceding cell — confirm which network is intended.
G_test = evaluate(env, qr, gamma=1, num_episodes=100)
G_test.describe()

count    100.0
mean    -200.0
std        0.0
min     -200.0
25%     -200.0
50%     -200.0
75%     -200.0
max     -200.0
dtype: float64

In [16]:
# Scratch check: map each of four thread ids to an initial value of 1.
threads = [*range(4)]
a = {thread_id: 1 for thread_id in threads}
a

{0: 1, 1: 1, 2: 1, 3: 1}

In [None]:
# Print each parameter of the input network followed by the parameter at the
# same position in the trained output network, so the two can be compared.
param_pairs = zip(qr_input.parameters(), qr_output.parameters())
for before, after in param_pairs:
    print('input', before.data)
    print('output', after.data)

In [30]:
# Test unbiased DDQN-QR: same parallel trainer, but with no decay rates.

# Fresh network for this run. It is passed to the trainer below so that
# training starts from this network rather than from the `qr` network already
# trained by the sequential baseline, and so the input/output parameter
# comparison cell compares the network that was actually trained.
qr_input = Biased_QR_Network(**params)
training_params['decay_rates'] = []
training_params['qr'] = qr_input
training_params['iterations'] = 20000
training_params['memory_type'] = ParallelReplayMemory
training_params['memory_size'] = 10000

qr_output = train_parallel_qr(**training_params)

In [35]:
# Evaluate the network returned by train_parallel_qr for 100 episodes and
# summarise the returned values.
G_test = evaluate(env, qr_output, gamma=1, num_episodes=100)
G_test.describe()

count    100.0
mean    -200.0
std        0.0
min     -200.0
25%     -200.0
50%     -200.0
75%     -200.0
max     -200.0
dtype: float64