# HW4 Model-based Reinforcement Learning

In [None]:
import sys, os
import tensorflow as tf
import datetime as dt

# Environment check for the notebook: choose the CUDA device and print what
# TensorFlow reports about its version and available devices.

# Expose only the CUDA device with index 1 to this process.
# NOTE(review): tensorflow is already imported at this point; presumably the
# setting still takes effect because it is applied before the first device
# query below — confirm on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '1' # 0 is default GPU

print(tf.__version__)

from tensorflow.python.client import device_lib

# Ask TensorFlow for the devices it reports as local.
local_device_protos = device_lib.list_local_devices()

# Print the device listing, then TensorFlow's own GPU availability check and
# the name of the GPU device it reports.
print(local_device_protos)
print(tf.test.is_gpu_available())
print(tf.test.gpu_device_name())

In [None]:
# main.py

import os
import argparse
import time
import numpy as np

from half_cheetah_env import HalfCheetahEnv
from logger import logger, LoggerClass
from model_based_rl import ModelBasedRL
from model_based_rl import logger as model_logger
from collections import defaultdict

def run_mbrl():
    """Parse the command line in ``sys.argv`` and run one MBRL experiment.

    The positional ``question`` argument selects which ``ModelBasedRL``
    routine is executed (``run_q1``, ``run_q2`` or ``run_q3``). Results are
    logged to ``<cwd>/data/<env>_<question>_<exp_name>/log.txt``; when
    ``--exp_name`` is omitted a timestamp is used instead.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid.
        AssertionError: if the experiment directory already exists.
    """
    parser = argparse.ArgumentParser()
    # `choices` must be a sequence of the individual question names. The
    # previous single string 'q1, q2, q3' made argparse test membership
    # against a string, so the accepted values did not match the three
    # questions handled by the dispatch table below.
    parser.add_argument('question', type=str, choices=('q1', 'q2', 'q3'))
    parser.add_argument('--exp_name', type=str, default=None)
    parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',))
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--mpc_horizon', type=int, default=15)
    parser.add_argument('--num_random_action_selection', type=int, default=4096)
    parser.add_argument('--nn_layers', type=int, default=1)
    args = parser.parse_args()

    data_dir = os.path.join(os.getcwd(), 'data')
    exp_name = '{0}_{1}_{2}'.format(args.env,
                                    args.question,
                                    args.exp_name if args.exp_name else time.strftime("%d-%m-%Y_%H-%M-%S"))
    exp_dir = os.path.join(data_dir, exp_name)
    # Refuse to reuse an existing experiment directory.
    assert not os.path.exists(exp_dir),\
        'Experiment directory {0} already exists. Either delete the directory, or run the experiment with a different name'.format(exp_dir)
    os.makedirs(exp_dir, exist_ok=True)

    logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug')

    # Map environment names to their classes and instantiate only the one
    # that was requested.
    env_classes = {
        'HalfCheetah': HalfCheetahEnv,
    }
    env = env_classes[args.env]()

    mbrl = ModelBasedRL(env=env,
                        render=args.render,
                        mpc_horizon=args.mpc_horizon,
                        num_random_action_selection=args.num_random_action_selection,
                        nn_layers=args.nn_layers,
                        # Random suffix gives every run a distinct scope name.
                        scope='scope-' + str(np.random.rand()))
    # Debug output: state of both logger objects after model construction.
    print('2 tabular:', logger._tabular, ', recorded:', logger._curr_recorded)
    print('3 tabular:', model_logger._tabular, ', recorded:', model_logger._curr_recorded)

    run_func = {
        'q1': mbrl.run_q1,
        'q2': mbrl.run_q2,
        'q3': mbrl.run_q3
    }[args.question]

    # Reset the tabular state of the logger used by model_based_rl before the
    # run starts, so repeated calls in one process begin from an empty table.
    model_logger._tabular = defaultdict(list)
    model_logger._curr_recorded.clear()
    model_logger._num_dump_tabular_calls = 0

    run_func()
    


## Q1


In [None]:
%tb

# Run question 1 inside a fresh graph so that it does not share TensorFlow
# state with other cells of the notebook.
g = tf.Graph()
# Enter the graph first and bind the session to it explicitly. A bare
# tf.Session() created before g.as_default() is entered attaches to the
# global default graph instead of g.
with g.as_default(), tf.Session(graph=g) as sess:
    # run_mbrl() reads its options through argparse, so emulate the command
    # line; the leading 'python' token is dropped to mimic a real sys.argv.
    sys.argv = 'python main.py q1 --exp_name exp'.split()[1:]
    print('starts running at', dt.datetime.now())
    run_mbrl()
    print('finished running at', dt.datetime.now())



## Q2

* RandomPolicy should return ReturnAverage around -160
* Trained Policy should return ReturnAverage around 0


In [None]:
%tb

# Run question 2 inside a fresh graph so that it does not share TensorFlow
# state with other cells of the notebook.
g = tf.Graph()
# Enter the graph first and bind the session to it explicitly. A bare
# tf.Session() created before g.as_default() is entered attaches to the
# global default graph instead of g.
with g.as_default(), tf.Session(graph=g) as sess:
    # run_mbrl() reads its options through argparse, so emulate the command
    # line; the leading 'python' token is dropped to mimic a real sys.argv.
    sys.argv = 'python main.py q2 --exp_name exp'.split()[1:]
    print('starts running at', dt.datetime.now())
    run_mbrl()
    print('finished running at', dt.datetime.now())



## Q3a

* RandomPolicy should return ReturnAverage around -160
* Trained Policy should return ReturnAverage around 300 by the 10th iteration


In [None]:
%tb

# Run question 3 with default settings inside a fresh graph so that it does
# not share TensorFlow state with other cells of the notebook.
g = tf.Graph()
# Enter the graph first and bind the session to it explicitly. A bare
# tf.Session() created before g.as_default() is entered attaches to the
# global default graph instead of g.
with g.as_default(), tf.Session(graph=g) as sess:
    # run_mbrl() reads its options through argparse, so emulate the command
    # line; the leading 'python' token is dropped to mimic a real sys.argv.
    sys.argv = 'python main.py q3 --exp_name default'.split()[1:]
    print('starts running at', dt.datetime.now())
    run_mbrl()
    print('finished running at', dt.datetime.now())



In [None]:
import plot
%matplotlib inline

# Emulated command line for the plotting script: plot the default q3
# experiment and save the figure under the same name.
cmd = 'python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default'

# Tokenise the command, then discard the leading 'python' token so that
# sys.argv starts with the script name.
sys.argv = cmd.split()
del sys.argv[:1]

print('starts plot ', sys.argv, ' at', dt.datetime.now())
plot.main()
print('finished plot ', sys.argv, ' at', dt.datetime.now())

## Q3b

In [None]:
%tb

# Hyperparameter sweep for question 3: vary the number of random action
# sequences, the MPC horizon and the number of network layers, one setting
# per run.
cmd_list = [
    'python main.py q3 --exp_name action128 --num_random_action_selection 128',
    'python main.py q3 --exp_name action4096 --num_random_action_selection 4096',
    'python main.py q3 --exp_name action16384 --num_random_action_selection 16384',
    'python main.py q3 --exp_name horizon10 --mpc_horizon 10',
    'python main.py q3 --exp_name horizon15 --mpc_horizon 15',
    'python main.py q3 --exp_name horizon20 --mpc_horizon 20',
    'python main.py q3 --exp_name layers1 --nn_layers 1',
    'python main.py q3 --exp_name layers2 --nn_layers 2',
    'python main.py q3 --exp_name layers3 --nn_layers 3',
    ]

for cmd in cmd_list:
    # Every experiment gets its own graph so runs do not share variables.
    g = tf.Graph()
    # Enter the graph first and bind the session to it explicitly. A bare
    # tf.Session() created before g.as_default() is entered attaches to the
    # global default graph instead of g.
    with g.as_default(), tf.Session(graph=g) as sess:
        # run_mbrl() reads its options through argparse; drop the leading
        # 'python' token to mimic a real sys.argv.
        sys.argv = cmd.split()[1:]
        print('starts running at', dt.datetime.now())
        run_mbrl()
        print('finished running at', dt.datetime.now())


In [None]:
import plot
%matplotlib inline

# One emulated plot command per swept hyperparameter: each compares three
# experiments and saves the figure under the name given by --save.
cmd_list = [
    'python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions',
    'python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon',
    'python plot.py --exps HalfCheetah_q3_layers1 HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers',
    ]

for cmd in cmd_list:
    # Tokenise the command, then discard the leading 'python' token so that
    # sys.argv starts with the script name.
    sys.argv = cmd.split()
    del sys.argv[:1]

    print('starts plot ', sys.argv, ' at', dt.datetime.now())
    plot.main()
    print('finished plot ', sys.argv, ' at', dt.datetime.now())