# **Behavioral Cloning**

## Install & Import packages/libraries

In [1]:
!pip install pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools > /dev/null 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install ipython==7.10.0 > /dev/null 2>&1
!pip install gymnasium[classic-control] > /dev/null 2>&1

In [2]:
import gymnasium as gym
from gymnasium.wrappers.record_video import RecordVideo
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a non-visible 1400x900 virtual display before any environment is rendered.
# NOTE(review): presumably backed by the xvfb package installed in the first cell, so that
# rendering works on a machine without a physical screen — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video():
    """Embed the most recently written MP4 from ``video/`` in the notebook output.

    Looks for ``video/*.mp4`` relative to the current working directory. When at
    least one file exists, the newest one (by modification time) is base64-encoded
    and displayed inline as an HTML ``<video>`` element. When none exists, the
    message ``Could not find video`` is printed instead.

    Returns:
        None
    """
    import os  # local import: `os` is not among this notebook's top-level imports

    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # glob returns matches in arbitrary order; pick the newest recording so a
    # clip left over from an earlier run is never shown by accident.
    mp4 = max(mp4list, key=os.path.getmtime)

    # Read-only binary mode inside a context manager: the file is only read,
    # and the handle is closed deterministically.
    with open(mp4, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    video_tag = '''<video alt="test" autoplay loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                    </video>'''.format(encoded.decode('ascii'))
    ipythondisplay.display(HTML(data=video_tag))

def wrap_env(env):
    """Return ``env`` wrapped in a ``RecordVideo`` recorder.

    Recordings are written to the ``./video`` folder and the wrapper is
    created with ``disable_logger=True``.

    Args:
        env: The environment to record.

    Returns:
        The ``RecordVideo``-wrapped environment.
    """
    return RecordVideo(env, './video', disable_logger=True)

## Pendulum Visualization
Environment documentation: https://gymnasium.farama.org/environments/classic_control/pendulum/

In [3]:
# Create the Pendulum environment and summarize its spaces, reward range and episode limit
env = gym.make("Pendulum-v1")

# Each entry pairs a label template with the value to report
env_summary = [
    ("Observation Space:\t{}", env.observation_space),
    ("Action Space:\t\t{}", env.action_space),
    ("Reward Range:\t\t{}", env.reward_range),
    ("Max Episode Steps:\t{}", env.spec.max_episode_steps),
]
for template, value in env_summary:
    print(template.format(value))

Observation Space:	Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
Action Space:		Box(-2.0, 2.0, (1,), float32)
Reward Range:		(-inf, inf)
Max Episode Steps:	200


In [4]:
import numpy as np

# Build the Pendulum environment in rgb_array render mode and attach the video recorder
# (the wrapper is what enables rendering on google colab)
env = wrap_env(gym.make("Pendulum-v1", render_mode="rgb_array"))

# Begin a new episode
obs, info = env.reset()

total_reward = 0
for t in range(200):
    # Draw a random action from the action space.
    # To apply a fixed action from [-2.0, 2.0] instead, use e.g. np.array([2.0]).
    action = env.action_space.sample()

    # Advance the environment by one step and accumulate the reward
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    # Render environment to virtual display
    env.render()

    # Stop once the episode is over, reporting which signal ended it
    # (termination takes precedence when both are set)
    if terminated or truncated:
        end_message = "Terminated. {} steps" if terminated else "Truncated. {} steps"
        print(end_message.format(t + 1))
        break

# Release the environment (and its recorder)
env.close()

print('Total Reward: {:.2f}'.format(total_reward))
show_video()

Truncated. 200 steps
Total Reward: -959.65


## Pendulum Expert Behavior

In [14]:
import pickle
from sklearn.utils import shuffle

# Read the pickled expert demonstrations from disk.
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
with open("Pendulum-v1_expert_demo.pkl", 'rb') as demo_file:
    demos = pickle.load(demo_file)
demos = shuffle(demos)

# Expert performance: sum the rewards of each demonstration, then average over demonstrations
episode_returns = [np.sum(episode['rewards']) for episode in demos]
exp_ret = np.mean(episode_returns)
print("Expert's Average Cumulative Rewards: {:.3f}".format(exp_ret))

Expert's Average Cumulative Rewards: -79.331


In [22]:
# Flatten all demonstrations into per-timestep observation and action rows.
# Both lists are indexed over the length of each demonstration's 'observes' entry.
demo_observations = [
    demo['observes'][t_idx]
    for demo in demos
    for t_idx in range(len(demo['observes']))
]
demo_actions = [
    demo['actions'][t_idx]
    for demo in demos
    for t_idx in range(len(demo['observes']))
]

# Convert the row lists into numpy arrays
demo_observations = np.asarray(demo_observations)
demo_actions = np.asarray(demo_actions)

# Shuffle observations and actions together to break temporal correlation
demo_observations, demo_actions = shuffle(demo_observations, demo_actions)
print('There are a total of {} samples!'.format(len(demo_observations)))

######## TO DO #########
# Choose number of samples you want to use!
num_used_samples = 200
demo_observations = demo_observations[:num_used_samples, :]
demo_actions = demo_actions[:num_used_samples, :]
########################

# Sanity-check the shapes: 3 observation features and 1 action dimension per sample
n = demo_observations.shape[0]
assert demo_observations.shape == (n, 3)
assert demo_actions.shape == (n, 1)

# Report how many samples are kept and the resulting array shapes
print("Using only {} samples.".format(n))
print("Observation data:\t{}".format(demo_observations.shape))
print("Action data:\t\t{}".format(demo_actions.shape))

There are a total of 20000 samples!
Using only 200 samples.
Observation data:	(200, 3)
Action data:		(200, 1)


## Gaussian Process Regression

In [34]:
# We will use GPR to learn from these expert demonstrations by setting them as targets!
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

# Set hyperparameters
###### TO DO #######
init_lambda = 1.1  # initial value passed to the RBF kernel (its length scale)
init_beta = 1.1    # initial value passed to the ConstantKernel
init_sigma = 0.1   # passed to the regressor as `alpha`
####################

# Initialize GPR: a constant kernel times an RBF kernel, each with its own search bounds.
# `random_state` is fixed so the 20 optimizer restarts start from the same initial points on
# every run; without it, refitting on identical data can yield a different model.
kernel = ConstantKernel(init_beta, (1e-3, 1e3)) * RBF(init_lambda, (1e-2, 1e6))
gp = GaussianProcessRegressor(
    kernel=kernel,
    alpha=init_sigma,
    n_restarts_optimizer=20,
    random_state=0,
)

# Normalize observations
###### TO DO #######
demo_obs_mean = np.mean(demo_observations, axis=0, keepdims=True)
demo_obs_std = np.std(demo_observations, axis=0, keepdims=True)
# Guard against a constant feature: a zero standard deviation would turn the division below
# into NaN/inf. Such a feature is left centered but unscaled instead.
demo_obs_std = np.where(demo_obs_std > 0, demo_obs_std, 1.0)
nz_demo_observations = (demo_observations - demo_obs_mean) / demo_obs_std
####################

# The statistics keep a leading axis of length 1 so they broadcast over samples
assert demo_obs_mean.shape == (1, 3)
assert demo_obs_std.shape == (1, 3)
n = len(demo_observations)
assert nz_demo_observations.shape == (n, 3)

# Fit the GP on normalized observations with the expert actions as targets
gp.fit(nz_demo_observations, demo_actions)

In [35]:
# Make environment
env = gym.make("Pendulum-v1", render_mode="rgb_array")

# Wrap environment to enable rendering on google colab
env = wrap_env(env)

# Reset environment
obs, info = env.reset()

total_reward = 0
try:
    for t in range(200):
        # Use GPR to determine actions
        obs = np.reshape(obs, [1, -1])
        ############# TO IMPLEMENT ################
        # Apply the same normalization that was used for the training observations
        nz_obs = (obs - demo_obs_mean) / demo_obs_std
        action = gp.predict(nz_obs)
        # The shape returned by `predict` for a single-column target is not guaranteed to be the
        # same across scikit-learn versions; reshape explicitly to the environment's action shape
        # so the reward and next observation stay scalar / flat.
        action = np.reshape(action, env.action_space.shape)
        ###########################################

        # Environment step
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward

        # Render environment to virtual display
        env.render()

        if terminated:
            print("Terminated. {} steps".format(t + 1))
            break

        if truncated:
            print("Truncated. {} steps".format(t + 1))
            break
finally:
    # Close environment even if a step fails, so the video recorder is always released
    env.close()

print('Total Reward: {:.2f}'.format(total_reward))
show_video()

  logger.warn(


Truncated. 200 steps
Total Reward: -125.54


In [36]:
# Screenshot the following and submit!
# MODIFY THE FOLLOWING
######################################
# Banner lines printed one per call, followed by the recorded rollout video.
# `total_reward` is whatever the most recently executed rollout cell left behind.
report_lines = [
    '\n',
    '=============================',
    'Name: 강명훈',
    'Student ID: 2019-14166',
    'Total Reward: {:.2f}'.format(total_reward),
    '=============================',
]
for report_line in report_lines:
    print(report_line)
######################################
show_video()



Name: 강명훈
Student ID: 2019-14166
Total Reward: -125.54
