In [11]:
import gym
from sklearn.linear_model import SGDRegressor
import plotting
import numpy as np

env = gym.envs.make("MountainCar-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [2]:
samples = [env.observation_space.sample() for i in range(10000)]

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.kernel_approximation import RBFSampler


scaler = StandardScaler()

scaler.fit(samples)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [4]:
# what is RBF sampler doing here

featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
featurizer.fit(scaler.transform(samples))

FeatureUnion(n_jobs=1,
       transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=100, random_state=None)), ('rbf2', RBFSampler(gamma=2.0, n_components=100, random_state=None)), ('rbf3', RBFSampler(gamma=1.0, n_components=100, random_state=None)), ('rbf4', RBFSampler(gamma=0.5, n_components=100, random_state=None))],
       transformer_weights=None)

In [5]:
class Estimator():
    """
    Value Function approximator. 
    """
    
    def __init__(self):
        self.models = []
        # creating an SGDRegressor for each action 
        # 1 model for every action
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)
    
    def featurize_state(self, state):
        """
        Returns the featurized representation for a state.
        """
        scaled = scaler.transform([state])
        featurized = featurizer.transform(scaled)
        return featurized[0]
    
    def predict(self, s, a=None):
        features = self.featurize_state(s)
        if not a:
            return np.array([m.predict([features])[0] for m in self.models])
        else:
            return self.models[a].predict([features])[0]
    
    def update(self, s, a, y):
        """
        Updates the estimator parameters for a given state and action towards
        the target y.
        
        y should be the reward - how do we have y?
        
        """
        features = self.featurize_state(s)
        self.models[a].partial_fit([features], [y])

In [6]:
def make_epsilon_greedy_policy(estimator, epsilon, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.
    
    Args:
        estimator: An estimator that returns q values for a given state - this can be anything
        for now, it's an SGD - lets change this and try
        
        epsilon: The probability to select a random action . float between 0 and 1.
        
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes the observation as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.
    
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        # get q values for each action as per our approximation
        q_values = estimator.predict(observation)
        best_action = np.argmax(q_values)
        
        # what is this??
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn


In [13]:
import IPython.core.debugger
dbg = IPython.core.debugger.Pdb()

def q_learning(env, estimator, num_episodes, discount_factor=1.0, epsilon=0.1, epsilon_decay=1.0):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    
    Args:
        env: OpenAI environment.
        estimator: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        discount_factor: Gamma discount factor.
        epsilon: Chance the sample a random action. Float betwen 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))    
    
    dbg.set_trace()
    
    for i_episode in range(num_episodes):
        
        dbg.set_trace()
        
        # The policy we're following
        policy = make_epsilon_greedy_policy(
            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)
        
        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        dbg.set_trace()

        sys.stdout.flush()
        
        dbg.set_trace()
        
        # Reset the environment and pick the first action
        state = env.reset()
        
        # Only used for SARSA, not Q-Learning
        next_action = None
        
        # One step in the environment
        for t in itertools.count():
                        
            # Choose an action to take
            # If we're using SARSA we already decided in the previous step
            if next_action is None:
                action_probs = policy(state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = next_action
            
            # Take a step
            next_state, reward, done, _ = env.step(action)
    
            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            
            # TD Update
            q_values_next = estimator.predict(next_state)
            
            # Use this code for Q-Learning
            # Q-Value TD Target
            td_target = reward + discount_factor * np.max(q_values_next)
            
            # Use this code for SARSA TD Target for on policy-training:
            # next_action_probs = policy(next_state)
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)             
            # td_target = reward + discount_factor * q_values_next[next_action]
            
            # Update the function approximator using our target
            estimator.update(state, action, td_target)
            
            print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, num_episodes, last_reward), end="")
                
            if done:
                break
                
            state = next_state
    
    return stats

In [None]:
estimator = Estimator()
stats = q_learning(env, estimator, 100, epsilon=0.0)



> [0;32m<ipython-input-13-35024dcb2bc0>[0m(28)[0;36mq_learning[0;34m()[0m
[0;32m     26 [0;31m    [0mdbg[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m[0;34m[0m[0m
[0m[0;32m---> 28 [0;31m    [0;32mfor[0m [0mi_episode[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mnum_episodes[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m     29 [0;31m[0;34m[0m[0m
[0m[0;32m     30 [0;31m        [0mdbg[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> c
> [0;32m<ipython-input-13-35024dcb2bc0>[0m(33)[0;36mq_learning[0;34m()[0m
[0;32m     31 [0;31m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0;31m# The policy we're following[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 33 [0;31m        policy = make_epsilon_greedy_policy(
[0m[0;32m     34 [0;31m            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)
[0m[0;32m     35 [0;31m[0;34m[0m[0m
[0m
ip