In [1]:
import numpy as np
import gym
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor
import itertools

In [2]:
env = gym.make("MountainCar-v0")

print(env)

<TimeLimit<MountainCarEnv<MountainCar-v0>>>


In [3]:
def eps_greedy(est,eps,nA):
    
    
    def policy(obs):
        A = np.ones(nA,dtype=float) * eps / nA
        q_val = est.predict(obs)
        best_action = np.argmax(q_val)
        A[best_action] += (1.0 - eps)
        
        return A
    return policy

In [39]:
def generate_sample():
    
    sample = np.array([env.observation_space.sample() for x in range(10000)])
    
    scaler = StandardScaler()
    scaler.fit(sample)

    featurizer = sklearn.pipeline.FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
            ("rbf4", RBFSampler(gamma=0.5, n_components=100))
            ])
    
    featurizer.fit(scaler.transform(sample))
    return scaler,featurizer
    


In [40]:
gen = generate_sample()

print(dir(gen))

['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'count', 'index']


In [57]:

class ValueEstimator():
    def __init__(self):
        
        
        self.target = env.action_space.n
        self.model = SGDRegressor()
        self.model.partial_fit([self.featurize_state(env.reset())], [0])
        
    def featurize_state(self, state):
        
        scaler,featurizer = generate_sample()
        scaled = scaler.transform([state])
        
        featurized = featurizer.transform(scaled)
        return featurized[0]
        
    def predict(self,s):
        
        features = self.featurize_state(s)
        return self.model.predict([features])[0]
    

    
    def __str__(self):
    
        return str(self.target)

    def update(self,s,y):
        
        features = self.featurize_state(s)
        
        return self.model.fit([features],[y])
    
    

In [70]:
def value_est(env,est,ep=1,discount=1.0,eps=0.1,eps_dec=1.0):
    
    nA = env.action_space.n
    
    reward_ = []
    t_ = []
    
    for i in range(ep):
        
        policy = eps_greedy(est,eps*eps_dec**i,nA)
        
        state = env.reset()
        
        for t in itertools.count():
            
            prob = policy(state)
            action = np.random.choice(np.arange(len(prob)))
            
            next_state,reward,done,_ = env.step(action)
            
            values_next = est.predict(next_state)
            
            td_tar = reward + discount * np.max(values_next)
            
            est.update(state,td_tar)
            
            print("next state => {} reward => {} action => {}".format(next_state,reward,prob))
            
            # stats
            
            reward_.append(reward)
            t_.append(t)
            
            if done == True:
                break
            
            state = next_state
    
    return reward_,t_
    

In [71]:
est = ValueEstimator()
val = value_est(env,est)

LOG state => [-0.41392973  0.        ] next state => [-4.13737492e-01  1.92241077e-04] reward => -1.0 action => [0.93333333 0.03333333 0.03333333]
LOG state => [-4.13737492e-01  1.92241077e-04] next state => [-0.41535437 -0.00161688] reward => -1.0 action => [0.93333333 0.03333333 0.03333333]
LOG state => [-0.41535437 -0.00161688] next state => [-0.4187689  -0.00341452] reward => -1.0 action => [0.93333333 0.03333333 0.03333333]
LOG state => [-0.4187689  -0.00341452] next state => [-0.42195675 -0.00318785] reward => -1.0 action => [0.93333333 0.03333333 0.03333333]


KeyboardInterrupt: 