In [None]:
import gym
from matplotlib import animation
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Module-level CartPole-v1 environment shared by the rollout functions
# (one_cartpole_run / record_cartpole_run), which read it as a global.
ENV = gym.make('CartPole-v1')
# Initial reset; the returned observation is discarded here because every
# rollout function calls ENV.reset() itself before playing an episode.
ENV.reset()

In [None]:
def initialize_random_weights(mean, std, size=4):
    """Draw an initial policy weight vector from a normal distribution.

    Args:
        mean: Mean of the normal distribution the weights are sampled from.
        std: Standard deviation of that distribution (must be non-negative).
        size: Number of weights to draw (or an output shape accepted by
            ``np.random.normal``). Defaults to 4, the value that used to be
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        ``numpy.ndarray`` of independent samples with the requested size.
    """
    return np.random.normal(mean, std, size)

In [None]:
def sigmoid(weights, observation):
    """Logistic function of the weighted sum of an observation.

    Computes ``1 / (1 + exp(-z))`` with ``z = sum_i weights[i] * observation[i]``
    in a numerically stable way: the exponential is always taken of a
    non-positive number, so it cannot overflow for large ``|z|``.

    Args:
        weights: Sequence of weights.
        observation: Sequence of features; only the first ``len(weights)``
            entries are used, matching the original index loop.

    Returns:
        The sigmoid value, a float in ``[0, 1]``.
    """
    n_weights = len(weights)
    features = np.asarray(observation, dtype=float)[:n_weights]
    z = float(np.dot(np.asarray(weights, dtype=float), features))
    if z >= 0:
        return 1.0 / (1.0 + np.exp(-z))
    # For negative z use the algebraically equivalent exp(z) / (1 + exp(z))
    # so that exp() is never evaluated on a large positive argument.
    exp_z = np.exp(z)
    return exp_z / (1.0 + exp_z)

def grad_log_sigmoid(weights, observation, action):
    """Gradient of the log-probability of ``action`` w.r.t. ``weights``.

    The policy picks action 1 with probability ``sigmoid(weights, observation)``
    and any other action with the complementary probability.

    Args:
        weights: Policy weight vector.
        observation: Feature vector; it is multiplied by a scalar, so it is
            expected to support that (e.g. a ``numpy.ndarray``).
        action: The action that was taken; ``1`` selects the first branch,
            anything else the second.

    Returns:
        ``observation * (1 - p)`` when ``action == 1``, otherwise
        ``-observation * p``, where ``p`` is the sigmoid output.
    """
    prob_one = sigmoid(weights, observation)
    if action != 1:
        return - observation * prob_one
    return observation * (1 - prob_one)

def get_action(weights, observation):
    """Sample a binary action from the sigmoid policy.

    Args:
        weights: Policy weight vector.
        observation: Current feature vector.

    Returns:
        ``1`` with probability ``sigmoid(weights, observation)``, else ``0``
        (a plain ``int``), using one draw from ``np.random.random()``.
    """
    threshold = sigmoid(weights, observation)
    draw = np.random.random()
    if draw <= threshold:
        return 1
    return 0

In [None]:
def one_cartpole_run(weights):
    """Play one episode with the sigmoid policy and sum its log-prob gradients.

    Runs the global ``ENV`` for at most 1000 steps, sampling each action with
    ``get_action``.

    Args:
        weights: Policy weight vector.

    Returns:
        Tuple ``(cum_reward, grad_log_sum)``: the total reward collected and
        the sum over steps of ``grad_log_sigmoid`` for the (observation,
        action) pairs that were actually played.
    """
    observation = ENV.reset()
    cum_reward = 0
    grad_log_sum = np.zeros(4)
    for _ in range(1000):
        action = get_action(weights, observation)
        # The gradient must use the observation the action was sampled from.
        # Accumulate it *before* ENV.step() overwrites `observation` with the
        # next state (the original code used the post-step observation).
        grad_log_sum += grad_log_sigmoid(weights, observation, action)
        observation, reward, done, info = ENV.step(action)
        cum_reward += reward
        if done:
            break
    return cum_reward, grad_log_sum

In [None]:
def record_cartpole_run(weights):
    """Play one episode and record the observation/action pair of every step.

    Runs the global ``ENV`` for at most 1000 steps, sampling each action with
    ``get_action``.

    Args:
        weights: Policy weight vector.

    Returns:
        Tuple ``(observations, actions)`` where ``observations`` is a float
        array with one row per step holding the observation the action was
        chosen from, and ``actions`` is the list of sampled actions. Row ``i``
        pairs with ``actions[i]``; the terminal observation is not included.
    """
    observation = ENV.reset()

    visited = []
    all_actions = []
    for _ in range(1000):
        # Copy so the record cannot alias a buffer the environment may reuse.
        visited.append(np.array(observation, dtype=float))

        action = get_action(weights, observation)
        all_actions.append(action)

        observation, reward, done, info = ENV.step(action)

        if done:
            break

    # Stack once at the end instead of re-allocating the array on every step.
    return np.vstack(visited), all_actions

In [None]:
def get_grad_reward(weights, obs, actions, regression_model):
    """Policy-gradient estimate for one episode with a learned baseline.

    For every step ``i`` the log-probability gradient of the taken action is
    weighted by ``(len(actions) - i) - baseline_i``, i.e. the number of steps
    remaining in the episode minus the baseline predicted for that
    observation, and the weighted gradients are summed.

    Args:
        weights: Policy weight vector.
        obs: 2-D array of observations, row ``i`` belonging to ``actions[i]``.
        actions: Sequence of actions taken during the episode.
        regression_model: Fitted model exposing ``predict`` on a 2-D array
            (here an sklearn ``LinearRegression``) used as the baseline.

    Returns:
        ``numpy.ndarray`` of length 4 with the summed weighted gradients;
        all zeros for an empty episode.
    """
    grad_reward = np.zeros(4)
    n_steps = len(actions)
    if n_steps == 0:
        return grad_reward

    # One batched prediction instead of one predict() call per step.
    baselines = np.ravel(regression_model.predict(obs[:n_steps, :]))

    # `range` (not the Python-2-only `xrange`) keeps this runnable on Python 3.
    for i in range(n_steps):
        remaining_time = n_steps - i
        advantage = remaining_time - baselines[i]
        grad_reward += grad_log_sigmoid(weights, obs[i, :], actions[i]) * advantage
    return grad_reward

In [None]:
from sklearn import linear_model

# --- Hyper-parameters -------------------------------------------------------
batch_n = 10 ** 2          # number of gradient updates
grad_sample = 50           # episodes averaged per gradient update
weights = initialize_random_weights(0, 1)
# NOTE(review): weights_cap is defined but never applied anywhere in this
# cell; presumably it was meant to clip the weights -- confirm before using.
weights_cap = 10
learning_rate = 10 ** (-2)
# Episode length of every rollout, in play order.
current_score = [0] * (batch_n * grad_sample)

# Bootstrap baseline: fitted on a single all-zero sample so that predict()
# is usable (and returns 0) during the first batch.
regression_model = linear_model.LinearRegression()
regression_model.fit([[0, 0, 0, 0]], [0])

for i in range(batch_n):
    if i % 10 == 0:
        # Single-argument print() produces the same text on Python 2 and 3.
        print("%s %s" % (i, weights))

    avg_grad_log_sum = np.zeros(4)

    # Collected per batch and stacked once, instead of growing arrays
    # with vstack/hstack after every episode.
    batch_obs = []
    batch_returns = []

    for k in range(grad_sample):
        obs, actions = record_cartpole_run(weights)
        batch_obs.append(obs)
        # Reward-to-go for step j of a T-step episode is T - j, the same
        # quantity get_grad_reward subtracts the baseline from (the original
        # targets T - 1 - j were off by one relative to it).
        batch_returns.append(np.arange(len(obs), 0, -1))

        avg_grad_log_sum += get_grad_reward(weights, obs, actions, regression_model)
        current_score[i * grad_sample + k] = len(actions)

    all_obs = np.vstack(batch_obs)
    all_rewards = np.hstack(batch_returns).astype(float)

    # Refit the baseline on this batch; it is used for the next batch.
    regression_model = linear_model.LinearRegression()
    regression_model.fit(all_obs, all_rewards)

    avg_grad_log_sum /= grad_sample

    # Gradient ascent step on the expected return.
    weights += learning_rate * avg_grad_log_sum

print("%s %s" % (weights, current_score[-1]))

In [None]:
# Mean episode length over the most recent (up to) 100 episodes.
# np.mean avoids Python 2 integer division, which truncated the average,
# and divides by the actual number of scores rather than a fixed 100.
np.mean(current_score[-100:])

In [None]:
# Learning curve: episode length for every rollout, in play order.
# (The curve was previously plotted twice on the same axes.)
plt.plot(range(len(current_score)), current_score)
plt.xlabel("Episode")
plt.ylabel("Episode length (steps)")
plt.title("CartPole-v1 policy-gradient training")
plt.show()