In [None]:
import gym
from matplotlib import animation
import matplotlib.pyplot as plt
import numpy as np

In [None]:
%matplotlib inline

In [None]:
# Create the CartPole-v1 environment once; record_cartpole_run below reads it
# through the module-level name ENV.
ENV = gym.make('CartPole-v1')
# Put the environment into a starting state. As the last expression of the
# cell, the value returned by reset() is what the notebook displays.
ENV.reset()

In [None]:
def initialize_random_weights(mean, std, n_weights=4):
    """Draw an initial weight vector from a normal distribution.

    Parameters
    ----------
    mean : float
        Mean of the normal distribution each weight is sampled from.
    std : float
        Standard deviation of that distribution (must be non-negative).
    n_weights : int, optional
        Number of weights to draw. Defaults to 4, the length the rest of
        this notebook uses for weight and observation vectors.

    Returns
    -------
    numpy.ndarray
        1-D array of shape ``(n_weights,)`` with independent samples.
    """
    return np.random.normal(mean, std, n_weights)

In [None]:
def sigmoid(weights, observation):
    """Return the logistic function of the dot product of two vectors.

    Parameters
    ----------
    weights : sequence of float
        Weight vector.
    observation : sequence of float
        Observation vector; must have the same length as ``weights``
        (``np.dot`` raises ``ValueError`` otherwise).

    Returns
    -------
    float
        ``1 / (1 + exp(-w.x))``, a value in ``[0, 1]``.
    """
    weighted_sum = np.dot(weights, observation)
    # Branch on the sign so np.exp only ever receives a non-positive
    # argument: the naive form overflows (RuntimeWarning) once
    # -weighted_sum exceeds roughly 709.
    if weighted_sum >= 0:
        return 1.0 / (1 + np.exp(-weighted_sum))
    exp_sum = np.exp(weighted_sum)
    return exp_sum / (1 + exp_sum)

def grad_log_sigmoid(weights, observation, action):
    """Gradient, with respect to the weights, of the log-probability of ``action``.

    The policy picks action 1 with probability ``sigmoid(weights, observation)``.
    For ``action == 1`` the result is ``observation * (1 - p)``; for any other
    action value it is ``-observation * p``.
    """
    prob_one = sigmoid(weights, observation)
    if action != 1:
        return - observation * prob_one
    return observation * (1 - prob_one)

def get_action(weights, observation):
    """Sample a binary action from the sigmoid policy.

    Returns 1 with probability ``sigmoid(weights, observation)`` and 0
    otherwise, using one draw from ``np.random.random()``.
    """
    p_one = sigmoid(weights, observation)
    draw = np.random.random()
    return 1 if draw <= p_one else 0

In [None]:
def record_cartpole_run(weights, max_steps=1000):
    """Play one episode in ``ENV`` with the sigmoid policy and record it.

    Written against the gym API in which ``ENV.reset()`` returns the
    observation and ``ENV.step(action)`` returns a 4-tuple
    ``(observation, reward, done, info)``.

    Parameters
    ----------
    weights : array-like
        Policy weights passed to ``get_action``.
    max_steps : int, optional
        Upper bound on the number of actions taken (default 1000, the value
        that used to be hard-coded).

    Returns
    -------
    (numpy.ndarray, list of int)
        The observations each action was chosen from, as a float array with
        4 columns and one row per action, and the list of actions taken.
        The observation returned by the final step is not included, so both
        outputs have the same length.
    """
    observation = ENV.reset()

    visited = []
    all_actions = []
    for _step in range(max_steps):
        # Keep a float copy of the state the action is chosen from. Collecting
        # rows in a list and stacking once at the end avoids re-copying the
        # whole history on every step, as a per-step np.vstack would.
        visited.append(np.array(observation, dtype=float))

        action = get_action(weights, observation)
        all_actions.append(action)

        observation, _reward, done, _info = ENV.step(action)
        if done:
            break

    # reshape keeps the (n, 4) shape even when no step was taken.
    return np.array(visited, dtype=float).reshape(-1, 4), all_actions

In [None]:
def get_grad_reward(weights, obs, actions, rewards):
    """Accumulate the reward-weighted log-probability gradient over a batch.

    Parameters
    ----------
    weights : array-like
        Current policy weights.
    obs : numpy.ndarray
        2-D array; row ``k`` is the observation action ``k`` was chosen from.
    actions : sequence
        Action taken at each step; its length sets the number of terms summed.
    rewards : sequence
        Scalar multiplying the gradient term of each step.

    Returns
    -------
    numpy.ndarray
        Sum over steps of ``grad_log_sigmoid(weights, obs[k], actions[k]) *
        rewards[k]``, with one entry per weight (all zeros for an empty batch).
    """
    grad_reward = np.zeros(len(weights))
    # range (not xrange) matches the rest of the notebook and exists on both
    # Python 2 and Python 3.
    for step in range(len(actions)):
        grad_reward += grad_log_sigmoid(weights, obs[step, :], actions[step]) * rewards[step]
    return grad_reward

In [None]:
# Hyper-parameters and bookkeeping for the policy-gradient training loop.
n_runs = 10 ** 3                     # number of episodes to play
memory_size = 50                     # most recent episodes used in every update
last_games = [None] * memory_size    # observations per remembered episode (None = empty slot)
last_actions = [None] * memory_size  # actions per remembered episode, same order as last_games

weights = initialize_random_weights(0, 1)
weights_cap = 100                    # value the squared weight norm is steered towards
beta = 0                             # multiplier on the norm term, adapted after every run
learning_rate = 10 ** (-3)
grad_clip = 5                        # per-component bound applied to the update direction
current_score = [0] * n_runs         # number of actions taken in each run
weights_norms = [0] * n_runs         # squared weight norm after each run

for i in range(n_runs):
    if i % 100 == 0:
        # Single pre-formatted string: prints the same text on Python 2 and 3.
        print("%s %s %s" % (i, weights, beta))

    obs, actions = record_cartpole_run(weights)
    current_score[i] = len(actions)
    # Fixed-size memory: drop the oldest episode, append the newest.
    last_games.pop(0)
    last_games.append(obs)
    last_actions.pop(0)
    last_actions.append(actions)

    # Build one batch from every remembered episode, stacking once per update.
    remembered = [(game, acts) for game, acts in zip(last_games, last_actions)
                  if game is not None]
    game_counter = len(remembered)
    all_obs = np.vstack([game for game, acts in remembered])
    # Per-step weight counts down within each episode: T-1, T-2, ..., 0.
    all_rewards = np.hstack([np.arange(len(game))[::-1]
                             for game, acts in remembered]).astype(float)
    all_actions = np.hstack([acts for game, acts in remembered]).astype(float)

    grad_reward = get_grad_reward(weights, all_obs, all_actions, all_rewards)

    # Step along the per-episode average gradient plus the norm term,
    # clipped component-wise to [-grad_clip, grad_clip].
    weights += learning_rate * np.clip(
        grad_reward / game_counter + 2 * beta * weights, -grad_clip, grad_clip)
    weights_norm = np.dot(weights, weights)
    weights_norms[i] = weights_norm
    # NOTE(review): beta is not clamped, so it increases (and can turn
    # positive) whenever the squared norm is below weights_cap, which pushes
    # the weights outwards -- confirm this is intended.
    beta -= learning_rate * (weights_norm - weights_cap)


print("%s %s" % (weights, current_score[-1]))

In [None]:
# Mean score over the most recent (up to) 100 runs. np.mean always divides in
# floating point by the actual number of entries, whereas sum(...) / 100
# floors the result under Python 2 integer division.
np.mean(current_score[-100:])

In [None]:
# Two stacked panels: score per run on top, squared weight norm below.
fig, (score_ax, norm_ax) = plt.subplots(2, 1)

score_ax.plot(current_score, label="reward")
score_ax.set_ylabel("episode length (steps)")
score_ax.legend()

norm_ax.plot(weights_norms, label="weights_norm")
norm_ax.set_xlabel("run")
norm_ax.set_ylabel("squared weight norm")
norm_ax.legend()

# Keep the axis labels of the two panels from overlapping.
fig.tight_layout()
plt.show()