In [19]:
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [14]:
def Q_func(features, params):
    """Linear action-value estimate: the dot product of features and params.

    Parameters
    ----------
    features : array-like
        Feature vector describing a (state, action) pair.
    params : array-like
        Weight vector with one entry per feature.

    Returns
    -------
    The value of ``np.dot(features, params)``.
    """
    # Previously this returned the constant 1, so every action had the same
    # value and the learned parameters never influenced action selection.
    return np.dot(features, params)

def features_func(state, action):
    """Build the feature representation of a (state, action) pair.

    The features are simply ``state + action``; with an array ``state`` and a
    scalar ``action`` this adds the action to every component of the state.
    """
    # NOTE(review): if `state` is a small unsigned-integer array the sum may
    # wrap around -- confirm the dtype delivered by the environment.
    shifted_state = state + action
    return shifted_state

def max_argmax(Q_func, features_func, state, actions_list, params=None):
    """Return the largest Q value in ``state`` and the action that attains it.

    Parameters
    ----------
    Q_func : callable
        Called as ``Q_func(features, params)``.
    features_func : callable
        Called as ``features_func(state, action)``.
    state
        The state in which the actions are evaluated.
    actions_list : sequence
        Candidate actions; must be non-empty and indexable.
    params : optional
        Parameters forwarded to ``Q_func``.  When omitted, the module-level
        variable ``params`` is used, which is what this function always did
        before the argument existed.

    Returns
    -------
    tuple
        ``(best_value, best_action)``.  On ties the first action in
        ``actions_list`` wins.

    Raises
    ------
    ValueError
        If ``actions_list`` is empty.
    """
    if params is None:
        # Backward compatibility: fall back to the global parameter vector
        # that the original implementation read implicitly.
        params = globals()["params"]

    if len(actions_list) == 0:
        raise ValueError("actions_list must contain at least one action")

    q_values = [
        Q_func(features_func(state, action), params) for action in actions_list
    ]

    # max() with a key returns the first maximal index, matching the previous
    # list.index(max(...)) behaviour without comparing values for equality.
    best_index = max(range(len(q_values)), key=q_values.__getitem__)

    return (q_values[best_index], actions_list[best_index])

def diff(reward, max_Q_value, Q_value, disc_factor):
    """Temporal-difference error of a single transition.

    Computes ``reward + disc_factor * max_Q_value - Q_value``: the discounted
    bootstrapped target minus the current estimate ``Q_value``.
    """
    td_target = reward + disc_factor * max_Q_value
    return td_target - Q_value

def update_params(params, features, difference, alpha):
    """Apply one update step to the parameters and return them.

    The step is ``alpha * difference * features``.  It is added with ``+=``,
    so a mutable ``params`` (e.g. a NumPy array) is modified in place and the
    very same object is returned.
    """
    step = alpha * difference * features
    params += step
    return params

In [22]:
# Fix NumPy's global random state so that the randomly initialised parameters
# (and any later np.random draw) are the same after Restart & Run All.
# NOTE(review): this does not seed the environment itself -- confirm whether
# the installed gym version needs a separate seeding call.
SEED = 0
np.random.seed(SEED)

env = gym.make('MsPacman-ram-v0')
state = env.reset()  # initialize the environment and keep the first observation

disc_factor = 0.9  # discounting factor
alpha = 0.1  # learning rate
actions_list = [0, 1, 2, 3, 4, 5, 6, 7, 8]
features_space_dim = state.shape[0]  # one feature per component of the state

# The Q function will be a linear combination of
# the features, and for each of them it has one
# parameter to be learned
params = np.random.randn(features_space_dim)

In [None]:
# Q-learning with a greedy policy.
#
# For every transition (state, action) -> (reward, next_state) the update is
#     difference = reward + disc_factor * max_a' Q(next_state, a') - Q(state, action)
#     params    += alpha * difference * features(state, action)
#
# Fixes compared with the previous version of this cell:
#   * Q_func is called as Q_func(features, params); it used to receive the
#     raw state and the action in place of the parameter vector.
#   * the bootstrap term is the best Q value of the state reached *after* the
#     step, and the update uses the features of the pair that was played.
#   * no bootstrapping from a terminal state.
#   * cumulative rewards are collected in a list (np.append in a loop copies
#     the whole array on every step).
#   * env.close() runs even if the loop raises.
N_STEPS = 1000      # upper bound on the number of environment steps
FRAME_DELAY = 0.03  # seconds to wait before rendering each frame

total_reward = 0
reward_history = [0.0]  # cumulative reward; 0 before the first step

# The first action is random; afterwards the greedy action is always taken.
action = actions_list[np.random.randint(len(actions_list))]

try:
    for t in range(N_STEPS):
        time.sleep(FRAME_DELAY)
        env.render()

        # Evaluate the pair about to be played before the environment moves on.
        features = features_func(state, action)
        Q_value = Q_func(features, params)

        next_state, reward, done, info = env.step(action)
        total_reward += reward
        reward_history.append(total_reward)

        if done:
            # Episode over: there is no future value to bootstrap from.
            max_Q_value, best_action = 0.0, action
        else:
            max_Q_value, best_action = max_argmax(
                Q_func, features_func, next_state, actions_list
            )

        # parameters update
        difference = diff(reward, max_Q_value, Q_value, disc_factor)
        params = update_params(params, features, difference, alpha)

        if done:  # episode finished
            break

        # 100%-greedy selection for the next step
        state, action = next_state, best_action
finally:
    env.close()

# Kept as a NumPy array under the original name for the cells that follow.
total_reward_over_time = np.array(reward_history)

In [None]:
# Cumulative reward collected after each environment step.
fig, ax = plt.subplots()
ax.plot(total_reward_over_time)
ax.set(title="Cumulative reward over time", xlabel="Step", ylabel="Total reward");