# In [3]:
import neuralnetworksA4 as nn
import numpy as np
import matplotlib.pyplot as plt
import random
from IPython.display import display, clear_output
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import copy

# Discrete actions available to the agent: -1, 0 or +1 (shown as "L", "0", "R"
# in the policy plot).
validActions = np.array((-1, 0, 1))

def initialState(trial):
    """Draw a random start state, re-drawing the goal on the first trial.

    When ``trial`` is 0 the module-level ``goal`` is replaced by a random
    integer from 1 to 10 (inclusive); for any other ``trial`` the existing
    global ``goal`` is reused as is.

    Returns a 3-element array ``[position, velocity, goal]`` where position
    is drawn from [0, 10) and velocity from (-1.5, 1.5].
    """
    global goal
    isFirstTrial = (trial == 0)
    if isFirstTrial:
        goal = random.randint(1, 10)
    # Position is drawn before velocity so the random stream is consumed in
    # a fixed order.
    position = 10 * np.random.random_sample()
    velocity = 3 * (0.5 - np.random.random_sample())
    return np.array([position, velocity, goal])

def nextState(s, a, goal):
    """Advance the state by one Euler integration step.

    Args:
        s: sequence or array ``[position, velocity, goal]``. It is copied,
            never modified in place.
        a: action, one of -1, 0 or 1.
        goal: value written into slot 2 of the returned state.

    Returns:
        A new float ndarray ``[position, velocity, goal]``. Position is
        bounded to [0, 10]; when a bound is crossed the position is set to
        that bound and the velocity to 0.
    """
    # Always work on a float ndarray copy. A list holding an integer goal
    # used to turn into an *integer* array at the wall clamp below, after
    # which every in-place float update was truncated and the velocity
    # stayed stuck at 0.
    s = np.array(s, dtype=float)
    deltaT = 0.1                           # Euler integration time step
    s[0] += deltaT * s[1]                  # Update position (uses the old velocity)
    s[1] += deltaT * (2 * a - 0.2 * s[1])  # Update velocity. Includes friction
    s[2] = goal
    if s[0] < 0:        # Bound next position. If at limits, set velocity to 0.
        s = np.array([0.0, 0.0, s[2]])
    elif s[0] > 10:
        s = np.array([10.0, 0.0, s[2]])
    return s

def reinforcement(s):  # s is new state
    """Reinforcement for arriving in state ``s``.

    Returns 0 when the position ``s[0]`` is strictly less than 1 unit away
    from the goal ``s[2]``, and -0.1 otherwise.
    """
    distanceToGoal = abs(s[0] - s[2])
    if distanceToGoal < 1:
        return 0
    return -0.1

def policy(qnet, state, epsilon):
    """Epsilon-greedy action selection over ``validActions``.

    With probability ``epsilon`` a uniformly random entry of ``validActions``
    is returned. Otherwise ``qnet.use`` is evaluated once for ``state``
    paired with every valid action, and the action with the largest output
    is returned.
    """
    nActions = validActions.shape[0]
    if np.random.rand(1) < epsilon:
        # Explore: pick any action index at random.
        return validActions[np.random.randint(nActions)]
    # Exploit: one input row per candidate action, each row being the state
    # followed by that action.
    repeatedState = np.tile(state, (nActions, 1))
    actionColumn = validActions.reshape((-1, 1))
    qValues = qnet.use(np.hstack((repeatedState, actionColumn)))
    return validActions[np.argmax(qValues)]


def makeSamples(qnet, nStepsPerStart, trial, inval):
    """Roll out one trajectory and return its SARSA transitions.

    The start state comes from ``initialState(trial)``; actions are chosen
    by ``policy`` using the module-level ``epsilon``. The current ``goal``
    is printed once per call. ``inval`` is accepted but not used.

    Returns an array with one row per step, laid out as
    ``state + [action, reinforcement] + next state + [next action]``.
    """
    global goal
    state = initialState(trial)
    act = policy(qnet, state, epsilon)
    print(goal)
    transitions = []
    for _ in range(nStepsPerStart):
        newState = nextState(state, act, goal)
        r = reinforcement(newState)
        newAct = policy(qnet, newState, epsilon)
        # SARSA tuple: (s, a, r, s', a')
        row = state.tolist() + [act, r] + newState.tolist() + [newAct]
        transitions.append(row)
        # The next step starts from what was just reached and chosen.
        state, act = newState, newAct
    return np.array(transitions)
    

def plotStatus(qnet, X, R, trial, epsilonTrace, rtrace):
    """Draw the training-status panels on the current figure (4x3 grid).

    Fills subplots 1-8, 10 and 11. Reads the module-level ``goal`` and
    ``validActions``.

    Args:
        qnet: Q network; its ``use`` and ``draw`` methods are called.
        X: samples of the latest trial; column 0 is plotted as position and
            column 1 as velocity.
        R: accepted for interface compatibility; not used here.
        trial: index of the current trial; traces are plotted up to and
            including this index.
        epsilonTrace: per-trial epsilon values.
        rtrace: per-trial mean reinforcement values.
    """
    # Axis labels that contain LaTeX use raw strings: "\e" and "\d" are not
    # valid Python escape sequences and trigger warnings in non-raw literals.
    # The resulting text is unchanged.

    # Panel 1: exploration probability so far.
    plt.subplot(4, 3, 1)
    plt.plot(epsilonTrace[:trial + 1])
    plt.ylabel(r"Random Action Probability ($\epsilon$)")
    plt.ylim(0, 1)

    # Panel 2: position over the steps of X, with the goal as a dashed line.
    plt.subplot(4, 3, 2)
    plt.plot(X[:, 0])
    plt.plot([0, X.shape[0]], [goal, goal], '--', alpha=0.5, lw=5)
    plt.ylabel("$x$")
    plt.ylim(-1, 11)

    # Panel 3: greedy action at each integer position 0..10 with zero velocity.
    qs = qnet.use(np.array([[s, 0, goal, a] for a in validActions for s in range(11)]))
    plt.subplot(4, 3, 3)
    acts = ["L", "0", "R"]
    actsiByState = np.argmax(qs.reshape((len(validActions), -1)), axis=0)
    for i in range(11):
        plt.text(i, 0, acts[actsiByState[i]])
    # Axis limits do not depend on the loop variable, so set them once.
    plt.xlim(-1, 11)
    plt.ylim(-1, 1)
    plt.text(2, 0.2, "Policy for Zero Velocity")
    plt.axis("off")

    # Panel 4: mean reinforcement per trial plus a 20-trial binned average.
    plt.subplot(4, 3, 4)
    plt.plot(rtrace[:trial + 1], alpha=0.5)
    binSize = 20
    if trial + 1 > binSize:
        nBins = trial // binSize
        smoothed = np.mean(rtrace[:nBins * binSize].reshape((nBins, binSize)), axis=1)
        plt.plot(np.arange(1, 1 + nBins) * binSize, smoothed)
    plt.ylabel("Mean reinforcement")

    # Panel 5: phase plot (position vs. velocity) of X; the start point is
    # marked and the band within 1 unit of the goal is shaded.
    plt.subplot(4, 3, 5)
    plt.plot(X[:, 0], X[:, 1])
    plt.plot(X[0, 0], X[0, 1], 'o')
    plt.xlabel("$x$")
    plt.ylabel(r"$\dot{x}$")
    plt.fill_between([goal - 1, goal + 1], [-5, -5], [5, 5], color="red", alpha=0.3)
    plt.xlim(-1, 11)
    plt.ylim(-5, 5)

    # Panel 6: network diagram.
    # NOTE(review): the network is queried with four inputs (position,
    # velocity, goal, action) but only three input names are passed here --
    # confirm how draw() handles the mismatch.
    plt.subplot(4, 3, 6)
    qnet.draw(["$x$", r"$\dot{x}$", "$a$"], ["Q"])

    # Panel 7: max over actions of Q on a 20x20 position/velocity grid.
    plt.subplot(4, 3, 7)
    n = 20
    positions = np.linspace(0, 10, n)
    velocities = np.linspace(-5, 5, n)
    xs, ys = np.meshgrid(positions, velocities)
    gridInputs = np.array([[x, v, goal, a]
                           for a in validActions
                           for x, v in zip(xs.flat, ys.flat)])
    qs = qnet.use(gridInputs)
    qs = qs.reshape((len(validActions), -1)).T   # one row per grid point, one column per action
    qsmax = np.max(qs, axis=1).reshape(xs.shape)
    cs = plt.contourf(xs, ys, qsmax)
    plt.colorbar(cs)
    plt.xlabel("$x$")
    plt.ylabel(r"$\dot{x}$")
    plt.title("Max Q")

    # Panel 8: greedy action on the same grid.
    plt.subplot(4, 3, 8)
    acts = np.array(validActions)[np.argmax(qs, axis=1)].reshape(xs.shape)
    cs = plt.contourf(xs, ys, acts, [-2, -0.5, 0.5, 2])
    plt.colorbar(cs)
    plt.xlabel("$x$")
    plt.ylabel(r"$\dot{x}$")
    plt.title("Actions")

    # Panels 10 and 11: 3-D surfaces. The axes are requested through
    # projection="3d" because recent Matplotlib releases no longer attach a
    # directly constructed Axes3D(fig, rect=...) to the figure, which leaves
    # the surface invisible. On old releases the file-level
    # mpl_toolkits.mplot3d import registers the projection.
    ax = plt.subplot(4, 3, 10, projection="3d")
    ax.plot_surface(xs, ys, qsmax, cstride=1, rstride=1, cmap=cm.viridis, linewidth=0)
    ax.set_xlabel("$x$")
    ax.set_ylabel(r"$\dot{x}$")
    ax.set_title("Max Q")

    ax = plt.subplot(4, 3, 11, projection="3d")
    ax.plot_surface(xs, ys, acts, cstride=1, rstride=1, cmap=cm.viridis, linewidth=0)
    ax.set_xlabel("$x$")
    ax.set_ylabel(r"$\dot{x}$")
    ax.set_title("Action")
    
    
def testIt(qnet, nTrials, nStepsPerTrial, inval):
    """Plot greedy (epsilon = 0) trajectories in subplot 12 of the 4x3 grid.

    ``nTrials`` start positions are spread evenly over [0, 10], each with
    zero velocity and the module-level ``goal``. Every start is simulated
    for ``nStepsPerTrial`` steps and drawn as position vs. velocity.
    ``inval`` is accepted but not used.
    """
    plt.subplot(4, 3, 12)
    for x in np.linspace(0, 10, nTrials):
        # Start from a float array. A list holding the integer goal can turn
        # into an integer array when the position is clamped at a wall, and
        # in-place float updates on an integer array are truncated, which
        # freezes the velocity at 0.
        s = np.array([x, 0.0, goal], dtype=float)  ## 0 velocity
        xtrace = np.zeros((nStepsPerTrial, 3))
        for step in range(nStepsPerTrial):
            a = policy(qnet, s, 0.0)  # epsilon = 0
            s = nextState(s, a, goal)
            xtrace[step, :] = s
        plt.plot(xtrace[:, 0], xtrace[:, 1])

    # Decoration does not depend on the trajectory, so it is applied once
    # instead of once per start position. The goal marker is drawn a single
    # time rather than stacked repeatedly on top of itself.
    plt.plot([goal, goal], [-5, 5], '--', alpha=0.5, lw=5)
    plt.xlim(-1, 11)
    plt.ylim(-5, 5)
    # Raw strings: "\d" and "\e" are not valid escape sequences in ordinary
    # string literals. The rendered text is unchanged.
    plt.ylabel(r'$\dot{x}$')
    plt.xlabel('$x$')
    plt.title(r'State Trajectories for $\epsilon=0$ and Goal = %s' % (goal))
        
# Hyperparameters and Q-network setup.
# NOTE(review): this section looks like a truncated copy of the section that
# follows it: the loop below only collects samples and never trains or plots.
# Confirm whether it can be dropped.
gamma = 0.999          # discount applied to the next-step Q value in the training target
nTrials = 400          # number of trials per run
nStepsPerTrial = 700   # steps collected by makeSamples in each trial
nSCGIterations = 40    # passed as nIterations to qnet.train

nh = [4,4]
# Four network inputs: position, velocity, goal, action (see how policy builds
# its input rows).
# NOTE(review): the error text at the end of this file complains about missing
# 'nhs' and 'no' arguments -- confirm the constructor signature and whether
# the first and last arguments should be lists.
qnet = nn.NeuralNetwork([4], nh , [1])  
# presumably one (min, max) pair per network input, in input order -- verify
# against the neuralnetworksA4 API
qnet.setInputRanges(( (0, 10), (-3, 3), (0,10),(-1,1)))


for inval in range(0,4,1):
    epsilon = 1
    finalEpsilon = 0.03
    # Per-trial multiplier: applying it nTrials times takes epsilon from 1 to finalEpsilon.
    epsilonDecay = np.exp(np.log(finalEpsilon)/(nTrials))  # to produce this final value
    epsilonTrace = np.zeros(nTrials)
    rtrace = np.zeros(nTrials)
    fig = plt.figure(figsize=(20,20))
    for trial in range(nTrials):
        # Collect nStepsPerTrial SARSA samples using the current epsilon
        samples = makeSamples(qnet, nStepsPerTrial,trial,inval)
        ns = 3  # number of state components per sample
        
        
# Main training run: build the Q network, then alternate between collecting
# SARSA samples with an epsilon-greedy policy and fitting the network to the
# one-step targets R + gamma * Q(next state, next action).
gamma = 0.999          # discount applied to the next-step Q value
nTrials = 400          # number of trials per run
nStepsPerTrial = 700   # steps collected by makeSamples in each trial
nSCGIterations = 40    # passed as nIterations to qnet.train

nh = [4,4]
# Four network inputs: position, velocity, goal, action (see how policy builds
# its input rows).
# NOTE(review): confirm the constructor signature and whether the first and
# last arguments should be lists.
qnet = nn.NeuralNetwork([4],nh,[1])  
# presumably one (min, max) pair per network input, in input order -- verify
# against the neuralnetworksA4 API
qnet.setInputRanges(( (0, 10), (-3, 3), (0,10),(-1,1)))


for inval in range(0,1,1):  # single pass; inval is only forwarded to makeSamples/testIt
    epsilon = 1
    finalEpsilon = 0.03
    # Per-trial multiplier: applying it nTrials times takes epsilon from 1 to finalEpsilon.
    epsilonDecay = np.exp(np.log(finalEpsilon)/(nTrials))  # to produce this final value
    epsilonTrace = np.zeros(nTrials)
    rtrace = np.zeros(nTrials)
    fig = plt.figure(figsize=(20,20))
    for trial in range(nTrials):
        # Collect nStepsPerTrial SARSA samples using the current epsilon
        samples = makeSamples(qnet, nStepsPerTrial,trial,inval)
        # Sample row layout: ns state values, na action values, 1 reinforcement,
        # then the next state and next action.
        ns = 3
        na = 1
        X = samples[:, :ns+na]             # network input: state and action
        R = samples[:, ns+na:ns+na+1]      # reinforcement, kept as a column
        nextX = samples[:, ns+na+1:]       # next state and next action
        nextQ = qnet.use(nextX)

        # Fit Q(s, a) towards the one-step SARSA target.
        qnet.train(X, R + gamma * nextQ, nIterations = nSCGIterations)

        # Decay epsilon
        epsilon *= epsilonDecay

        # Rest is for plotting
        epsilonTrace[trial] = epsilon
        rtrace[trial] = np.mean(R)

        # Refresh the figure roughly ten times per run and on the final trial.
        if trial % (nTrials//10) == 0 or trial == nTrials-1:
            plt.clf()
            plotStatus(qnet, X, R, trial,epsilonTrace,rtrace)
            testIt(qnet,10,500,inval)
            clear_output(wait=False)
            display(fig);
            plt.pause(0.01)

        # print('Trial',trial,'mean R',np.mean(R))
    clear_output(wait=False)

        
# NOTE(review): these statements mirror the body of the training loop above
# (without the plotStatus call) but sit at module level, so they run once
# after the loop using whatever samples, ns, trial, epsilon and inval were
# left behind. This looks like a de-indented paste of the loop body --
# confirm whether it is intended.
na = 1
X = samples[:, :ns+na]             # state and action columns
R = samples[:, ns+na:ns+na+1]      # reinforcement column
nextX = samples[:, ns+na+1:]       # next state and next action columns
nextQ = qnet.use(nextX)

# One more fit towards the one-step target R + gamma * nextQ.
qnet.train(X, R + gamma * nextQ, nIterations = nSCGIterations)

# Decay epsilon
epsilon *= epsilonDecay

# Rest is for plotting
epsilonTrace[trial] = epsilon
rtrace[trial] = np.mean(R)

if trial % (nTrials//10) == 0 or trial == nTrials-1:
            plt.clf()
            testIt(qnet,10,500,inval)
            clear_output(wait=False)
            display(fig);
            plt.pause(0.01)

        # print('Trial',trial,'mean R',np.mean(R))
clear_output(wait=False)

        

# Notebook output: TypeError: __init__() missing 2 required positional arguments: 'nhs' and 'no'