In [None]:
!pip install gymnasium scipy

In [None]:
import yaml
import random
import numpy as np
import gymnasium as gym
from scipy.stats import bernoulli
from gymnasium import Env, register
from gymnasium.spaces import  Discrete
from matplotlib import pyplot as plt

# Problem 3

In [None]:
class RandomMaze(Env):
    """Stochastic 3x4 grid world with 12 row-major states.

    Layout of the state indices::

        0  1  2  3
        4  5  6  7
        8  9 10 11

    States 3 and 7 are terminal (rewards +1 and -1), state 5 is a wall, and
    every other transition costs ``livingCost``. A requested move is executed
    with probability ``goInDirection``; otherwise the agent slips to one of
    the two neighbouring action codes with equal probability.

    ``params`` must provide the keys ``"startState"``, ``"goInDirection"``
    and ``"livingCost"``.
    """

    def __init__(self,params):
        # Action codes: left=0, up=1, right=2, down=3.
        self.action_space = Discrete(4)
        self.observation_space = Discrete(12)
        self.startState = params["startState"]
        self.state = self.startState
        # Probability of moving as requested; the remainder is split evenly
        # between the two neighbouring action codes.
        self.goInDirection = params["goInDirection"]
        self.goOrthogonal = (1-self.goInDirection)/2
        self.livingCost = params["livingCost"]

    def step(self,action):
        """Apply ``action`` (possibly perturbed) and return the 5-tuple
        ``(state, reward, done, truncated, info)``."""
        # Draws come from the global NumPy generator: one uniform sample per
        # step, plus one coin flip only when the move slips.
        if np.random.random() > self.goInDirection:
            turn = -1 if np.random.binomial(1, .5) == 0 else 1
            action = (action + turn) % 4

        # (action code, state offset, states from which that move is blocked
        # by a border, the wall at 5, or because the state is terminal).
        move_rules = (
            (0, -1, (0, 4, 8, 6, 3, 7, 5)),
            (2, +1, (4, 11, 3, 7, 5)),
            (1, -4, (0, 1, 2, 9, 3, 7, 5)),
            (3, +4, (1, 8, 9, 10, 11, 3, 7, 5)),
        )
        for code, offset, blocked in move_rules:
            if action == code:
                if self.state not in blocked:
                    self.state += offset
                break

        # Terminal states pay +1 / -1; everything else pays the living cost.
        done = self.state == 3 or self.state == 7
        if self.state == 3:
            reward = 1
        elif self.state == 7:
            reward = -1
        else:
            reward = -self.livingCost

        assert self.state in range(12), "Hey check your step"
        return self.state, reward, done, False, {}

    def render(self):
        """Rendering is not implemented."""
        pass

    def currState(self):
        """Return the current state index."""
        return self.state

    def  reset(self, seed=None, options=None):
        """Put the agent back on ``startState`` and return ``(state, info)``."""
        super().reset(seed=seed)
        self.state = self.startState
        return self.state, {}


In [None]:
# Register the maze under the id 'RME-v0' so it can be built with gym.make().
register(id='RME-v0', entry_point=RandomMaze)

In [None]:
# Experiment configuration: seed both global RNGs, describe the MDP, build the
# environment and set the notebook-wide defaults.
np.random.seed(10)
random.seed(10)
params={
    "gamma": 0.99,           # discount factor
    "goInDirection": 0.8,    # probability that a move goes as requested
    "livingCost": 0.04,      # cost charged on every non-terminal transition
    "startState": 8,
    "theta": pow(10,-4),
    "S+": [0,1,2,3,4,6,7,8,9,10,11],   # all states except the wall (5)
    "S": [0,1,2,4,6,8,9,10,11],        # non-terminal states
    "A": [0,1,2,3],
}
gamma=params["gamma"]
env=gym.make('RME-v0',params=params)
env.reset(seed=10)
done= False
noEpisodes=500
n=3
lamda=0.3

In [None]:
# Deterministic policy, one action code per state (left=0, up=1, right=2, down=3).
# Entries 3, 5 and 7 are placeholders: those states are terminal or the wall.
pi=np.array([
    2, 2, 2, 0,   # states 0-3
    1, 0, 1, 0,   # states 4-7
    1, 2, 1, 0,   # states 8-11
], dtype=float)

#### Question 3(1)

In [None]:
def generateTrajectory(env, policy,maxSteps):
    '''
    Generates one sample trajectory to be used for learning the value function.

    The caller is expected to have reset ``env``; the rollout starts from
    whatever state the environment is currently in.

    env: environment exposing ``step`` and, on ``env.unwrapped``, ``currState``.
    policy: array indexed by state giving the action to take.
    maxSteps: maximum number of transitions to simulate.

    Returns a float array with one row per transition,
    ``[state, action, next_state, reward]``. If the episode has not terminated
    after ``maxSteps`` transitions the partial trajectory is discarded and an
    empty array is returned.
    '''
    # gym.make() returns the maze inside wrappers, and Gymnasium >= 1.0 no
    # longer forwards custom attributes such as currState through them, so the
    # state is read from the unwrapped environment.
    core_env = env.unwrapped

    list_exp_tuples=[]
    done=False
    for _ in range(maxSteps):
        state = core_env.currState()
        action = policy[state]
        # The policy is stored as floats; the action space is Discrete, so an
        # int is handed to step().
        end_state, reward, done, truncated, info = env.step(int(action))
        list_exp_tuples.append([state, action, end_state, reward])
        if done:
            break
    if not done:
        list_exp_tuples=[]
    return np.array(list_exp_tuples, dtype=float)
# Show one sampled trajectory under policy pi (at most 100 steps), with both
# global RNGs and the environment seeded with 12.
random.seed(12)
np.random.seed(12)
env.reset(seed=12)
sample_trajectory = generateTrajectory(env,pi,100)
print(sample_trajectory)

#### Question 3(2)

In [None]:
def decayAlpha(initialValue, finalValue, maxSteps, decayType):
    '''
    Calculates a decayed alpha (step-size) value for each timestep.

    initialValue: step size at timestep 0.
    finalValue: step size at the last timestep (``maxSteps - 1``).
    maxSteps: number of timesteps in the schedule.
    decayType: "Linear" for a straight-line schedule, "Exp" for a geometric one.

    Returns an array of length ``maxSteps``. Both schedules start exactly at
    ``initialValue`` and end at ``finalValue``.

    Raises ValueError if ``decayType`` is not "Linear" or "Exp".
    '''
    if decayType not in ("Linear", "Exp"):
        raise ValueError(f"Unknown decayType {decayType!r}; expected 'Linear' or 'Exp'")
    if maxSteps < 1:
        # Empty schedule for 0; np.zeros rejects negative lengths.
        return np.zeros(maxSteps)
    if maxSteps == 1:
        # A single step cannot decay; avoids dividing by (maxSteps - 1) == 0.
        return np.array([initialValue], dtype=float)

    # Fraction of the schedule completed at each timestep: 0 at the first
    # step, exactly 1 at the last one.
    progress = np.arange(maxSteps) / (maxSteps - 1)
    if decayType == "Linear":
        return initialValue + (finalValue - initialValue) * progress
    # Geometric interpolation: a constant ratio between consecutive steps.
    return initialValue * np.power(finalValue / initialValue, progress)
# Compare the two decay schedules. Timesteps go on the x-axis and the step
# size on the y-axis, matching the axis labels below.
lin_x=decayAlpha(0.5,0.01,250,"Linear")
exp_x=decayAlpha(0.5,0.01,250,"Exp")
timesteps=np.arange(250)
plt.plot(timesteps,lin_x,color='red',linestyle='-',label='Linear')
plt.plot(timesteps,exp_x,color='blue',linestyle='--',label='Exponential')
plt.xlabel('Timesteps')
plt.ylabel('Step Size Value')
plt.title('Strategies for decaying Step Size parameter')
plt.legend()
plt.savefig('Decay Step Size.pdf')

In [None]:
# Per-episode learning rates for 500 episodes: exponential decay from 0.5 to
# 0.01 over the first 250, then held at 0.01 for the remaining 250.
# Built with one concatenate instead of 250 np.append calls, each of which
# copies the whole array.
alpha=np.concatenate([decayAlpha(0.5,0.01,250,"Exp"), np.full(250, 0.01)])

#### Question 3(3)

In [None]:
def MonteCarloPrediction(env, policy, gamma, alpha, maxSteps, noEpisodes, firstVisit, trackState=4):
    '''
    Monte Carlo prediction of the state-value function of ``policy``.

    env: The environment object.
    policy: Policy to follow (array indexed by state).
    gamma: Discount factor.
    alpha: Array of learning rates, indexed by episode.
    maxSteps: Max steps to simulate per episode.
    noEpisodes: Number of episodes to run.
    firstVisit: Truthy for first-visit MC (FVMC), falsy for every-visit MC (EVMC).
    trackState: State whose return is recorded in ``Gt`` (default 4).

    Returns ``(v, v_r, Gt)``:
    v: Final value function estimate.
    v_r: Value function estimate after each episode, shape (noEpisodes, nStates).
    Gt: For each episode, the return used as the update target for
        ``trackState`` (the last such update when EVMC visits it several
        times). The entry stays 0 for episodes that never update that state.
    '''
    n_states = env.observation_space.n
    v = np.zeros(n_states)
    v_r = np.zeros((noEpisodes, n_states))
    Gt = np.zeros(noEpisodes)
    for e in range(noEpisodes):
        # Plain reset: passing the same seed on every episode would make all
        # episodes identical for any environment that honours the seed.
        env.reset()
        # Rows are [state, action, next_state, reward]; the array is empty
        # when the episode did not terminate within maxSteps.
        t = generateTrajectory(env, policy, maxSteps)
        horizon = len(t)

        # Discounted return from every time step, computed in one backward
        # pass (G_i = r_i + gamma * G_{i+1}) instead of re-summing the tail.
        returns = np.zeros(horizon)
        running = 0.0
        for i in range(horizon - 1, -1, -1):
            running = t[i][3] + gamma * running
            returns[i] = running

        visited = np.zeros(n_states, dtype=bool)
        for i in range(horizon):
            s = int(t[i][0])
            if firstVisit and visited[s]:
                continue
            G = returns[i]
            if s == trackState:
                Gt[e] = G
            v[s] = v[s] + alpha[e] * (G - v[s])
            visited[s] = True

        v_r[e] = v
    return v, v_r, Gt

In [None]:
# Run Monte Carlo prediction with firstVisit=True: maxSteps=1000, noEpisodes=500.
v,v_r,Gt = MonteCarloPrediction(env, pi, gamma , alpha, 1000, 500, True)

#### Question 3(4)

In [None]:
def TemporalDifferencePrediction(env, policy, gamma , alpha, noEpisodes, trackState=4):
    '''
    One-step TD prediction of the state-value function of ``policy``.

    env: The environment object.
    policy: Policy to follow (array indexed by state).
    gamma: Discount factor.
    alpha: Array of learning rates, indexed by episode.
    noEpisodes: Number of episodes to run.
    trackState: State whose TD target is recorded in ``Gt`` (default 4).

    Returns ``(v, v_r, Gt)``: the final estimate, the estimate after each
    episode, and per episode the last TD target computed for ``trackState``
    (0 when that state was not visited in the episode).
    '''
    n_states = env.observation_space.n
    v = np.zeros(n_states)
    v_r = np.zeros((noEpisodes, n_states))
    Gt = np.zeros(noEpisodes)
    for e in range(noEpisodes):
        # reset() returns (state, info); the termination flag must start out
        # False explicitly rather than borrowing the truthiness of info.
        s, _ = env.reset()
        done = False
        while not done:
            next_state, reward, done, truncated, info = env.step(int(policy[s]))
            # A terminal successor contributes no bootstrap value.
            td_target = reward if done else reward + gamma*v[next_state]
            if s == trackState:
                Gt[e] = td_target
            v[s] = v[s] + alpha[e]*(td_target - v[s])
            s = next_state
        v_r[e] = v
    return v, v_r, Gt

In [None]:
# Run one-step TD prediction for 500 episodes under policy pi.
v,v_r,Gt = TemporalDifferencePrediction(env, pi, gamma , alpha, 500)

#### Question 3(5)

In [None]:
def generateNStepTrajectory(env, policy,n):
    '''
    Advance ``env`` by at most ``n`` transitions under ``policy``, starting
    from its current state and stopping early when the episode terminates.

    Returns ``(path, state, done)``: ``path`` is a float array with one row
    ``[state, action, next_state, reward]`` per transition, ``state`` is the
    environment state after the last transition and ``done`` is the
    termination flag of the last transition.
    '''
    # Read the state from the unwrapped environment: Gymnasium >= 1.0 wrappers
    # do not forward custom attributes such as currState.
    core_env = env.unwrapped

    list_exp_tuples=[]
    done=False
    for _ in range(n):
        state = core_env.currState()
        action = policy[state]
        # The policy is stored as floats; step() receives an int action.
        end_state, reward, done, truncated, info = env.step(int(action))
        list_exp_tuples.append([state, action, end_state, reward])
        if done:
            break
    return np.array(list_exp_tuples, dtype=float), core_env.currState(), done

In [None]:
def calculateReturn(gamma,path):
    '''
    Discounted sum of the rewards along ``path``.

    Each element of ``path`` is indexable and holds its reward at index 3; the
    k-th reward is weighted by ``gamma**k``. An empty path gives 0.
    '''
    return sum(step[3]*pow(gamma,k) for k, step in enumerate(path))

In [None]:
def nStepTD(env,pi,gamma,alpha,n,noEpisodes):
    '''
    n-step TD prediction of the state-value function of policy ``pi``.

    env: The environment object.
    pi: Policy to follow (array indexed by state).
    gamma: Discount factor.
    alpha: Array of learning rates, indexed by episode.
    n: Number of rewards in each target before bootstrapping (must be >= 1).
    noEpisodes: Number of episodes to run.

    Every visited state is updated once it is ``n`` steps in the past (or when
    the episode ends), using the next ``n`` rewards plus the discounted
    current estimate of the state reached after them.

    Returns ``(v, v_r)``: the final estimate and the estimate after each episode.
    Raises ValueError if ``n`` is smaller than 1.
    '''
    if n < 1:
        raise ValueError("n must be at least 1")

    n_states = env.observation_space.n
    v = np.zeros(n_states)
    v_r = np.zeros((noEpisodes, n_states))
    for e in range(noEpisodes):
        s, _ = env.reset()
        states = [s]      # states[k] is the state at time k
        rewards = [0.0]   # rewards[k] is the reward received on entering states[k]
        T = float('inf')  # episode length, unknown until termination
        t = 0
        while True:
            if t < T:
                s, reward, done, truncated, info = env.step(int(pi[states[t]]))
                states.append(s)
                rewards.append(reward)
                if done:
                    T = t + 1
            # tau is the time step whose state is updated at this iteration.
            tau = t - n + 1
            if tau >= 0:
                last = min(tau + n, T)
                target = sum(pow(gamma, k - tau - 1) * rewards[k]
                             for k in range(tau + 1, last + 1))
                # Bootstrap only when the n-step window ends before termination.
                if tau + n < T:
                    target += pow(gamma, n) * v[states[tau + n]]
                v[states[tau]] += alpha[e] * (target - v[states[tau]])
            if tau == T - 1:
                break
            t += 1
        v_r[e] = v
    return v, v_r

In [None]:
# Run n-step TD with n=3 for 500 episodes under policy pi.
v, v_r = nStepTD(env,pi,gamma,alpha,3,500)

#### Question 3(6)

In [None]:
def TDlambda(env, pi, gamma, alpha,lam, noEpisodes, params):
    '''
    TD(lambda) prediction of the state-value function of policy ``pi`` using
    accumulating eligibility traces.

    env: The environment object.
    pi: Policy to follow (array indexed by state).
    gamma: Discount factor.
    alpha: Array of learning rates, indexed by episode.
    lam: Trace-decay parameter lambda.
    noEpisodes: Number of episodes to run.
    params: Dict whose ``"S"`` entry lists the states that are updated.

    Returns ``(v, v_r, eligibility_records)``: the final estimate, the estimate
    after each episode, and a dict mapping each episode index to the list of
    eligibility vectors recorded after every step (after the decay is applied).
    '''
    n_states = env.observation_space.n
    v = np.zeros(n_states)
    v_r = np.zeros((noEpisodes, n_states))
    # Index array so all listed states are updated in one vectorised step.
    updatable = np.asarray(params["S"], dtype=int)
    eligibility_records = {}
    for e in range(noEpisodes):
        eligibility_records_for_episode = []
        # reset() returns (state, info); done must start False explicitly
        # instead of relying on info being an empty (falsy) dict.
        s, _ = env.reset()
        done = False
        elig = np.zeros(n_states)  # eligibility vector, cleared every episode
        while not done:
            next_state, reward, done, truncated, info = env.step(int(pi[s]))
            # A terminal successor contributes no bootstrap value.
            td_target = reward if done else reward + gamma*v[next_state]
            td_error = td_target - v[s]
            elig[s] += 1
            # Credit every listed state in proportion to its trace, then decay.
            v[updatable] += alpha[e]*td_error*elig[updatable]
            elig[updatable] *= gamma*lam
            s = next_state
            eligibility_records_for_episode.append(elig.tolist())
        eligibility_records[e] = eligibility_records_for_episode
        v_r[e] = v
    return v, v_r, eligibility_records

In [None]:
# Run TD(lambda) with lam=0.2 for 500 episodes under policy pi.
value,value_record, eligibility = TDlambda(env, pi, gamma, alpha, 0.2, 500, params)

#### Question 3(7)

True Values:

> State 0 - 5/6
> 
> State 1 - 4/6
> 
> State 2 - 3/6
> 
> State 4 - 2/6
> 
> State 6 - 1/6
> 
> State 8 - 1/6
> 
> State 9 - 1/6
> 
> State 10 - 1/6
> 
> State 11 - 1/6

#### Question 3(8)

In [None]:
# Q8: FVMC learning curves. Each state's estimate (solid) is drawn next to
# the true value listed for it in Question 3(7) (dashed, same colour).
np.random.seed(10)
random.seed(10)
v, v_r2, Gt = MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 1)
x = np.arange(0, 500)
o = np.ones(500)

f = plt.figure(figsize=(20, 10))

# (state, true value, colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label="State " + str(state_id))
    plt.plot(x, true_value*o, color=colour, linestyle="--", label="State " + str(state_id) + " True")
plt.legend()

plt.title("FVMC estimates through time vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.savefig("problem 3, Q-8.pdf")

plt.show()
plt.close()

#### Question 3(9)

In [None]:
# Q9: EVMC learning curves. Each state's estimate (solid) is drawn next to
# the true value listed for it in Question 3(7) (dashed, same colour).
np.random.seed(10)
random.seed(10)
v, v_r2, Gt = MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 0)
x = np.arange(0, 500)
o = np.ones(500)

f = plt.figure(figsize=(20, 10))

# (state, true value, colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label="State " + str(state_id))
    plt.plot(x, true_value*o, color=colour, linestyle="--", label="State " + str(state_id) + " True")
plt.legend()

plt.title("EVMC estimates through time vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.savefig("problem 3, Q-9.pdf")

plt.show()
plt.close()

#### Question 3(10)

In [None]:
# Q10: TD learning curves. Each state's estimate (solid) is drawn next to
# the true value listed for it in Question 3(7) (dashed, same colour).
np.random.seed(10)
random.seed(10)
v, v_r2, Gt = TemporalDifferencePrediction(env, pi, gamma, alpha, 500)
x = np.arange(0, 500)
o = np.ones(500)

f = plt.figure(figsize=(20, 10))

# (state, true value, colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label="State " + str(state_id))
    plt.plot(x, true_value*o, color=colour, linestyle="--", label="State " + str(state_id) + " True")
plt.legend()

plt.title("TD estimates through time vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.savefig("problem 3, Q-10.pdf")

plt.show()
plt.close()

#### Question 3(11)

In [None]:
# Q11: n-step TD (n=3) learning curves. Each state's estimate (solid) is drawn
# next to the true value listed for it in Question 3(7) (dashed, same colour).
np.random.seed(10)
random.seed(10)
v, v_r2 = nStepTD(env, pi, gamma, alpha, 3, 500)
x = np.arange(0, 500)
o = np.ones(500)

f = plt.figure(figsize=(20, 10))

# (state, true value, colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label="State " + str(state_id))
    plt.plot(x, true_value*o, color=colour, linestyle="--", label="State " + str(state_id) + " True")
plt.legend()

plt.title("n-Step TD estimates through time vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.savefig("problem 3, Q-11.pdf")

plt.show()
plt.close()

#### Question 3(12)

In [None]:
# Q12: TD(lambda) (lam=0.3) learning curves. Each state's estimate (solid) is
# drawn next to the true value listed for it in Question 3(7) (dashed).
np.random.seed(10)
random.seed(10)
v, v_r2, _ = TDlambda(env, pi, gamma, alpha, 0.3, 500, params)
x = np.arange(0, 500)
o = np.ones(500)

f = plt.figure(figsize=(20, 10))

# (state, true value, colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label="State " + str(state_id))
    plt.plot(x, true_value*o, color=colour, linestyle="--", label="State " + str(state_id) + " True")
plt.legend()

plt.title("TD(lambda) estimates through time vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.savefig("problem 3, Q-12.pdf")

plt.show()
plt.close()

#### Question 3(13)

In [None]:
# Q13: eligibility traces recorded by TD(lambda) (lam=0.3) during episode 100,
# one curve per non-terminal state.
random.seed(10)
np.random.seed(10)
v,v_r2, eligibility = TDlambda(env, pi, gamma, alpha, 0.3, 500, params)
episode_100_eligibility_records = np.array(eligibility[100])
x = np.arange(0,episode_100_eligibility_records.shape[0])

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, colour) in legend order
trace_specs = [
    (0, 'blue'), (1, 'green'), (2, 'pink'), (4, 'red'), (6, 'orange'),
    (8, 'teal'), (9, 'lime'), (10, 'navy'), (11, 'purple'),
]
for state_id, colour in trace_specs:
    plt.plot(x, episode_100_eligibility_records[:, state_id], color=colour, label=f"State {state_id}")
# The traces come from TDlambda, so the title names TD(lambda) rather than
# n-step TD.
plt.title("TD(lambda) Eligibility trace through time [Episode 100]")
plt.xlabel("Time Step")
plt.ylabel("Eligibility")
plt.legend()
plt.savefig("problem 3, Q-13.pdf")

plt.show()
plt.close()

#### Question 3(14)

In [None]:
# Q14: FVMC learning curves on a logarithmic episode axis.
random.seed(10)
np.random.seed(10)
v,v_r2 ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 1)
# Episodes are numbered from 1: an index of 0 has no position on a log axis,
# so the first episode would otherwise be lost.
x = np.arange(1,501)
o=np.ones(500)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, true value from Question 3(7), colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label=f"State {state_id}")
    plt.plot(x, true_value*o, color=colour, linestyle="--", label=f"State {state_id} True")
plt.legend()

plt.title("FVMC estimates through time [logscale] vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.xscale('log')
plt.savefig("problem 3, Q-14.pdf")

plt.show()
plt.close()

#### Question 3(15)

In [None]:
# Q15: EVMC learning curves on a logarithmic episode axis.
random.seed(10)
np.random.seed(10)
v,v_r2 ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 0)
# Episodes are numbered from 1: an index of 0 has no position on a log axis,
# so the first episode would otherwise be lost.
x = np.arange(1,501)
o=np.ones(500)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, true value from Question 3(7), colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label=f"State {state_id}")
    plt.plot(x, true_value*o, color=colour, linestyle="--", label=f"State {state_id} True")
plt.legend()

plt.title("EVMC estimates through time [logscale] vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.xscale('log')

plt.savefig("problem 3, Q-15.pdf")

plt.show()
plt.close()

#### Question 3(16)

In [None]:
# Q16: TD learning curves on a logarithmic episode axis.
random.seed(10)
np.random.seed(10)
v,v_r2 ,Gt= TemporalDifferencePrediction(env, pi, gamma, alpha, 500)
# Episodes are numbered from 1: an index of 0 has no position on a log axis,
# so the first episode would otherwise be lost.
x = np.arange(1,501)
o=np.ones(500)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, true value from Question 3(7), colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label=f"State {state_id}")
    plt.plot(x, true_value*o, color=colour, linestyle="--", label=f"State {state_id} True")
plt.legend()

plt.title("TD estimates through time [logscale] vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.xscale('log')
plt.savefig("problem 3, Q-16.pdf")

plt.show()
plt.close()

#### Question 3(17)

In [None]:
# Q17: n-step TD (n=3) learning curves on a logarithmic episode axis.
random.seed(10)
np.random.seed(10)
v,v_r2 = nStepTD(env,pi,gamma,alpha,3,500)
# Episodes are numbered from 1: an index of 0 has no position on a log axis,
# so the first episode would otherwise be lost.
x = np.arange(1,501)
o=np.ones(500)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, true value from Question 3(7), colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label=f"State {state_id}")
    plt.plot(x, true_value*o, color=colour, linestyle="--", label=f"State {state_id} True")
plt.legend()

plt.title("n-Step TD estimates through time [logscale] vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.xscale('log')
plt.savefig("problem 3, Q-17.pdf")

plt.show()
plt.close()

#### Question 3(18)

In [None]:
# Q18: TD(lambda) (lam=0.3) learning curves on a logarithmic episode axis.
random.seed(10)
np.random.seed(10)
v,v_r2, _ = TDlambda(env, pi, gamma, alpha, 0.3, 500, params)
# Episodes are numbered from 1: an index of 0 has no position on a log axis,
# so the first episode would otherwise be lost.
x = np.arange(1,501)
o=np.ones(500)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

# (state, true value from Question 3(7), colour) in legend order
curve_specs = [
    (0, 5/6, 'blue'), (1, 4/6, 'green'), (2, 3/6, 'pink'),
    (4, 2/6, 'red'), (6, 1/6, 'orange'), (8, 1/6, 'teal'),
    (9, 1/6, 'lime'), (10, 1/6, 'navy'), (11, 1/6, 'purple'),
]
for state_id, true_value, colour in curve_specs:
    plt.plot(x, v_r2[:, state_id], color=colour, label=f"State {state_id}")
    plt.plot(x, true_value*o, color=colour, linestyle="--", label=f"State {state_id} True")
plt.legend()

plt.title("TD(lambda) estimates through time [logscale] vs. true values")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.xscale('log')
plt.savefig("problem 3, Q-18.pdf")

plt.show()
plt.close()

#### Question 3(19)

In [None]:
# First episode index shown in the algorithm comparison plot of Question 3(19).
START_EPISODE = 100

In [None]:
# Q19: compare all five estimators on state 4 from START_EPISODE onwards.
random.seed(10)
np.random.seed(10)
x = np.arange(START_EPISODE,500)
o=np.ones(500 - START_EPISODE)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(5)

# v_r2_mf is the first-visit run (firstVisit=1), v_r2_me the every-visit run.
v,v_r2_mf ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 1)
v,v_r2_me ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 0)
v,v_r2_td ,Gt= TemporalDifferencePrediction(env, pi, gamma, alpha, 500)
v,v_r2_ntd = nStepTD(env,pi,gamma,alpha,3,500)
v,v_r2_tdlambda, _ = TDlambda(env, pi, gamma, alpha, 0.3, 500, params)

# Legend labels follow the firstVisit flag used for each run above.
plt.plot(x,v_r2_mf[START_EPISODE:,4],color = 'blue', label="FVMC")
plt.plot(x,v_r2_me[START_EPISODE:,4],color = 'red', label="EVMC")
plt.plot(x,v_r2_td[START_EPISODE:,4],color = 'green', label="TD")
plt.plot(x,v_r2_ntd[START_EPISODE:,4],color = 'orange', label="nStep TD")
plt.plot(x,v_r2_tdlambda[START_EPISODE:,4],color = 'black', label="TD Lambda")
plt.plot(x,2/6*o,color = 'red',linestyle = "--", label="True value")

plt.title(f"Evaluation of various algorithms on State 4 through time from Episodes : {START_EPISODE} to 500")
plt.xlabel("Episodes")
plt.ylabel("State-Value Function")
plt.legend()
plt.savefig("problem 3, Q-19.pdf")

plt.show()
plt.close()

#### Question 3(20)

In [None]:
# Q20: per-episode FVMC target (return) recorded for state 4.
random.seed(10)
np.random.seed(10)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

v,v_r2 ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 1)
x = np.arange(0,Gt.shape[0])
# Size the reference line from Gt here; an `o` left over from an earlier cell
# may have a different length than x.
o = np.ones(Gt.shape[0])
plt.scatter(x,Gt,color = 'blue', label="Gt values")
plt.plot(x,2/6*o,color = 'red',linestyle = "--", label="True value")
plt.title("FVMC Target Value (Gt) of State 4 through time")
plt.xlabel("Episodes")
plt.ylabel("Target Value (Gt)")
plt.legend()
plt.savefig("problem 3, Q-20.pdf")

plt.show()
plt.close()

#### Question 3(21)

In [None]:
# Q21: per-episode EVMC target (return) recorded for state 4.
random.seed(10)
np.random.seed(10)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

v,v_r2 ,Gt= MonteCarloPrediction(env, pi, gamma, alpha, 1000, 500, 0)
x = np.arange(0,Gt.shape[0])
# Size the reference line from Gt here; an `o` left over from an earlier cell
# may have a different length than x.
o = np.ones(Gt.shape[0])
plt.scatter(x,Gt,color = 'blue', label="Gt values")
plt.plot(x,2/6*o,color = 'red',linestyle = "--", label="True value")

plt.title("EVMC Target Value (Gt) of State 4 through time")
plt.xlabel("Episodes")
plt.ylabel("Target Value (Gt)")
# The labels above are only shown once the legend is drawn.
plt.legend()
plt.savefig("problem 3, Q-21.pdf")

plt.show()
plt.close()

#### Question 3(22)

In [None]:
# Q22: per-episode TD target recorded for state 4.
random.seed(10)
np.random.seed(10)

f = plt.figure()
f.set_figwidth(20)
f.set_figheight(10)

v,v_r2 ,Gt= TemporalDifferencePrediction(env, pi, gamma, alpha, 500)
x = np.arange(0,500)
o=np.ones(500)
plt.scatter(x,Gt,color = 'blue', label="Gt values")
plt.plot(x,2/6*o,color = 'red',linestyle = "--", label="True value")
plt.title("TD Target Value (Gt) of State 4 through time")
plt.xlabel("Episodes")
plt.ylabel("Target Value (Gt)")
# The labels above are only shown once the legend is drawn.
plt.legend()
plt.savefig("problem 3, Q-22.pdf")

plt.show()
plt.close()