In [1]:
import numpy as np
import math

In [65]:
#this is to define the state space 
v_n = 50                      #we are going to divide the velocity state space into 1500 discrete points
v_max = 0.07
v_min = -0.07
velocities = np.linspace (v_min , v_max , v_n)

p_n = 50
p_max = 0.6
p_min = -1.2
positions = np.linspace (p_min , p_max , p_n)

gamma = 0.99
reward = -1
acc = 0.001

In [66]:
def give_index (pos , vel):
    #this function takes the pos and the vel and then return the correspoding index of the state
    #in the state space
    v_div = (v_max - v_min) / (v_n - 1)
    y =  (vel - v_min) / v_div
    
    p_div = (p_max - p_min) / (p_n - 1)
    x = (pos - p_min) / p_div
    
    return (int(x) , int(y))

In [67]:
give_index (-1.2 , 0.07)

(0, 49)

In [68]:
def value_iteration (state_values):
    #this function takes the state values (initial)
    #and then use the value iteration to return the actual value function of the states
    
    #in the state values rows represent the position and the columns represent the velocity
    
    
    threshold = 1000
    epsilon = 0.01
    
    while (threshold > epsilon):
        #print (threshold)
        #compute the value function for the first sweep
        threshold = -10000
        
        for row in range(p_n):
            for col in range(v_n):
                
                current_vel = velocities[col]
                current_pos = positions[row]
                
                optimal_value = -math.inf
                temp_value = -1
                for action in [-1 , 0 , 1]:
                    new_position = current_vel + (action * 0.001) + math.cos (3 * current_pos) * (-0.025)
                    new_velocity = new_position + current_vel
                    
                    if (new_velocity > 0.07):
                        new_velocity = 0.07
                        
                    elif (new_velocity < -0.07):
                        new_velocity = -0.07
                    
                    
                    #this is for updating the value
                    if (new_position > 0.6):
                        temp_value = 0
                        
                    elif (new_position < -1.2):
                        temp_value = reward
                        
                    else:    
                        x , y = give_index(new_position , new_velocity)
                        temp_value = reward + gamma * state_values[x][y]
                        
                    optimal_value = max (optimal_value, temp_value)
                
                threshold = max (threshold ,  abs (optimal_value - state_values[row][col]))
                state_values[row][col] = optimal_value
            
    
    return (np.copy (state_values))
                    

In [69]:
state_value = np.zeros ((p_n , v_n))
final_values = value_iteration(state_value)

1000
6.793465209301
5.793465209301
0.9801
0.9702989999999998
0.9605960100000002
0.9509900498999997
0.9414801494010003
0.93206534790699
0.9227446944279203
0.9135172474836413
0.9043820750088045
0.895338254258716
0.88638487171613
0.8775210229989678
0.868745812768978
0.8600583546412892
0.851457771094875
0.8429431933839293
0.8345137614500864
0.826168623835585
0.817906937597229
0.809727868221259
0.8016305895390445
0.7936142836436559
0.7856781408072173
0.7778213593991481
0.770043145805154
0.7623427143471027
0.7547192872036312
0.7471720943315958
0.7397003733882812
0.7323033696543959
0.7249803359578522
0.7177305325982744
0.7105532272722925
0.7034476949995678
0.6964132180495746
0.6894490858690787
0.6825545950103908
0.6757290490602799
0.6689717585696826
0.6622820409839818
0.6556592205741438
0.6491026283684036
0.6426116020847203
0.6361854860638729
0.6298236312032301
0.6235253948912032
0.6172901409422877
0.6111172395328666
0.6050060671375377
0.5989560064661603
0.5929664464014976
0.5870367819374849


0.014681882057374196
0.014535063236792212
0.014389712604426563
0.014245815478375334
0.014103357323591581
0.01396232375036277
0.013822700512861275
0.013684473507723283
0.01354762877265614
0.013412152484931994
0.0132780309600804
0.013145250650481444
0.013013798143973077
0.012883660162529509
0.012754823560911177
0.012627275325300502
0.012501002572037123
0.01237599254632471
0.012252232620866721
0.01212971029465848
0.0120084131917082
0.011888329059786429
0.011769445769189701
0.0116517513115042
0.011535233798383615
0.011419881460398074
0.011305682645797788
0.011192625819333557
0.011080699561148322
0.010969892565526607
0.01086019363987134
0.010751591703481722
0.010644075786444773
0.010537635028583736
0.010432258678292783
0.010327936091513834
0.01022465673059969
0.010122410163290851
0.0100211860616497
0.009920974201037325
0.009821764459033488
0.009723546814441875
0.009626311346295324
0.009530048232832655
0.009434747750503902
0.00934040027300398
0.009246996270263708
0.009154526307568744
0.00906

In [70]:
print (state_value)

[[-99.90266158 -99.90266158 -99.90266158 ... -99.90266158 -99.90266158
  -99.90266158]
 [-99.90266158 -99.90266158 -99.90266158 ... -99.90266158 -99.90266158
  -99.90266158]
 [-99.90266158 -99.90266158 -99.90266158 ... -99.90266158 -99.90266158
  -99.90266158]
 ...
 [-99.90266158 -99.90266158 -99.90266158 ... -99.90363497 -99.90363497
  -99.90363497]
 [-99.90266158 -99.90266158 -99.90266158 ... -99.90363497 -99.90363497
  -99.90363497]
 [-99.90266158 -99.90266158 -99.90363497 ... -99.90363497 -99.90363497
  -99.90363497]]
