In [None]:
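# Implementing tabular Q-learning on the Pedestrian environment (PedestrianEnv from ped_car).
# NOTE: this header originally read "Inverted Pendulum Problem"; presumably carried over from the reference below.
# Reference: https://github.tamu.edu/desik-rengarajan/IRL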

from ped_car import PedestrianEnv
import numpy as np
import random
import math
from time import sleep
import matplotlib.pyplot as plt
import pdb
import pickle
%matplotlib inline

## Initialize the "Pedestrian" environment
# A single shared instance: select_action samples from env.action_space and the
# training loop drives it through env.reset(...) / env.step(...).
env = PedestrianEnv()
# Limits of the observed state, one entry per state dimension.
# (An earlier configuration used low=[0,0,1.5,10,-3] and high=[10,75,7.5,15,3].)
# CHANGE HERE to retune the covered range.
observation_space_low = [0, 0, 2.5, 10, -3]
observation_space_high = [10, 60, 10, 15, 3]

## Environment-related constants

# Number of discrete buckets along each state dimension. The original note
# labels the dimensions (p_y, c_x, c_y, c_v, c_a) and asks for one extra bucket
# on every axis.
# CHANGE HERE together with the bounds above.
NUM_BUCKETS = (21, 61, 4, 21, 13)
NUM_ACTIONS = 5

# (low, high) pair for every state dimension.
STATE_BOUNDS = [(low, high) for low, high in zip(observation_space_low, observation_space_high)]

# Range of the continuous action and the evenly spaced edges that discretize it.
action_space_low = -2
action_space_high = 2
action_bins = np.linspace(action_space_low, action_space_high, NUM_ACTIONS)

# Bin edges per state dimension: NUM_BUCKETS[d] - 1 edges, so np.digitize can
# return NUM_BUCKETS[d] distinct indices (0 .. NUM_BUCKETS[d] - 1).
# CHANGE HERE if the number of state dimensions changes.
state_bins = [
    np.linspace(STATE_BOUNDS[dim][0], STATE_BOUNDS[dim][1], NUM_BUCKETS[dim] - 1)
    for dim in range(5)
]

## Simulation-related constants
NUM_EPISODES = 300000
# MAX_T = 200
DEBUG_MODE = False

## Q-table holding one value per (discretized state, action index) pair
q_table = np.zeros(shape=NUM_BUCKETS + (NUM_ACTIONS,))


# with open('ped_try.pickle', 'rb') as f:
#     q_table = pickle.load(f)
    
def select_action(state, explore_rate):
    """Choose an action for `state` with an epsilon-greedy rule.

    With probability `explore_rate` the action is sampled from
    `env.action_space`; otherwise the index of the largest Q-value stored for
    `state` is converted into an action by `map_action`.
    """
    exploring = random.random() < explore_rate
    if exploring:
        # Exploration: let the environment draw a random action.
        return env.action_space.sample()

    # Exploitation: best-valued action index for this state.
    greedy_idx = np.argmax(q_table[state])
    return map_action(greedy_idx)

def map_action(action_idx):
    """Convert a discrete action index into a continuous action value.

    Index 0 maps to exactly `action_space_low` and index `NUM_ACTIONS` to
    exactly `action_space_high`. Any other index draws uniformly between the
    two neighbouring edges `action_bins[action_idx-1]` and
    `action_bins[action_idx]`.

    Returns a length-1 array (the draw uses ``size=1``).
    """
    if action_idx == 0:
        bounds = (action_space_low, action_space_low)
    elif action_idx == NUM_ACTIONS:
        bounds = (action_space_high, action_space_high)
    else:
        bounds = (action_bins[action_idx-1], action_bins[action_idx])

    return np.random.uniform(low=bounds[0], high=bounds[1], size=1)

def bucket_action(action):
    """Return the discrete bucket index of a continuous action.

    Buckets are delimited by `action_bins`. Because ``right=True`` is used, an
    action equal to an edge belongs to the bucket that edge closes
    (``bins[i-1] < action <= bins[i]``).
    """
    bucket_idx = np.digitize(action, action_bins, right=True)
    return bucket_idx

def bucket_state(states):
    """Discretize every component of `states` and return the indices as a tuple.

    Component ``d`` is bucketed against ``state_bins[d]`` with ``right=True``,
    so a value equal to an edge belongs to the bucket that edge closes. The
    resulting tuple can be used directly to index `q_table`.
    """
    return tuple(
        np.digitize(value, state_bins[dim], right=True)
        for dim, value in enumerate(states)
    )
## Learning-related parameters

# Upper and lower limits of the two decaying rates.
max_explore_rate = 1
max_learn_rate = 1
min_explore_rate = 0.01
min_learn_rate = 0.01

# Current rates; both start at 1.
learning_rate = 1
explore_rate = 1

# Decay constants for the exploration and learning rates respectively.
decay_rate_exp = 0.00002
decay_rate_lea = 0.0002

discount_factor = 0.99

# Per-episode logs: cumulative reward and the two rates in effect.
rew = np.zeros(NUM_EPISODES)
Rate_explore = np.zeros(shape=(NUM_EPISODES, 1))
Rate_learn = np.zeros(shape=(NUM_EPISODES, 1))

# Outcome counters restricted to episodes flagged as static.
static_count = static_death_toll = static_safe_chicken = 0

# Outcome counters over all episodes.
death_toll = safe_chicken = done_count = count = 0


In [None]:
# # ## Defining the simulation related constants
# NUM_EPISODES = 100000

# import ped_car
# reload(ped_car)
# from ped_car import PedestrianEnv

# Tabular Q-learning training loop, one iteration per episode.
# NOTE: the Q-table, rates and counters used here are created in the setup
# cell; this cell does not reset them, so re-running it continues from their
# current values.
for episode in range(NUM_EPISODES):
    # reset() receives a random integer in {1, 2, 3} and returns the initial
    # state plus a flag marking the episode as "static".
    # NOTE(review): the meaning of the integer is defined by PedestrianEnv -- confirm.
    c_state,static_state = env.reset(np.random.randint(1,4))
    c_state = bucket_state(c_state)
    cum_rew_ep = 0
    if static_state:
        static_count+=1
    while True:
        # Select an action (continuous value) with the epsilon-greedy policy
        action = select_action(c_state, explore_rate)

        # Execute the action
        n_state, reward, done = env.step(action)

        # Keep the raw next state only so it can be printed when the episode ends
        temp = n_state
        n_state = np.array(n_state)

        # Discretize the action and the next state so they can index the Q-table
        action = bucket_action(action)
        n_state = bucket_state(n_state)

        # Q-learning update: blend the old estimate with the bootstrapped target.
        # NOTE(review): the target bootstraps from best_q even when `done` is
        # set; if `done` always marks a true terminal state the bootstrap term
        # should be dropped there -- confirm against PedestrianEnv.
        best_q = np.amax(q_table[n_state])
        try:
            q_table[c_state + (action,)] = ((1-learning_rate) * q_table[c_state + (action,)] +
                                        learning_rate * (reward + discount_factor * best_q))
        except IndexError:
            # bucket_action yields NUM_ACTIONS for actions above
            # action_space_high, which is one past the last Q-table column.
            # Skip that update, but say so instead of silently printing a blank
            # line; any other exception now propagates instead of being hidden.
            print('Skipped Q-update: action bucket {} is outside the Q-table'.format(action))

        # Setting up for the next iteration
        c_state = n_state
        cum_rew_ep += reward

        # Print data and tally the outcome once the episode is over
        if done:
            print(temp)
            print('EPISODE:'+str(episode)+' STATIC: '+str(static_state)
                  +'   REWARD: '+str(reward) + ' Cumulative Reward: '+str(cum_rew_ep))

            done_count+=1
            # A final reward of -100 is counted as a death
            if (reward==-100):
                death_toll+=1
                if static_state:
                    static_death_toll+=1

            # A final reward of 75 is counted as a safe arrival
            if (reward==75):
                safe_chicken+=1
                if static_state:
                    static_safe_chicken+=1
            break

    # Log the rates that were in effect during this episode
    Rate_explore[episode] = explore_rate
    Rate_learn[episode] = learning_rate

    # Log the episode's cumulative reward
    rew[episode] = cum_rew_ep

    # Exponential decay of both rates from their max towards their min
    decay_parameter_exp = np.exp(-decay_rate_exp * (episode+1))
    decay_parameter_lea = np.exp(-decay_rate_lea * (episode+1))
    print(explore_rate)
    explore_rate = min_explore_rate + (max_explore_rate - min_explore_rate)*decay_parameter_exp
    learning_rate = min_learn_rate + (max_learn_rate - min_learn_rate)*decay_parameter_lea

#Results
def _percent(part, whole):
    """Return `part` as a percentage of `whole`, or NaN when `whole` is zero."""
    # Guard against ZeroDivisionError when no episode of that kind occurred.
    return part*100/whole if whole else float('nan')

# Outcome tallies over all finished episodes
print('Episodes', done_count)
print('Safe_chicken',safe_chicken)
print('Death_toll '+str(death_toll))
print('Did_not_reach '+str(done_count-safe_chicken-death_toll))
print('Death_toll % '+str(_percent(death_toll, done_count)))

# The same tallies restricted to episodes flagged as static
print('Static Episodes', static_count)
print('Static Safe_chicken',static_safe_chicken)
print('Static Death_toll '+str(static_death_toll))
print('Static Did_not_reach '+str(static_count - static_safe_chicken - static_death_toll))
print('Static Death_toll % '+str(_percent(static_death_toll, static_count)))

In [None]:
# Indices of every Q-table entry with a positive value (one index array per axis).
np.nonzero(q_table > 0)


In [None]:
# print(action_bins)

In [None]:
# Smooth the per-episode cumulative reward by averaging consecutive chunks.
CHUNK_SIZE = 100
num_iter = NUM_EPISODES // CHUNK_SIZE
ma_rew = np.array([
    np.mean(rew[start:start + CHUNK_SIZE])
    for start in range(0, num_iter * CHUNK_SIZE, CHUNK_SIZE)
])

# Plot the chunk means and store the figure as both PDF and PNG.
chunk_axis = np.arange(ma_rew.size)
plt.plot(chunk_axis, ma_rew)
plt.xlabel('Episodes / {}'.format(CHUNK_SIZE))
plt.ylabel('Cumulative Reward')
plt.savefig('Episodic Cumulative Reward Ped2 latest small time.pdf')
plt.savefig('Episodic Cumulative Reward Ped2 latest small time.png')



In [None]:
# Exploration and learning rate recorded for every episode, on a shared axis.
episode_axis = np.arange(NUM_EPISODES)
for series, series_label in ((Rate_explore, 'Exploration Rate'),
                             (Rate_learn, 'Learning Rate')):
    plt.plot(episode_axis, series, label=series_label)
plt.xlabel('Episodes')
plt.legend()


In [None]:
# Display the learning rate logged for episode index 20000
# (raises IndexError unless NUM_EPISODES > 20000).
Rate_learn[20000]

In [None]:
# print(q_table)
# Persist the current Q-table to 'ped_try.pickle' in binary mode.
import pickle
with open('ped_try.pickle', 'wb') as out_file:
    pickle.dump(q_table, out_file)

In [None]:
# Print the indices of every positive Q-table entry (one index array per axis).
positive_entries = np.nonzero(q_table > 0)
print(positive_entries)