# Assignment 2 - Action Critic

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>


## Importing packages

In [1]:
import gymnasium as gym
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import os


os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
rnd = np.random.default_rng(112233)

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Building Cart Pole

In [2]:
env = gym.make('CartPole-v1')
env.reset()

(array([ 0.00264488, -0.02958185,  0.01406104,  0.01702181], dtype=float32),
 {})

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Building Q-learning class

In [3]:
class qlearning():
    def __init__(self, decision_matrix, alpha=.85, gamma=.95,epsilon=.1):
        self.a = alpha
        self.g = gamma
        self.q = decision_matrix
        
        self.e = epsilon

        return

    def update(self, reward, state, action, next_state): 
        
        self.q[state, action] = self.q[state, action] + self.a * (
            reward + self.g * np.max(self.q[next_state, :]) - self.q[state, action])

        return None

    def choose(self, env, state):
        
        if rnd.random()< self.e:
            # random sampling
            chosen = rnd.choice(list(range(env.action_space.n)))
        else:
            # greedy choice
            chosen = np.argmax(self.q[state])
        return chosen

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Building Action critic class

In [4]:
class AC():
    def __init__(self, decision_matrix, alpha=.85, gamma=.95,epsilon=.1):
        self.a = alpha
        self.g = gamma
        self.q = decision_matrix
        self.e = epsilon

        return

    def update(self, reward, state, action, next_state): 
        
        self.q[state, action] = self.q[state, action] + self.a * (
            reward + self.g * np.max(self.q[next_state, :]) - self.q[state, action])

        return None

    def choose(self, env, state):
        
        if rnd.random()< self.e:
            # random sampling
            chosen = rnd.choice(list(range(env.action_space.n)))
        else:
            # greedy choice
            chosen = np.argmax(self.q[state])
        return chosen

In [5]:
# table generator and converter

def gen_table(env, bins=10):
    observation_dim = len(env.observation_space.high)
    action_dim = env.action_space.n
    
    table_dim = [bins] * observation_dim + [action_dim] 
    
    table = np.zeros((table_dim))
                     
    return table
    
def get_s(state, table, env, bins=10):
    result = table
    for i, feature in enumerate(state):
        max_value = env.observation_space.high[i]
        min_value = env.observation_space.low[i]


        window_size = (max_value - min_value) / bins
        bin_loc = (feature - min_value) // window_size
        result = result[int(bin_loc)]

    return result

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Building the training process

In [6]:
# defining one episode
def episode(model, env, n_bins):
    state = env.reset()
    state = get_s(state[0], model.q, env, n_bins)

    ended = False
    reward = 0

    while not ended:

        action = model.choose(env, state)

        # take A from S and get S'
        new_state, reward, ended, time_limit, prob = env.step(action)
        new_state = get_s(new_state, model.q, env, n_bins)

        model.update(reward, state, action, new_state)

        # S <- S'
        state = new_state

        if time_limit:
            break

    return reward

In [7]:
# defining process for each of the runs
def run(model, env, episode_n=1000,verbose=True,n_bins=10):
    run_results = {}
    for i, mode in enumerate(range(episode_n)):
        if verbose:
            print(f"\n{i + 1}th Segment:", end='')
            
        run_results[i] = episode(model, env, n_bins)

    return run_results

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>


## Running the models

In [8]:
# configurations
n_bins = 10

epsilons = [.01,.1,.5]
learning_rates = [.15,.5,.85]

n_runs = 10
rolling_window = 10



training_size = 10
testing_size = 1


q_table = gen_table(env,n_bins)
q_table.shape

(10, 10, 10, 10, 2)

In [9]:
# Declaring the model

models = []

general_results = {}
for alpha in learning_rates:
    general_results[alpha] = {}
    for epsilon in epsilons:
        general_results[alpha][epsilon] = {}

        #creating model to use as standard for each run config
        models.append(qlearning(q_table.copy(),alpha=alpha,epsilon=epsilon))

In [10]:
# Runing the training

for model in models:
    print(f'Training on |Epsilon: {str(model.e)}\t| alpha: {str(model.a)}')
    
    for i in range(n_runs):
        # creating model copies for each run 
        n_model =qlearning(q_table.copy(),alpha=model.a,epsilon=epsilon)
        general_results[model.a][model.e][i] = run(n_model,env,verbose=False,n_bins=n_bins)
        
    

Training on |Epsilon: 0.01	| alpha: 0.15


  window_size = (max_value - min_value) / bins


IndexError: arrays used as indices must be of integer (or boolean) type

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Generating the dataset

In [None]:
df = pd.DataFrame.from_dict({(a,b,c,d,e,f): general_results[a][b][c][d][e][f]
                                           for a in general_results.keys() 
                                           for b in general_results[a].keys()
                                           for c in general_results[a][b].keys()
                                           for d in general_results[a][b][c].keys()
                                           for e in general_results[a][b][c][d].keys()
                                           for f in general_results[a][b][c][d][e].keys()},
                                           orient='index')

In [None]:
df = df.reset_index()
df = df.rename(columns={'level_0':'sarsa','level_1':'alpha','level_2':'temperature','level_3':'run','level_4':'segment','level_5':'episode'})
df.head(25)

<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>
<div style="border-bottom: 3px solid black; margin-bottom:5px"></div>

## Results
