In [3]:
#This session is mostly (99.9%) from pymc website: https://www.pymc.io/projects/examples/en/latest/case_studies/reinforcement_learning.html
#Also in archive: https://archive.ph/Srci7

import os
#from IPython.display import HTML

#Tables and matrices
import numpy as np
import pandas as pd

#Stats
import scipy.stats as st
from scipy.optimize import fmin
from scipy import integrate
from scipy.stats.mstats import mquantiles
import statistics 


#Probabilistic programs
#!pip install pymc==5.0.2
#!pip install pytensor
import pymc as pm
import pytensor.tensor as pt
import pytensor 
#import aesara.tensor as at
#Report the versions of the probabilistic-programming stack, one library per line
print(
    'Running on PyMC v{}'.format(pm.__version__),
    'Running on Pytensor v{}'.format(pytensor.__version__),
    sep='\n',
)

#Graphs
import seaborn as sns
import plotly.graph_objects as go
import altair as alt
#from altair_saver import save #also install from the terminal: brew cask install chromedriver
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib import animation, rc
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, HBox, VBox, Layout
#from graphviz import Source, Digraph
#import dot2tex as d2t
#from latex import build_pdf
from mpl_toolkits.mplot3d import axes3d
import arviz as az

#User-defined functions (in the same folder as the notebook)
import my_fun as mf


Running on PyMC v5.1.0
Running on Pytensor v2.10.1


# Reinforcement Learning

Santiago Alonso-Díaz

<center><img src="img/11_CB/SuttonBarto.png" width = "600" height = '600'></center>

In [5]:
%%HTML
<video width="620" height="540" controls>
  <source src="img/11_CB/ReinforcedLearningVideogames.mov">
</video>

<center><img src="img/11_CB/Mnih2015.png" width = "600" height = '600'></center>
Mnih et al., 2015

<center><img src="img/11_CB/Triqui1.png" width = "400" height = '400'></center>


In this state S, there are only two available actions (G, H)
<center><img src="img/11_CB/Triqui2.png" width = "400" height = '400'></center>


Which of these states is more valuable under a random policy? Green/X plays
<center><img src="img/11_CB/Triqui3.png" width = "600" height = '600'></center>


One way to learn the value of a state, $V(s)$, after taking action $a$ at time $t+1$ is reinforcement learning. For instance,
<br><br>
$$V(s)_{t+1} = V(s)_{t} + \alpha (R(s)_{t+1} - V(s)_{t}) $$
<br>
$\alpha$: Learning rate<br>
$R$: Reward<br>
$t$: Time


# Case: Multi-armed Bandit

<center><img src="img/11_CB/MAB1.png" width = "600" height = '600'></center>
Daw et al., 2006

$q(a)$ is an idealized quantity, i.e., the true expected reward of taking action $a$:
$$q(a) \doteq \mathbb{E}[R_t | A_t = a] $$
<br>
One should pick the arm with the highest $q(a)$:
$$Argmax_a q(a)$$

Problem: we do not know $q(a)$. Thus, we symbolize our current estimate at time $t$ as:
$$ Q_t (a)$$

One possibility for $Q_t(a)$ is the mean of the rewards $R$ up to time $t-1$ when choosing $a$:

$$ Q_t(a) = \frac{\sum_{i=1}^{t-1} R_i \cdot \mathbb{1}_{A_i=a}}{\sum_{i=1}^{t-1}\mathbb{1}_{A_i=a}}$$

<H2 style="text-align: center;">Dilemma</H2>
<p style="text-align: center;">Exploit ($Argmax_a Q_t(a)$)</p>
<p style="text-align: center;">vs</p>
<p style="text-align: center;">Explore (any action other than $Argmax_a Q_t(a)$)</p>

Algorithm examples:
* Greedy ($Argmax_a Q_t(a)$)
* $\epsilon$-greedy ($Argmax_a Q_t(a)$ with probability $1-\epsilon$, an action chosen uniformly at random with probability $\epsilon$)
 

Which one to choose with a greedy policy?
<center><img src="img/11_CB/MAB2.png" width = "500" height = '500'></center>


What is the probability of choosing 143 with $\epsilon=0.5$?
<center><img src="img/11_CB/MAB2.png" width = "500" height = '500'></center>


The greedy algorithm only exploits, while $\epsilon$-greedy allows for exploration

Which policy is better? It depends
<center><img src="img/11_CB/MAB3.png" width = "500" height = '500'></center>
Sutton & Barto, 2020



Some simulations:

<center><img src="img/11_CB/MAB4.png" width = "500" height = '500'></center>

Sutton & Barto, 2020