In [0]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import random

In [0]:
def UCBfunc(eps_list,arms_total=10,steps=1000,runs=2000):
    """Simulate UCB1 action selection on randomly generated Gaussian bandits.

    For every exploration constant in ``eps_list`` the function plays ``runs``
    independent bandit problems of ``steps`` pulls each and averages the
    per-step statistics over the runs.

    Each run draws ``arms_total`` true action values from a standard normal;
    pulling arm ``a`` yields a sample from ``Normal(q_star[a], 1)``. The first
    ``arms_total`` steps pull every arm once in index order; afterwards the arm
    maximising ``Q[a] + sqrt(c * ln(i) / N[a])`` is pulled, where ``i`` is the
    0-based step index.

    Parameters
    ----------
    eps_list : sequence of float
        Exploration constants ``c`` to evaluate, one output curve per entry.
        The parameter name is historical and is kept so existing callers keep
        working.
    arms_total : int, default 10
        Number of arms of each bandit.
    steps : int, default 1000
        Number of pulls per run.
    runs : int, default 2000
        Number of independent bandit problems per constant.

    Returns
    -------
    (R_plot, Opt_plot) : tuple of two lists
        One ``np.ndarray`` of length ``steps`` per constant. ``R_plot[k][i]``
        is the run-average of the value recorded at step ``i``;
        ``Opt_plot[k][i]`` is the fraction of runs (0..1) in which the arm
        pulled at step ``i`` was the truly best arm.
    """
    print("\nUCB1 Algo")

    def bandit1(i,qstar_arr):
        # Reward of arm i: unit-variance Gaussian around its true value.
        return np.random.normal(qstar_arr[i],1)

    R_plot = []
    Opt_plot = []

    # Iterate over the *argument*. The previous code looped over a global
    # named `c_list`, which raised NameError unless the notebook happened to
    # have defined that variable in an earlier cell.
    for c in eps_list:
        R_avg = np.zeros(steps)
        opt_avg = np.zeros(steps)

        for j in tqdm_notebook(range(runs)):
            qstar_arr = np.random.standard_normal(arms_total)
            true_best = np.argmax(qstar_arr)

            R_arr = np.zeros(steps)
            opt_arr = np.zeros(steps)
            Qarr = np.zeros(arms_total)   # sample-average value estimates
            Narr = np.zeros(arms_total)   # pull counts

            for i in range(steps):
                if i < arms_total:
                    # Warm-up: pull every arm once so no count is zero when
                    # the confidence bonus divides by Narr below.
                    best = i
                else:
                    # One pull per step, so i is the number of pulls so far.
                    Qarr_ucb = Qarr + np.sqrt(c*np.log(i)/Narr)
                    best = np.argmax(Qarr_ucb)

                R = bandit1(best,qstar_arr)
                Narr[best] += 1
                Qarr[best] += (R-Qarr[best])/Narr[best]

                # NOTE(review): the curve stores the updated estimate of the
                # pulled arm, not the sampled reward R, although it is later
                # plotted as "Average Reward". Left unchanged here; confirm
                # against the epsilon-greedy / softmax implementations.
                R_arr[i] = Qarr[best]

                if best == true_best:
                    opt_arr[i] = 1

            # Incremental mean over runs, applied to all steps at once.
            R_avg += (R_arr-R_avg)/(j+1)
            opt_avg += (opt_arr-opt_avg)/(j+1)

        R_plot.append(R_avg)
        Opt_plot.append(opt_avg)

    return R_plot,Opt_plot
        

In [0]:
def ucb_plots(R_plot,Opt_plot,c_list=None):
    """Plot the UCB1 reward curves and optimal-action curves.

    Two separate 12x6 figures are created: one with the curves in ``R_plot``
    and one with the curves in ``Opt_plot`` scaled by 100 (y-axis fixed to
    0..100). ``plt.show()`` is called once at the end.

    Parameters
    ----------
    R_plot : sequence of array-like
        One reward curve per exploration constant.
    Opt_plot : sequence of array-like
        One optimal-action curve per exploration constant, as fractions.
    c_list : sequence, optional
        Exploration constants used for the legend, aligned with the curves.
        When omitted, a module-level ``c_list`` is used if one exists (the
        behaviour of earlier versions); curves without a matching constant
        are labelled by their index.
    """
    if c_list is None:
        # Legacy fallback: older callers relied on a notebook-level variable.
        c_list = globals().get('c_list')

    def legend_text(prefix, idx):
        # Label from the constant when one is known, else from the position.
        if c_list is not None and idx < len(c_list):
            return prefix + str(c_list[idx])
        return "curve " + str(idx)

    colors = ['m', 'g', 'b', 'r', 'c','y', 'k']

    def color_for(idx):
        # Cycle through the palette so more than seven curves do not fail.
        return colors[idx % len(colors)]

    R_fig = plt.figure(figsize=(12,6)).add_subplot(111)
    Opt_fig = plt.figure(figsize=(12,6)).add_subplot(111)

    for i, curve in enumerate(R_plot):
        R_fig.plot(range(len(curve)), curve, color_for(i), label=legend_text("c = ", i))

    R_fig.title.set_text(r'UCB1 : 10 arms')
    R_fig.set_ylabel('Average Reward')
    R_fig.set_xlabel('Steps')
    R_fig.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    for i, curve in enumerate(Opt_plot):
        # asarray so that a plain list is scaled rather than repeated.
        percent = np.asarray(curve)*100
        Opt_fig.plot(range(len(curve)), percent, color_for(i), label=legend_text("c =", i))

    Opt_fig.title.set_text(r'UCB1 : 10 arms')
    Opt_fig.set_ylabel('% Optimal Action')
    Opt_fig.set_xlabel('Steps')
    Opt_fig.set_ylim(0,100)
    Opt_fig.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    plt.show()

In [0]:
if __name__ == '__main__':

    # To run UCB1 on its own instead of the three-way comparison:
    #c_list = [2]
    #R_plot_u,Opt_plot_u=UCBfunc(c_list)
    #ucb_plots(R_plot_u,Opt_plot_u)

    # Sibling modules providing the other two strategies.
    from rl1p1 import EpsilonGreedy
    from rl1p2 import Softmax

    def comparison_plot(eps_list,temp_list, c_list, arms_total,steps,runs):
        """Run epsilon-greedy, softmax and UCB1 and plot them side by side.

        Only the first curve returned by each algorithm (index 0) is drawn,
        and legends use the first entry of each parameter list.

        Parameters
        ----------
        eps_list : list
            Passed to ``EpsilonGreedy``; ``eps_list[0]`` appears in the legend.
        temp_list : list
            Passed to ``Softmax``; ``temp_list[0]`` appears in the legend.
        c_list : list
            Passed to ``UCBfunc``; ``c_list[0]`` appears in the legend.
        arms_total, steps, runs : int
            Forwarded unchanged to all three algorithms. ``steps`` is also the
            length of the x-axis.

        NOTE(review): ``EpsilonGreedy`` and ``Softmax`` are presumed to return
        the same ``(reward curves, optimal-action curves)`` pair as ``UCBfunc``
        - confirm in rl1p1 / rl1p2.
        """
        R_plot_e, Opt_plot_e = EpsilonGreedy(eps_list,arms_total,steps,runs)
        R_plot_s, Opt_plot_s = Softmax(temp_list,arms_total, steps, runs)
        R_plot_u, Opt_plot_u = UCBfunc(c_list, arms_total,steps,runs)

        reward_ax = plt.figure(figsize=(10,6)).add_subplot(111)
        optimal_ax = plt.figure(figsize=(10,6)).add_subplot(111)

        # Raw string: "\e" is not a valid escape sequence in a normal literal.
        series = [
            (R_plot_e[0], Opt_plot_e[0], 'r', r"Epsilon-greedy: $\epsilon$ = " + str(eps_list[0])),
            (R_plot_s[0], Opt_plot_s[0], 'k', "Softmax: T = " + str(temp_list[0])),
            (R_plot_u[0], Opt_plot_u[0], 'g', "UCB1: c = " + str(c_list[0])),
        ]

        x_axis = range(steps)
        for reward_curve, optimal_curve, color, text in series:
            reward_ax.plot(x_axis, reward_curve, color, label = text)
            optimal_ax.plot(x_axis, optimal_curve*100, color, label = text)

        reward_ax.title.set_text(' Average reward comparison between Epsilon greedy, Softmax, UCB1')
        reward_ax.set_xlabel('Steps', fontsize = 15)
        reward_ax.set_ylabel('Average reward', fontsize = 15)
        reward_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

        # Raw strings again: "\%" is likewise an invalid escape sequence.
        optimal_ax.title.set_text(r'$\%$ Optimal action comparison between Epsilon greedy, Softmax, UCB1')
        optimal_ax.set_xlabel('Steps', fontsize = 15)
        optimal_ax.set_ylabel(r'$\%$ Optimal Action', fontsize = 15)
        optimal_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

        plt.show()

    # Keyword arguments make the experiment size explicit: 1000 arms,
    # 10000 steps per run, 2000 runs.
    comparison_plot(
        eps_list=[0.1],
        temp_list=[0.1],
        c_list=[2],
        arms_total=1000,
        steps=10000,
        runs=2000,
    )


Epsilon Greedy Algo


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))



Softmax Algo


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))