In [1]:
!pip install mdptoolbox-hiive

Collecting mdptoolbox-hiive
  Downloading mdptoolbox-hiive-4.0.3.1.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython->mdptoolbox-hiive)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: mdptoolbox-hiive
  Building wheel for mdptoolbox-hiive (setup.py) ... [?25l[?25hdone
  Created wheel for mdptoolbox-hiive: filename=mdptoolbox_hiive-4.0.3.1-py3-none-any.whl size=35120 sha256=045ad3d3de089832cd718391a02a64d870575710e56ec5fdf22d0a4985f9a640
  Stored in directory: /root/.cache/pip/wheels/3c/21/00/79fb3890bf11432b069070b7623416cd8b9f8501580692a52f
Successfully built mdptoolbox-hiive
Installing collected packages: jedi, mdptoolbox-hiive
Successfully installed jedi-0.19.1 mdptoolbox-hiive-4.0.3.1


In [2]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns

In [3]:
def mean(x, N):
    sum = np.cumsum(np.insert(x, 0, 0))
    return (sum[N:] - sum[:-N]) / float(N)

In [4]:
def evaluate_policy(prob_matrix, reward_matrix, optimal_policy, test_count=100, gamma=0.9):
    states = prob_matrix.shape[-1]
    episodes = states * test_count
    total_reward = 0
    for state in range(states):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            discount_rate = 1
            while True:
                action = optimal_policy[state]
                probs = prob_matrix[action][state]
                candidates = list(range(len(prob_matrix[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                reward = reward_matrix[state][action] * discount_rate
                episode_reward += reward
                discount_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / episodes


In [11]:
def value_iteration(prob_matrix, reward_matrix, discount_factor=0.9, epsilons=[1e-9]):
    data_frame = pd.DataFrame(columns=["Epsilon", "Optimal_Policy", "Iterations",
                                  "Time taken", "Reward", "Value Function"])
    for e in epsilons:
        value_itr = ValueIteration(prob_matrix, reward_matrix, gamma=discount_factor, epsilon=e, max_iter=int(1e15))
        value_itr.run()
        reward = evaluate_policy(prob_matrix, reward_matrix, value_itr.policy)
        info = [float(e), value_itr.policy, value_itr.iter, value_itr.time, reward, value_itr.V]
        data_frame.loc[len(data_frame)] = info
    return data_frame

In [6]:
def policy_iteration(prob_matrix, reward_matrix, discount_factor=0.9):
  policy_itr = PolicyIteration(prob_matrix, reward_matrix, gamma=discount_factor, max_iter=1e6)
  policy_itr.run()
  policy = policy_itr.policy
  reward = evaluate_policy(prob_matrix, reward_matrix, policy)
  iterations = policy_itr.iter
  time = policy_itr.time
  return iterations, time, reward




In [10]:
def q_learning(prob_matrix, reward_matrix, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001],
            epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    q_df = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min",
                                 "Epsilon", "Epsilon Decay", "Reward",
                                 "Time", "Policy", "Value Function",
                                 "Training Rewards"])

    count = 0
    for i in n_iter:
        for eps in epsilon:
            for eps_dec in epsilon_decay:
                for a_dec in alpha_dec:
                    for a_min in alpha_min:
                        q = QLearning(prob_matrix, reward_matrix, discount, alpha_decay=a_dec,
                                      alpha_min=a_min, epsilon=eps,
                                      epsilon_decay=eps_dec, n_iter=i)
                        q.run()
                        reward = evaluate_policy(prob_matrix, reward_matrix, q.policy)
                        count += 1
                        print("{}: {}".format(count, reward))
                        st = q.run_stats
                        rews = [s['Reward'] for s in st]
                        info = [i, a_dec, a_min, eps, eps_dec, reward,
                                q.time, q.policy, q.V, rews]

                        df_length = len(q_df)
                        q_df.loc[df_length] = info
    return q_df

FOREST MANAGEMENT WITH 50 STATES

In [12]:
prob_matrix, reward_matrix = forest(S=50, r1=10, r2= 5, p=0.01)

In [13]:
print( value_iteration(prob_matrix, reward_matrix, epsilons=[1e-1, 1e-2, 1e-3, 1e-5, 1e-9, 1e-12]))


        Epsilon                                     Optimal_Policy  \
0  1.000000e-01  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  1.000000e-02  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  1.000000e-03  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  1.000000e-05  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  1.000000e-09  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
5  1.000000e-12  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

   Iterations  Time taken    Reward  \
0          59    0.003872  2.252649   
1          79    0.003982  2.273600   
2          99    0.008129  2.324879   
3         139    0.011879  2.205875   
4         219    0.019604  2.330401   
5         279    0.013138  2.318898   

                                      Value Function  
0  (4.701569160732358, 5.23097346057433, 5.230973...  
1  (4.710556185449387, 5.239434944489701, 5.23943...  
2  (4.711643009800913, 5.2404695099104295, 5.2404...  
3  (4.711790503

In [14]:
print(policy_iteration(prob_matrix, reward_matrix))

(26, 0.031317710876464844, 2.315741141279923)


In [15]:
print(q_learning(prob_matrix, reward_matrix, discount=0.9, alpha_dec=[0.99, 0.999], alpha_min=[0.001, 0.0001],
            epsilon=[10.0, 1.0], epsilon_decay=[0.99, 0.999], n_iter=[1000000, 10000000]))

1: 2.698958896143808
2: 1.04
3: 2.654422944691908
4: 0.94
5: 0.96
6: 2.7845738650567076
7: 0.9
8: 2.633606735223075
9: 2.7308972211252103
10: 1.04
11: 2.6349138726147725
12: 2.674642988618572
13: 2.68238365748513


KeyboardInterrupt: ignored

FOREST MANAGEMENT WITH 400 STATES

In [16]:
prob_matrix, reward_matrix = forest(S=400, r1=10, r2= 5, p=0.01)

In [17]:
print( value_iteration(prob_matrix, reward_matrix, epsilons=[1e-1, 1e-2, 1e-3, 1e-5, 1e-9, 1e-12]))


        Epsilon                                     Optimal_Policy  \
0  1.000000e-01  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  1.000000e-02  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  1.000000e-03  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  1.000000e-05  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  1.000000e-09  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
5  1.000000e-12  (0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

   Iterations  Time taken    Reward  \
0          59    0.033690  1.157760   
1          79    0.035558  1.151885   
2          99    0.047704  1.165787   
3         139    0.056917  1.157420   
4         219    0.053029  1.169877   
5         279    0.066078  1.163662   

                                      Value Function  
0  (4.701569160732358, 5.23097346057433, 5.230973...  
1  (4.710556185449387, 5.239434944489701, 5.23943...  
2  (4.711643009800913, 5.2404695099104295, 5.2404...  
3  (4.711790503

In [18]:
print(policy_iteration(prob_matrix, reward_matrix))

(26, 0.2763371467590332, 1.1629042434708299)


In [19]:
print(q_learning(prob_matrix, reward_matrix, discount=0.9, alpha_dec=[0.99, 0.999], alpha_min=[0.001, 0.0001],
            epsilon=[10.0, 1.0], epsilon_decay=[0.99, 0.999], n_iter=[1000000, 10000000]))

1: 1.049307917823982


KeyboardInterrupt: ignored