In [1]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
## Copy the SC_RL folder from My Drive into "/content" so that the notebooks
## it contains can be imported as modules.
!cp -r /content/drive/My\ Drive/SC_RL /content

In [3]:
## Uncomment to delete the local SC_RL copy (e.g. before re-copying it from Drive).
#rm -rf SC_RL/

In [4]:
!pip install import_ipynb

Collecting import_ipynb
  Downloading https://files.pythonhosted.org/packages/63/35/495e0021bfdcc924c7cdec4e9fbb87c88dd03b9b9b22419444dc370c8a45/import-ipynb-0.1.3.tar.gz
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25l[?25hdone
  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-cp36-none-any.whl size=2976 sha256=6e5fc51fb74fdc26d3298863b88f09237fc0941e7d91be45687756da9b3d6e2f
  Stored in directory: /root/.cache/pip/wheels/b4/7b/e9/a3a6e496115dffdb4e3085d0ae39ffe8a814eacc44bbf494b5
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3


In [5]:
## Import libraries.
import os
from pathlib import Path

import import_ipynb  # must be imported before any notebook module (enables .ipynb imports)
import numpy as np
import sklearn
import sklearn.preprocessing
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense

from SC_RL.Environments import warehouse_store

importing Jupyter notebook from /content/SC_RL/Environments/warehouse_store.ipynb


In [6]:
## Input workbooks, resolved relative to the current working directory.
data_dir = Path.cwd() / "SC_RL" / "data" / "instacart-market-basket-analysis"
metadata_file = data_dir / "products_metadata.xlsx"
forecast_data = data_dir / "scenarios.xlsx"

## Simulation environment shared by the cells below.
w = warehouse_store.warehouse_store()

## Number of products (size of the actor output and of the product part of the state).
num_products = 10
## Presumably the lower/upper bound on a product quantity -- TODO confirm.
min_products = 0
min_produts = min_products  ## original (misspelled) name, kept as an alias
max_products = 20 ## Double check.

In [None]:
def reward_function(states, actions, n_products=None):
  '''
  Reward for one timestep: inventory should stay stocked while restocking and
  wastage are kept small.

      reward = 1 - (quantity restocked + quantity expired) / total quantity in stock

  Args:
    states: 2-D array of which column 0 is read. Rows [0, n_products) are summed
      as the total quantity in stock; row n_products+1 is read as the expired
      quantity.
    actions: array-like of restocked quantities; all entries are summed.
    n_products: number of product rows in `states`. Defaults to the
      notebook-level `num_products`.

  Returns:
    The scalar reward. It is always finite for finite inputs: when the stock
    total is zero the ratio is undefined, so a denominator of 1 is used instead
    of dividing by zero (which previously produced -inf / nan rewards).
  '''
  if n_products is None:
    n_products = num_products
  states = np.asarray(states)

  p_restocked = np.sum(actions)
  q_max = np.sum(states[0:n_products, 0])
  ## NOTE(review): this reads row n_products+1, so `states` needs at least
  ## n_products+2 rows -- confirm against the state layout returned by the
  ## environment (an (n_products+1)-row state would make this out of range).
  q_expired = states[n_products + 1, 0]

  ## Empty inventory: fall back to a unit denominator so the penalty stays finite.
  denominator = q_max if q_max > 0 else 1.0
  reward = 1 - (p_restocked + q_expired) / denominator
  return reward

In [None]:
## Test passing a function as argument here:
# total_reward = w.simulate(metadata_file,forecast_data,reward_function) # Need not initialize; just simulate.
# print(total_reward)

-inf


  # This is added back by InteractiveShellApp.init_path()


## **Actor-Critic**

In [13]:
## Parameters:
## Number of passes of the outer training loop.
episodes = 100
gamma = 0.9           # reward discount in TD error
lr_actor = 5e-6       # learning rate for actor
lr_critic = 5e-3      # learning rate for critic
## NOTE(review): the training cell does not pass lr_actor / lr_critic to Agent,
## so these two values are currently unused -- confirm intent.

In [14]:
class Critic(tf.keras.Model):
  '''Value network: ReLU hidden layers of 40 and 20 units, then one linear output unit.'''

  def __init__(self):
    super().__init__()
    layers = tf.keras.layers
    self.d1 = layers.Dense(40, activation='relu')
    self.d2 = layers.Dense(20, activation='relu')
    self.v = layers.Dense(1, activation=None)

  def call(self, input_data):
    '''Pass `input_data` through both hidden layers and return the output layer's result.'''
    hidden = self.d2(self.d1(input_data))
    return self.v(hidden)
    

class Actor(tf.keras.Model):
  '''Policy network: ReLU hidden layers of 40 and 20 units, then one ReLU output per product.'''

  def __init__(self):
    super().__init__()
    layers = tf.keras.layers
    self.d1 = layers.Dense(40, activation='relu')
    self.d2 = layers.Dense(20, activation='relu')
    ## Output width follows the notebook-level `num_products`.
    self.a = layers.Dense(num_products, activation='relu')

  def call(self, input_data):
    '''Pass `input_data` through both hidden layers and return the output layer's result.'''
    hidden = self.d2(self.d1(input_data))
    return self.a(hidden)

In [15]:
class Agent():
  '''
  One-step (TD) actor-critic agent with a Gaussian policy.

  The actor output is used as the mean of an independent Normal distribution
  per output unit (fixed standard deviation `action_std`). The same
  distribution is used both to sample actions (`act`) and to score them
  (`actor_loss`), so the policy gradient matches the behaviour policy.

  Args:
    gamma: discount applied to the next-state value in the TD error.
    lr_actor: Adam learning rate for the actor.
    lr_critic: Adam learning rate for the critic.
    action_std: standard deviation of the Normal around the actor output.
  '''
  def __init__(self, gamma = 0.99, lr_actor = 5e-6, lr_critic = 5e-6, action_std = 1.0):
    self.gamma = gamma
    self.action_std = action_std
    self.a_opt = tf.keras.optimizers.Adam(learning_rate=lr_actor)
    self.c_opt = tf.keras.optimizers.Adam(learning_rate=lr_critic)
    self.actor = Actor()
    self.critic = Critic()

  @staticmethod
  def _as_batch(state):
    '''Flatten one state (flat or column vector) into a float32 batch of shape (1, n_features).'''
    return np.asarray(state, dtype=np.float32).reshape(1, -1)

  def _policy(self, mean):
    '''Normal distribution centred on the actor output `mean`.'''
    return tfp.distributions.Normal(loc=mean, scale=self.action_std)

  def act(self,state):
    '''
    Sample an action for `state` from the current policy.

    Returns:
      Integer array of shape (n_outputs, 1): one non-negative whole quantity per
      actor output (samples are rounded and clipped at zero -- assumes actions
      are item counts; TODO confirm against the environment).
    '''
    mean = self.actor(self._as_batch(state))
    sampled = self._policy(mean).sample().numpy()[0]
    return np.maximum(np.rint(sampled), 0).astype(int).reshape(-1, 1)

  def actor_loss(self, prob, action, td):
    '''
    Policy-gradient loss: -log pi(action | state) * td.

    Args:
      prob: actor output for the state, shape (1, n_outputs); used as the policy mean.
      action: the action that was taken; any shape with n_outputs entries.
      td: TD error, used as a constant weight (no gradient flows through it here).
    '''
    action = tf.reshape(tf.cast(action, prob.dtype), tf.shape(prob))
    ## Independent per-output Normals: the joint log-probability is the sum.
    log_prob = tf.reduce_sum(self._policy(prob).log_prob(action), axis=-1, keepdims=True)
    loss = -log_prob*tf.stop_gradient(td)
    return loss

  def learn(self, state, action, reward, next_state, done):
    '''
    Run one TD update of the actor and the critic on a single transition.

    Returns:
      (a_loss, c_loss): the actor and critic loss tensors for this transition.
    '''
    ## Flatten to (1, n_features); wrapping a column-vector state in a list would
    ## give (1, N, 1) and make the Dense layers act on a size-1 last axis.
    state = self._as_batch(state)
    next_state = self._as_batch(next_state)
    with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
      p = self.actor(state, training=True)
      v =  self.critic(state,training=True)
      vn = self.critic(next_state, training=True)
      ## Match the network dtype so a NumPy float64 reward cannot clash with float32 tensors.
      reward = tf.cast(reward, v.dtype)
      ## No bootstrapping from the next state once the episode has ended.
      td = reward + self.gamma*vn*(1.0-float(done)) - v
      a_loss = self.actor_loss(p, action, td)
      c_loss = td**2
    grads1 = tape1.gradient(a_loss, self.actor.trainable_variables)
    grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
    self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
    self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
    return a_loss, c_loss

In [None]:
## Normalize state-space:
## Fit a StandardScaler on 10000 states sampled from env.observation_space;
## scale_state() below applies the fitted scaler to a single state.
## NOTE(review): no `env` is defined in the visible cells (the simulator is
## bound to `w`) -- confirm where the samples should come from before running.
state_space_samples = np.array(
    [env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)

def scale_state(state):
    '''
    Standardize a single state with the fitted notebook-level `scaler`.

    Args:
      state: one state, given either as a flat vector or as a column vector;
        it is flattened into a single sample row before scaling (the scaler
        only accepts 2-D input).

    Returns:
      Array of shape (1, n_features) holding the scaled state.
    '''
    sample = np.asarray(state).reshape(1, -1)
    return scaler.transform(sample)

In [18]:
## Training:
## Use the discount configured in the parameters cell instead of Agent's default.
scrl = Agent(gamma=gamma)
tot_reward_vs_episode = []
for ep in range(episodes):
  total_reward_per_episode = 0
  states = w.reset(metadata_file, forecast_data)
  ## get_demand() returns a pre-determined demand of each product for timestep.
  demand = w.get_demand()
  all_aloss = []
  all_closs = []
  ## Assumes timesteps are 0-based -- TODO confirm against warehouse_store.step().
  current_timestep = 0
  done = False
  while not done and current_timestep <= w.simulation_duration:
    ## Placeholder: uniformly random restock quantities, one per product.
    ## TODO: sample from the current policy instead -> action = scrl.act(states)
    action = np.random.randint(0, 5, (num_products, 1))
    ## Execute action and observe reward & next state from E.
    ## NOTE(review): the action is flattened to 1-D for the environment
    ## (squeezing axis 0 of a (num_products, 1) array is not possible) --
    ## confirm the shape warehouse_store.step() expects.
    next_state, reward, done = w.step(np.ravel(action), demand, current_timestep)
    aloss, closs = scrl.learn(states, action, reward, next_state, done)
    all_aloss.append(aloss)
    all_closs.append(closs)
    current_timestep += 1
    total_reward_per_episode += reward
    ## Advance: the observed next state becomes the current state.
    states = next_state

  tot_reward_vs_episode.append(total_reward_per_episode)
  ## TODO: plot reward here.

[[10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]
 [ 0.]]


IndexError: ignored