# Deep Optimal Stopping

Implementing the numerical solver for optimal stopping problems based on the article 
"Deep Optimal Stopping" - Sebastian Becker, Patrick Cheridito, Arnulf Jentzen 
(available at: https://arxiv.org/pdf/1804.05394.pdf)

#### The problem:
Roll a die once. You are offered the dollar amount the die shows. At this stage you are free to choose to take the money, or roll the die again (in which case no money is paid at this stage). Example: Say you rolled a 4, but decided to go on

If you chose to roll again. After the second toss, the dollar amount the die shows is offered you again. You are free to choose to take the money, or toss the die again. Example: Now you rolled a 3, but decided to go on

Provided you proceeded to the third round. The die is cast and you will have to take the dollar amount the die shows after the third roll, i.e. no more choice to stop. Example: You rolled a 5, the game terminates

#### Question: How much are you willing to pay to participate in this game? In other words: what’s the fair value of this game?

In [1]:
import tensorflow as tf
import numpy as np 
import scipy
tf.__version__

'1.8.0'

### Creating a sample used for training

The data set will be a numpy array of the size $M\times 3$, where each line represents one sample, i.e. $3$ tosses of a die, and $M$ is the number of samples. 

In [2]:
# define sample size for training 
M = 10000

# create a sample of M x 3
# In this example we only consider 3 tosses, hence only two steps with choices to stop at
dice = np.random.randint(low=1, high=7, size=(M, 3))

In [3]:
# see the first 10 paths from the samples generated above
print("The first 10 samples: ")
print(dice[:10, :])

The first 10 samples: 
[[4 5 3]
 [6 5 1]
 [2 4 5]
 [5 1 5]
 [4 3 3]
 [1 3 2]
 [1 1 3]
 [1 4 5]
 [3 1 3]
 [3 4 6]]


In [4]:
'''
 define a function that slices batches from the samples 
          --- Variables: 
              - i : the i-th batch from the sample 
              - time : this is needed for the training, 
                       this is a variable that picks the right column from the sample 
              - batch_size : batch size; i.e. number of samples in one batch 
              - paths : matrix or tensor that contains the samples 
                        for the dice case, this is the numpy array of size Mx3 named dice
          --- Output: 
              - returns the required batch in the right format for training 
'''

def makeBatches(i, time, batch_size,  paths):
    start_pos = i*batch_size
    end_pos = (i+1)*batch_size
    if end_pos > len(paths[:, 0]):
        return np.reshape(paths[start_pos: , time], (len(paths[start_pos: , time]),1)) 
    else:
        return np.reshape(paths[start_pos:end_pos, time], (batch_size, 1))
    

In [5]:
# -------------------------------------------------
# defining hyperparameters for the feed forward NNs
# -------------------------------------------------

learning_rate = 0.001
training_epochs = 25
batch_size = 64

n_input = 1
# hidden layer size
n_hidden_layer = 51 # number of nodes/size of hidden layer. in the paper, the choice is d+50, hence 51. 


### Building the computational graph

In [6]:
'''
    Building the graph structure with tf.Graph()
'''

dummyGraph = tf.Graph()


with dummyGraph.as_default():
   
    with tf.variable_scope('2nd'):
    
        # weights and biases 
        weights = {
            'hidden_layer_1': tf.Variable(tf.random_normal([n_input, n_hidden_layer], mean=0.0, stddev=0.01)), 
            'hidden_layer_2': tf.Variable(tf.random_normal([n_hidden_layer, n_hidden_layer], mean=0.0, stddev=0.01)),
            'out': tf.Variable(tf.random_normal([n_hidden_layer, 1], mean=0.0, stddev=0.01))
            }

        biases = {
            'hidden_layer_1': tf.Variable(tf.random_normal([n_hidden_layer], mean=0.0, stddev=0.01)),
            'hidden_layer_2': tf.Variable(tf.random_normal([n_hidden_layer], mean=0.0, stddev=0.01)),
            'out': tf.Variable(tf.random_normal([], mean=0.0, stddev=0.01))
        }
    
        # tf graph input 
        x = tf.placeholder("float", [None, 1])
        x_prev = tf.placeholder("float", [None, 1])

        x_flat = tf.reshape(x, [-1, n_input])
        
        # Hidden layer with RELU activation
        layer_1 = tf.add(tf.matmul(x_flat, weights['hidden_layer_1']),biases['hidden_layer_1'])
        layer_1 = tf.nn.relu(layer_1)

        layer_2 = tf.add(tf.matmul(layer_1, weights['hidden_layer_2']), biases['hidden_layer_2'])
        layer_2 = tf.nn.relu(layer_2)

        # Output layer with linear activation
        logits = {'logits': tf.add(tf.matmul(layer_2, weights['out']), biases['out'])}
        F_theta = tf.nn.sigmoid(logits['logits'])

        # Define reward and optimizer
        one = tf.constant(1, dtype=tf.float32)
        reward = (tf.multiply(F_theta, x) + tf.multiply((one-F_theta), x_prev))
        rAvg = tf.reduce_mean(reward) 

        cost = tf.scalar_mul(-1,rAvg)
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        opt_operation = opt.minimize(cost)
        
    # assemble tau for the next network, i.e. tau that describes the stopping times, which can be
    # either 1 or 2
    arg_1 = tf.placeholder("float", [None, 1])
    f_theta_1 = tf.cast(tf.clip_by_value(tf.sign(arg_1), 0, 5), dtype=tf.int32)
    tau_1 = 1*f_theta_1 + 2*(1-f_theta_1)
        
    
    with tf.variable_scope('1st'):
    
        # weights and biases 
        weights = {
            'hidden_layer_1': tf.Variable(tf.random_normal([n_input, n_hidden_layer], mean=0.0, stddev=0.001)), 
            'hidden_layer_2': tf.Variable(tf.random_normal([n_hidden_layer, n_hidden_layer], mean=0.0, stddev=0.001)),
            'out': tf.Variable(tf.random_normal([n_hidden_layer, 1], mean=0.0, stddev=0.001))
            }

        biases = {
            'hidden_layer_1': tf.Variable(tf.random_normal([n_hidden_layer], mean=0.0, stddev=0.001)),
            'hidden_layer_2': tf.Variable(tf.random_normal([n_hidden_layer], mean=0.0, stddev=0.001)),
            'out': tf.Variable(tf.random_normal([], mean=0.0, stddev=0.001))
        }
    
        # tf graph input 
        x = tf.placeholder("float", [None, 1])
        x_prev = tf.placeholder("float", [None, 1])

        x_flat = tf.reshape(x, [-1, n_input])
        
        # Hidden layer with RELU activation
        layer_1 = tf.add(tf.matmul(x_flat, weights['hidden_layer_1']),biases['hidden_layer_1'])
        layer_1 = tf.nn.relu(layer_1)

        layer_2 = tf.add(tf.matmul(layer_1, weights['hidden_layer_2']), biases['hidden_layer_2'])
        layer_2 = tf.nn.relu(layer_2)

        # Output layer with linear activation
        logits = {'logits': tf.add(tf.matmul(layer_2, weights['out']), biases['out'])}
        F_theta = tf.nn.sigmoid(logits['logits'])

        # Define reward and optimizer
        one = tf.constant(1, dtype=tf.float32)
        reward = (tf.multiply(F_theta, x) + tf.multiply((one-F_theta), x_prev))
        rAvg = tf.reduce_mean(reward) 

        cost = tf.scalar_mul(-1,rAvg)
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        opt_operation = opt.minimize(cost)
    
    # assemble tau that describes the stopping time, which can be 0, 1, or 2. In other words, this is the 
    # stopping time for the entire time span of the model 
    arg_0 = tf.placeholder("float", [None, 1])
    f_theta_0 = tf.cast(tf.clip_by_value(tf.sign(arg_0), 0, 5), dtype=tf.int32)
    tau_0 = 0*f_theta_0 + 1*f_theta_1*(1-f_theta_0)+2*(1-f_theta_0)*(1-f_theta_1)
    
    
    saver = tf.train.Saver()

### Training the model

In [7]:
with tf.Session(graph=dummyGraph) as sess: 
    sess.run(tf.global_variables_initializer())
    
    # Training cycle
    for epoch in range(training_epochs):
        total_batch = int(len(dice[:, 0])/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_x = makeBatches(i, 1, batch_size,  dice) # getting the correct input slices (batches)
            batch_x_prev = makeBatches(i, 2, batch_size,  dice) # getting the correct input slices (batches)
            # Run optimization op (backprop) and cost op (to get loss value)
            loss_val = sess.run(dummyGraph.get_operation_by_name("2nd/Adam"), 
                                feed_dict={dummyGraph.get_tensor_by_name("2nd/Placeholder:0"): batch_x, 
                                           dummyGraph.get_tensor_by_name("2nd/Placeholder_1:0"): batch_x_prev})
            
            
            if i%100==0:
                w_value = sess.run(dummyGraph.get_tensor_by_name("2nd/mul:0"), 
                                   feed_dict={dummyGraph.get_tensor_by_name("2nd/Placeholder:0"): batch_x, 
                                           dummyGraph.get_tensor_by_name("2nd/Placeholder_1:0"): batch_x_prev})
                print("cost: {b:.7f}".format(b=w_value))
                #print(sess.run(F_theta[:5,:], feed_dict={x: batch_x, x_prev: batch_x_prev}))
                #print("weights")
                #print(sess.run(weights['hidden_layer_1']))
        print("epoch: %i " %epoch)
    
    
    
    
    
    h = np.reshape(dice[:,1].astype(np.float32), [M,1])
    h_ = sess.run(dummyGraph.get_tensor_by_name("2nd/Add_2:0"), 
                  feed_dict={dummyGraph.get_tensor_by_name("2nd/Placeholder:0"): h}).astype(np.int32)
    tau_test = sess.run(dummyGraph.get_tensor_by_name("add:0"), 
                         feed_dict={arg_1: h_}).astype(np.int32)
    
    dice_tau = np.array([dice[x, tau_test[x,0]] for x in range(len(tau_test[:,0]))])
    dice_tau = np.reshape(dice_tau, [len(dice[:,1]), 1]) 
    
    for epoch_ in range(training_epochs):
        total_batch_ = int(len(dice[:, 0])/batch_size)
        # Loop over all batches
        for i in range(total_batch_):
            batch_x0 = makeBatches(i, 0, batch_size,  dice) # getting the correct input slices (batches)
            batch_x_prev0 = makeBatches(i, 0, batch_size,  dice_tau) # getting the correct input slices (batches)
            # Run optimization op (backprop) and cost op (to get loss value)
            loss_val = sess.run(dummyGraph.get_operation_by_name("1st/Adam"), 
                                feed_dict={dummyGraph.get_tensor_by_name("1st/Placeholder:0"): batch_x0, 
                                           dummyGraph.get_tensor_by_name("1st/Placeholder_1:0"): batch_x_prev0})
            
            
            if i%100==0:
                w_value_ = sess.run(dummyGraph.get_tensor_by_name("1st/mul:0"), 
                                   feed_dict={dummyGraph.get_tensor_by_name("1st/Placeholder:0"): batch_x0, 
                                           dummyGraph.get_tensor_by_name("1st/Placeholder_1:0"): batch_x_prev0})
                print("cost: {b:.7f}".format(b=w_value_))
                #print(sess.run(F_theta[:5,:], feed_dict={x: batch_x, x_prev: batch_x_prev}))
                #print("weights")
                #print(sess.run(weights['hidden_layer_1']))
        print("epoch: %i " %epoch)
    
    
    
    g = np.reshape(dice[:,0].astype(np.float32), [M,1])
    g_ = sess.run(dummyGraph.get_tensor_by_name("1st/Add_2:0"), 
                  feed_dict={dummyGraph.get_tensor_by_name("1st/Placeholder:0"): g}).astype(np.int32)
    
    
    tau_test_2 = sess.run(dummyGraph.get_tensor_by_name("add_2:0"), 
                         feed_dict={arg_0: g_, arg_1: h_}).astype(np.int32)
    
    d_ = np.array([dice[x, tau_test_2[x,0]] for x in range(len(tau_test_2[:,0]))])
    d_ = np.reshape(d_, [len(dice[:,1]), 1]) 
    
    saver.save(sess, './train_model_prec.ckpt')
    print("The trained model is saved")

cost: -3.5785875
cost: -3.8937650
epoch: 0 
cost: -4.1967888
cost: -4.3118210
epoch: 1 
cost: -4.3134398
cost: -4.3452549
epoch: 2 
cost: -4.3278174
cost: -4.3582611
epoch: 3 
cost: -4.3350372
cost: -4.3649092
epoch: 4 
cost: -4.3387208
cost: -4.3685231
epoch: 5 
cost: -4.3406010
cost: -4.3706074
epoch: 6 
cost: -4.3416348
cost: -4.3718815
epoch: 7 
cost: -4.3422527
cost: -4.3727016
epoch: 8 
cost: -4.3426485
cost: -4.3732543
epoch: 9 
cost: -4.3429146
cost: -4.3736401
epoch: 10 
cost: -4.3431005
cost: -4.3739185
epoch: 11 
cost: -4.3432355
cost: -4.3741255
epoch: 12 
cost: -4.3433347
cost: -4.3742819
epoch: 13 
cost: -4.3434105
cost: -4.3744035
epoch: 14 
cost: -4.3434677
cost: -4.3744993
epoch: 15 
cost: -4.3435140
cost: -4.3745761
epoch: 16 
cost: -4.3435502
cost: -4.3746386
epoch: 17 
cost: -4.3435798
cost: -4.3746896
epoch: 18 
cost: -4.3436036
cost: -4.3747320
epoch: 19 
cost: -4.3436236
cost: -4.3747673
epoch: 20 
cost: -4.3436403
cost: -4.3747969
epoch: 21 
cost: -4.3436546
cos

In [8]:
# printing some parts of the result

# the trained stopping times that indicates where we should stop
print("The trained stopping time - tau_0: ")
print(tau_test_2[10:25])

# the relevant part of the training sample
print("The relevant part of the input sample - dice")
print("corresponding to the trained stopping time")
print(dice[10:25])

# the value on the die that's picked by the trained stopping time 
print("The value on the die at stopping")
print(d_[10:25])

The trained stopping time - tau_0: 
[[0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [1]
 [1]]
The relevant part of the input sample - dice
corresponding to the trained stopping time
[[6 5 4]
 [4 6 6]
 [6 2 1]
 [1 6 6]
 [5 3 3]
 [6 5 5]
 [3 2 2]
 [4 4 5]
 [6 2 1]
 [4 6 2]
 [3 2 2]
 [1 4 4]
 [6 1 5]
 [1 4 1]
 [3 5 6]]
The value on the die at stopping
[[6]
 [6]
 [6]
 [6]
 [5]
 [6]
 [2]
 [4]
 [6]
 [6]
 [2]
 [4]
 [6]
 [4]
 [5]]


In [9]:
# -------------------------------------------
# Price - evaluation on a separate sample set
# -------------------------------------------

# create a sample of M x 3 for evaluation
dice_eval = np.random.randint(low=1, high=7, size=(M, 3))

In [10]:
# Launch the graph and evaluate the trained stopping time on the evaluation sample 
with tf.Session(graph=dummyGraph) as sess:
    saver.restore(sess, './train_model_prec.ckpt')
    
    h = np.reshape(dice_eval[:,1].astype(np.float32), [len(dice[:,0]),1])
    h_ = sess.run(dummyGraph.get_tensor_by_name("2nd/Add_2:0"), 
                  feed_dict={dummyGraph.get_tensor_by_name("2nd/Placeholder:0"): h}).astype(np.int32)
    
    g = np.reshape(dice_eval[:,0].astype(np.float32), [len(dice[:,0]),1])
    g_ = sess.run(dummyGraph.get_tensor_by_name("1st/Add_2:0"), 
                  feed_dict={dummyGraph.get_tensor_by_name("1st/Placeholder:0"): g}).astype(np.int32)
    
    tau_eval = sess.run(dummyGraph.get_tensor_by_name("add_2:0"), 
                         feed_dict={arg_0: g_, arg_1: h_}).astype(np.int32)
    
    d_ = np.array([dice_eval[x, tau_eval[x,0]] for x in range(len(tau_eval[:,0]))])
    d_ = np.reshape(d_, [len(dice_eval[:,1]), 1]) 
    
    Value = np.mean(d_)
    print("The fair value of the game: %f" %Value)

INFO:tensorflow:Restoring parameters from ./train_model_prec.ckpt
The fair value of the game: 4.668600


The result is fairly stable in the sense that with the above setting, the pricing should be around $4.6\dots$, depending on the samples and training. It can be shown that the analytical solution, or fair price of the game is $28/6 \approx 4.66$. Hence the numerical method produces a reasonably close result that matches to the first decimal. 