## Using Keras

### Data

TODO : Add intro to usecase


For now using the data used here : https://github.com/shinchan75034/LSTM_TouchPoint.  
        
Eventually would want to use data at http://archive.ics.uci.edu/ml/datasets/Online+Retail

TODO : Explain data structure

We will follow this sequence in this notebook

1. Execute the python script that builds and trains the LSTM model using Keras. 
2. Execute training in Amazon SageMaker's 'local' mode.
3. Execute training on a training cluster of ML instances managed by Amazon SageMaker
4. Deploy the trained model.
5. Execute inferences against the deployed model.

### Process the data

In [None]:
import boto3
import numpy as np
from sklearn.model_selection import train_test_split

#s3 = boto3.resource('s3') 
#s3.Bucket("555360056434-sagemaker-us-east-1").download_file('small_train_data_orig.txt', 'data.txt')

data_file = "data.txt"

# Read the raw records, one per line.  Blank lines (including the empty string
# that split('\n') leaves behind when the file ends with a newline) are dropped
# so they are neither counted as records nor shuffled into the data splits.
with open(data_file, 'r') as f:
    lines = [line for line in f.read().split('\n') if line.strip()]

# The file only needs to stay open for the read; everything else runs after it
# has been closed.
data = np.array(lines)
print("Data : ", data)

## Split data into training (60%), validation (20%) and test (20%) data sets.
# NOTE(review): no random_state is passed, so the split differs on every run.
train_dat, remained = train_test_split(data, test_size=0.4)
validation_dat, test_dat = train_test_split(remained, test_size=0.5)

# Verify training, validation and test data set sizes
print("\nTraining Data Size ", len(train_dat))
print("Validation Data Size ", len(validation_dat))
print("Test Data Size ", len(test_dat))

# Convert training, validation and test data sets to lists
train_lines = list(train_dat)
validation_lines = list(validation_dat)
test_lines = list(test_dat)

In [None]:
#Utility methods to create corpus, split lines into input & target and encode the data

#Create the corpus dictionary
def create_corpus_dict(word_list):
    """Map every word in ``word_list`` to its position in that list.

    When a word occurs more than once, the index of its last occurrence wins.
    """
    index_by_word = {}
    for position, token in enumerate(word_list):
        index_by_word[token] = position
    return index_by_word

#Split a given line into input and target
#TODO : Can this be made generic enough to use in the above cell to build corpus
def split_input_and_target(line_list):
    """Split tab-separated records into input texts and target texts.

    Each record is expected to have exactly three tab-separated fields.  The
    first field is ignored, the second is the input text and the third is the
    target text.  Every target text is wrapped as
    ``"<start> " + target + " <stop>"`` so the decoder sees explicit
    start/stop tokens.

    Records that do not have exactly three fields (for example blank lines)
    are skipped individually; they no longer stop processing of the records
    that follow them.

    Args:
        line_list: Iterable of record strings.

    Returns:
        A tuple ``(input_texts, target_texts)`` of two equally long lists.
    """
    input_texts = []
    target_texts = []

    for line in line_list:
        try:
            _, input_text, target_text = line.split('\t')
        except ValueError:
            # Wrong number of fields: skip only this record.
            continue

        input_texts.append(input_text)
        target_texts.append('<start>' + " " + target_text + " " + '<stop>')

    return input_texts, target_texts

## Method to encode data

def encode_data(input_texts, target_texts, input_vocab, target_vocab, input_corpus, target_corpus):
    """One-hot encode input and target texts for an encoder/decoder model.

    Texts are tokenised on whitespace.  All sequences are padded with zero
    vectors up to the longest sequence on their side.

    Args:
        input_texts: List of input strings.
        target_texts: List of target strings, paired with ``input_texts``.
        input_vocab: Collection of input tokens; only its size is used
            (width of the encoder one-hot vectors).
        target_vocab: Collection of target tokens; only its size is used
            (width of the decoder one-hot vectors).
        input_corpus: Mapping from input token to one-hot index.
        target_corpus: Mapping from target token to one-hot index.

    Returns:
        A tuple of three float32 arrays:
        ``encoder_input_data`` with shape
        ``(len(input_texts), max_encoder_seq_length, num_encoder_tokens)``,
        ``decoder_input_data`` and ``decoder_target_data`` with shape
        ``(len(input_texts), max_decoder_seq_length, num_decoder_tokens)``.
        ``decoder_target_data`` holds the same tokens as
        ``decoder_input_data`` shifted one timestep earlier, without the
        first token.  For empty input the sequence dimension is 0.

    Raises:
        KeyError: If a token is missing from the corresponding corpus mapping.
    """
    # Only the vocabulary sizes are needed, so no sorting is required.
    num_encoder_tokens = len(list(input_vocab))
    num_decoder_tokens = len(list(target_vocab))

    # Tokenise once and reuse the result for both sizing and encoding.
    input_tokens = [txt.split() for txt in input_texts]
    target_tokens = [txt.split() for txt in target_texts]

    # default=0 keeps empty input from raising "max() arg is an empty sequence".
    max_encoder_seq_length = max((len(tokens) for tokens in input_tokens), default=0)
    max_decoder_seq_length = max((len(tokens) for tokens in target_tokens), default=0)

    # Zero-filled arrays of the final size; ones are switched on below.
    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

    for i, (in_tokens, tgt_tokens) in enumerate(zip(input_tokens, target_tokens)):
        for t, word in enumerate(in_tokens):
            encoder_input_data[i, t, input_corpus[word]] = 1.
        for t, word in enumerate(tgt_tokens):
            decoder_input_data[i, t, target_corpus[word]] = 1.
            if t > 0:
                # decoder_target_data is ahead of decoder_input_data by one
                # timestep and does not include the first (start) token.
                decoder_target_data[i, t - 1, target_corpus[word]] = 1.

    return encoder_input_data, decoder_input_data, decoder_target_data

In [None]:
##Utility methods to upload to and read from S3 bucket.
import boto3
import numpy as np

# S3 resource handle shared by the upload/read helper functions in this cell.
s3 = boto3.resource('s3') 
# Download the raw data object from the bucket and store it locally as data.txt.
# NOTE(review): the first code cell opens data.txt (its own download call is
# commented out), so on a fresh environment this download presumably has to
# run before that cell -- confirm the intended execution order.
s3.Bucket("555360056434-sagemaker-us-east-1").download_file('small_train_data_orig.txt', 'data.txt')

##Save ndarray to file, upload to S3
def upload_ndarray_to_s3(encoder_input_data, s3_prefix, bucket_name="555360056434-sagemaker-us-east-1"):
    """Save an ndarray to a local ``.npy`` file and upload that file to S3.

    Args:
        encoder_input_data: Array to persist.  Despite the name, callers in
            this notebook also pass decoder input and target arrays.
        s3_prefix: Object key the file is uploaded to.
        bucket_name: Name of the destination bucket.  Defaults to the bucket
            used throughout this notebook.
    """
    # Scratch file; it is overwritten on every call.
    local_file = 'encoded_data.npy'
    np.save(local_file, encoder_input_data)
    s3.Bucket(bucket_name).upload_file(local_file, s3_prefix)
    
    
### Read ndarray from S3
def read_ndarray_from_s3(s3_prefix, bucket_name="555360056434-sagemaker-us-east-1"):
    """Download a ``.npy`` object from S3 and load it as an ndarray.

    Args:
        s3_prefix: Object key of the ``.npy`` file to download.
        bucket_name: Name of the source bucket.  Defaults to the bucket used
            throughout this notebook.

    Returns:
        The array loaded with ``np.load`` from the downloaded file.
    """
    # Scratch file; it is overwritten on every call.
    local_file_downloaded = 'downloaded_encoder_data.npy'
    s3.Bucket(bucket_name).download_file(s3_prefix, local_file_downloaded)
    return np.load(local_file_downloaded)

In [None]:
# Build the corpus (vocabulary and token -> index dictionary) from every raw
# record in `lines`, which the data-loading cell read from the data file.
input_texts = []
target_texts = []
input_words = set()
target_words = set()

for line in lines:
    try:
        # Each record has three tab-separated fields; the first one is ignored.
        _, input_text, target_text = line.split("\t")
    except ValueError:
        # Skip records without exactly three fields (e.g. blank lines).
        # Only this error is swallowed; anything else surfaces.
        continue

    # Wrap the target sequence in explicit <start> and <stop> tokens.
    target_text = '<start>' + " " + target_text + " " + '<stop>'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect the distinct whitespace-separated words on each side.
    input_words.update(input_text.split())
    target_words.update(target_text.split())


#Show what input_text or target_text looks like??

print("Number of input words ", len(input_words))
# The target side additionally contains the <start> and <stop> tokens.
print("Number of target words ", len(target_words))


# Build the vocabulary as the union of the input and target words.  It is
# sorted so that every token gets the same index in every kernel session; the
# iteration order of a set of strings is not stable across Python processes.
vocab = sorted(input_words | target_words)
print("Vocab size ", len(vocab))

corpus_dict = create_corpus_dict(vocab)
print("corpus_dict size ", len(corpus_dict))

# Split each set of lines into input and target separately.
train_input_texts, train_target_texts = split_input_and_target(train_lines)
#validation_input_texts, validation_target_texts  = split_input_and_target(validation_lines)
test_input_texts, test_target_texts = split_input_and_target(test_lines)

In [None]:
##Encode training data and persist to S3
# The combined vocabulary and corpus dictionary are passed for both the input
# side and the target side.
train_encoder_input_data, train_decoder_input_data, train_decoder_target_data = encode_data(train_input_texts,train_target_texts, vocab,vocab, corpus_dict, corpus_dict)

# Each array is uploaded under its own object key below train/.
upload_ndarray_to_s3(train_encoder_input_data, "train/train_encoder_input_data.npy")
upload_ndarray_to_s3(train_decoder_input_data, "train/train_decoder_input_data.npy")
upload_ndarray_to_s3(train_decoder_target_data, "train/train_decoder_target_data.npy")

In [None]:
# Display the encoded training encoder input as a quick visual check.
train_encoder_input_data

In [None]:
##Encode test data and persist to S3
# Same vocabulary and corpus dictionary as for the training data, so token
# indices agree between the two data sets.
test_encoder_input_data, test_decoder_input_data, test_decoder_target_data = encode_data(test_input_texts,test_target_texts, vocab,vocab, corpus_dict, corpus_dict)

# Each array is uploaded under its own object key below test/.
upload_ndarray_to_s3(test_encoder_input_data, "test/test_encoder_input_data.npy")
upload_ndarray_to_s3(test_decoder_input_data, "test/test_decoder_input_data.npy")
upload_ndarray_to_s3(test_decoder_target_data, "test/test_decoder_target_data.npy")

### First, execute the python script locally to test it out
In this step we can test for syntax and overall script flow.

In [None]:
#Delete this directory if the model is already persisted in it.
!rm -rf /tmp/model/1

In [None]:
!python customer_event_prediction_lstm_keras_model.py --epochs 2 --batchsize 64 --modeldir '/tmp'

### Next train using the local mode

In [None]:
import sagemaker
import time

# SageMaker session object.
# NOTE(review): `sess` is not referenced by any later cell visible in this notebook.
sess = sagemaker.Session()
# Execution role; passed as `role` to the TensorFlow estimators created below.
role = sagemaker.get_execution_role()

In [None]:
from sagemaker.tensorflow import TensorFlow

# Estimator that runs the training script in 'local' mode
# (train_instance_type='local') for a single epoch.
# NOTE(review): train_instance_count / train_instance_type look like the
# SageMaker Python SDK v1 argument names -- confirm against the installed SDK.
tf_estimator_local = TensorFlow(entry_point='customer_event_prediction_lstm_keras_model.py', 
                          role=role,
                          train_instance_count=1, 
                          train_instance_type='local',
                          framework_version='1.12', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={ 'epochs': 1 }
                         )

In [None]:
# Start the local-mode training run; no input data channels are passed.
tf_estimator_local.fit() 

### Train on a training cluster

In [None]:
# Estimator that runs the same training script on one ml.m5.xlarge instance.
# NOTE(review): the direct script run earlier in this notebook passes
# --batchsize and --modeldir, while the hyperparameter keys here are
# 'batch-size' and 'learning-rate' -- confirm that the script's argument
# parser accepts these names.
tf_estimator_on_cluster = TensorFlow(entry_point='customer_event_prediction_lstm_keras_model.py', 
                          role=role,
                          train_instance_count=1, 
                          train_instance_type='ml.m5.xlarge',
                          framework_version='1.12', 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 20,
                              'batch-size': 256,
                              'learning-rate': 0.01}
                         )

In [None]:
# Start the training job on the cluster; no input data channels are passed.
tf_estimator_on_cluster.fit()

In [None]:
## Deploy the model trained on the cluster.
## This takes approximately XXX minutes

# The endpoint name is a fixed prefix followed directly (no separator) by the
# current UTC timestamp, so each deployment gets a distinct name.
tf_endpoint_name = 'customer-event-prediction-lstm'+time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Deploy to a single ml.c5.large instance; the returned predictor is used by
# the prediction cells below.
tf_predictor = tf_estimator_on_cluster.deploy(initial_instance_count=1,
                         instance_type='ml.c5.large',        
                         endpoint_name=tf_endpoint_name)

### Now let's use the deployed model for predictions

In [None]:
##Method to predict
stop_word_list = ['<start>', '<stop>']

def predict(prediction_index):
    """Return the decoded prediction for one encoded test example.

    Sends the encoder and decoder input at ``prediction_index`` to the
    deployed endpoint, takes the highest-scoring token index at every
    position, maps the indices back to tokens through ``corpus_dict`` and
    returns the first decoded sequence with the ``stop_word_list`` tokens
    removed.
    """
    request_payload = {
        'encoder_input_data': test_encoder_input_data[prediction_index],
        'decoder_input_data': test_decoder_input_data[prediction_index],
    }
    response = tf_predictor.predict(request_payload)

    # Highest-scoring token index along the last axis of the returned scores.
    token_scores = np.asarray(response['predictions'])
    best_token_ids = token_scores.argmax(axis=2)

    # Invert the corpus dictionary (token -> index) so that indices can be
    # translated back into readable tokens.
    token_by_index = {index: token for token, index in corpus_dict.items()}

    decoded_rows = np.vectorize(token_by_index.get)(best_token_ids).tolist()
    first_row = decoded_rows[0]

    # Drop the <start> / <stop> marker tokens from the decoded sequence.
    kept_tokens = []
    for token in first_row:
        if token in stop_word_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
##Predict output for a single element (represented by the index) in the text target 
# Index 0 selects the first encoded test example.
predicted_sequence = predict(0)
print("predicted sequence ", predicted_sequence)

In [None]:
## Now predict for multiple elements in the text target

# One endpoint call per encoded test example.  len() of the array gives the
# number of examples directly; converting the whole array to nested lists
# just to count its rows is unnecessary.
all_predictions = []
for i in range(len(test_encoder_input_data)):
    predicted_sequence = predict(i)
    all_predictions.append(predicted_sequence)

print("Total number of predictions ", len(all_predictions))
        

In [None]:
print("Total test target_texts : ", len(test_target_texts))

# Count the examples where both the prediction and the expected target contain
# "visit".  Iterating over the paired sequences (instead of a hard-coded
# range of 9999) covers every example and cannot index past the end when the
# test set is smaller.
# NOTE(review): the prediction check is list membership (exact token), while
# the target check is a substring test on the target string -- confirm
# whether both should be token-level.
matching_prediction_count = 0

for predicted_tokens, target_text in zip(all_predictions, test_target_texts):
    if "visit" in predicted_tokens and "visit" in target_text:
        matching_prediction_count = matching_prediction_count + 1

print("matching_prediction_count : ", matching_prediction_count)

In [None]:
## TODO : Confusion matrix calculations

### TODO : Delete the endpoint