In [38]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Location of the training corpus.
url = "https://raw.githubusercontent.com/gheniabla/datasets/master/enjoy.txt"
# Read the file as tab-separated text. header=None keeps the first line as
# data instead of consuming it as column labels; lineterminator is left unset
# so pandas applies its default line-break handling.
dataset = pd.read_csv(url, sep='\t', header=None)
# Iterating a DataFrame yields its column labels, not its cells, so collect
# the non-missing cell contents explicitly (row by row) as the texts to encode.
texts = [str(cell) for cell in dataset.to_numpy().ravel() if pd.notna(cell)]
#------------#1--------------#
# Tokenize the data set, keeping at most the 8000 most frequent words.
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(texts)
#------------#2--------------#
# Word indexes are assigned from the most used word to the least used word.
word_index = tokenizer.word_index
# Print the tokenized word indexes.
print("----#2----")
print(word_index, "\n")
#------------#3--------------#
# Encode the training sentences into sequences of word indexes.
sequences = tokenizer.texts_to_sequences(texts)
print("----#3----")
print(word_index)
print(sequences)
#------------#4--------------#
# Get the maximum training sequence length. The sequences have different
# lengths, so measure them as plain Python lists (a ragged list cannot be
# converted to a dense tensor); default=0 covers an empty corpus.
max_len = max((len(seq) for seq in sequences), default=0)
# Keep `length` as an int32 tensor, as it was before.
length = tf.constant(max_len, dtype=tf.int32)
print("\n----#4----")
print(length)
#------------#5--------------#
# Use post padding on the training sequences. The padded width stays at 1000
# so the shape of `padded` is unchanged for any code that consumes it.
MAX_PADDED_LEN = 1000
padded = pad_sequences(sequences, padding='post', maxlen=MAX_PADDED_LEN)
print("\n----#5----")
print(padded)
#------------#6--------------#
# Test data: three short sentences to tokenize and pad.
sentences = [
    'Enjoy coffee this morning.',
    'I enjoy going to the supermarket.',
    'Want some milk for your coffee?',
]
# NOTE(review): a fresh Tokenizer is fitted on the test sentences here, which
# rebinds `tokenizer` and `word_index`; the indexes below therefore come from
# these three sentences only.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the test sentences, then pad them. No padding argument is given, so
# the pad_sequences defaults apply.
test_sequences = tokenizer.texts_to_sequences(sentences)
newpadded = pad_sequences(test_sequences)

# Report the word index of the test sentences.
print("\n----#6----", word_index, sep="\n")
#------------#7--------------#
# Report the encoded test sequences, followed by their padded form.
print("\n----#7----", test_sequences, sep="\n")
print("\n----#7----", newpadded, sep="\n")

----#2----
{'enjoy': 1, 'the': 2, 'i': 3, 'you': 4, 'to': 5, 'of': 6, 'and': 7, 'did': 8, 'in': 9, 'your': 10, 'a': 11, 'it': 12, 'he': 13, 'my': 14, 'they': 15, 'do': 16, 'we': 17, 'life': 18, 'not': 19, 'yourself': 20, 'with': 21, 'this': 22, 'at': 23, 'what': 24, 'his': 25, 'good': 26, 'health': 27, 'she': 28, 'us': 29, 'hope': 30, 'time': 31, 'family': 32, 'can': 33, 'people': 34, 'children': 35, 'let': 36, 'party': 37, 'watching': 38, 'sure': 39, 'will': 40, 'playing': 41, 'evening': 42, 'cannot': 43, 'all': 44, 'always': 45, 'don’t': 46, 'swing': 47, 'stay': 48, 'here': 49, 'themselves': 50, 'there': 51, 'football': 52, 'go': 53, 'morning': 54, 'being': 55, 'work': 56, 'friends': 57, 'movie': 58, 'am': 59, 'could': 60, 'day': 61, 'is': 62, 'him': 63, 'does': 64, 'i’m': 65, 'every': 66, 'music': 67, 'beauty': 68, 'nature': 69, 'one': 70, 'on': 71, 'ourselves': 72, 'company': 73, 'doing': 74, 'shall': 75, 'may': 76, 'reading': 77, 'vacation': 78, 'only': 79, 'game': 80, 'night': 81