# 1.0 Purpose
The purpose of this notebook is to use Embedding for a Natural Language Processing (NLP) task with TensorFlow on the imdb_reviews dataset.  Predictions close to "1" indicate a good (positive) review and predictions close to "0" indicate a bad (negative) review.

# 2.0 Import Libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
print(tf.__version__)

## Helper Functions

### decode_review

In [None]:
def decode_review(text, index_to_word=None):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        text: Iterable of token ids (e.g. one row of a padded sequence array).
        index_to_word: Optional mapping from token id to word. When omitted,
            the notebook-level ``reverse_word_index`` is used, exactly as
            before, so existing one-argument calls keep working.

    Returns:
        The looked-up words joined by single spaces; any id that is missing
        from the mapping is rendered as ``'?'``.
    """
    # Resolve the global only at call time, and only when no mapping was passed in.
    lookup = reverse_word_index if index_to_word is None else index_to_word
    return ' '.join(lookup.get(i, '?') for i in text)



### get_training_metrics

In [None]:
def get_training_metrics(history):
  """Extract the accuracy and loss curves from a training record.

  Args:
    history: Either a pandas DataFrame of logged metrics, or an object whose
      ``history`` attribute holds the metrics mapping.

  Returns:
    A tuple ``(acc, val_acc, loss, val_loss)`` read from the keys
    'binary_accuracy', 'val_binary_accuracy', 'loss' and 'val_loss'.
  """
  # A DataFrame is indexed directly; anything else is unwrapped through `.history`.
  records = history if isinstance(history, pd.DataFrame) else history.history

  metric_keys = ('binary_accuracy', 'val_binary_accuracy', 'loss', 'val_loss')
  acc, val_acc, loss, val_loss = (records[key] for key in metric_keys)
  return acc, val_acc, loss, val_loss

### plot_train_eval

In [None]:
def plot_train_eval(history,name='',parameters='',optimizer_name='',loss='',accuracy_metric='',epochs='',vocab_size='',embedding_dim='',max_length='',trunc_type='',oov_tok=''):
  """Plot, save and show training-vs-evaluation accuracy and loss curves.

  Two line charts are produced, one for accuracy and one for loss. Each is
  saved as a JPG in the working directory (file name prefixed with `name`)
  and then shown.

  Args:
    history: Training record accepted by `get_training_metrics`.
    name: Label used in the chart titles and in the output file names.
    parameters, optimizer_name, epochs, vocab_size, embedding_dim,
    max_length, trunc_type, oov_tok: Values echoed into the chart titles only.
    loss: Loss name; shown in the titles and used as the loss chart's y label.
    accuracy_metric: Metric name; shown in the titles and used as the
      accuracy chart's y label.
  """
  acc, val_acc, loss_values, val_loss = get_training_metrics(history)

  # Draw each chart on its own explicit figure. Relying on the implicit
  # "current axes" lets the loss curves land on the accuracy chart whenever
  # plt.show() does not close the figure (e.g. a non-interactive backend).
  acc_fig, acc_ax = plt.subplots()
  sns.lineplot(data=pd.DataFrame({"training accuracy":acc, "evaluation accuracy":val_acc}), ax=acc_ax)
  acc_ax.set_title(f'training vs evaluation accuracy: {name} \n # parameters={parameters} \n optimizer_name={optimizer_name} \n loss={loss} \n accuracy_metric={accuracy_metric} \n epochs={epochs} \n vocab_size={vocab_size} \n embedding_dim={embedding_dim} \n max_length={max_length} \n trunc_type={trunc_type} \n oov_tok={oov_tok}')
  acc_ax.set_xlabel('epoch')
  acc_ax.set_ylabel(accuracy_metric)
  acc_fig.savefig(f'{name} - train_acc_vs_eval_acc.jpg',bbox_inches='tight')
  plt.show()
  plt.close(acc_fig)  # release the figure so repeated calls do not accumulate open figures

  print("")

  loss_fig, loss_ax = plt.subplots()
  sns.lineplot(data=pd.DataFrame({"training loss":loss_values, "evaluation loss":val_loss}), ax=loss_ax)
  loss_ax.set_title(f'training vs evaluation loss: {name} \n # parameters={parameters} \n optimizer_name={optimizer_name}  \n loss={loss} \naccuracy_metric={accuracy_metric} \n epochs={epochs} \n vocab_size={vocab_size} \n embedding_dim={embedding_dim} \n max_length={max_length} \n trunc_type={trunc_type} \n oov_tok={oov_tok}')
  loss_ax.set_xlabel('epoch')
  loss_ax.set_ylabel(loss)
  loss_fig.savefig(f'{name} - train_loss_vs_eval_loss.jpg',bbox_inches='tight')
  plt.show()
  plt.close(loss_fig)

### model_picker

In [None]:
def model_picker(vocab_size=100,embedding_dim=16,input_length=100,model_name='model1'):
  """Build one of the notebook's predefined Keras Sequential text classifiers.

  Args:
    vocab_size: Size of the Embedding layer's vocabulary.
    embedding_dim: Width of each embedding vector.
    input_length: Sequence length passed to the Embedding layer (every
      variant except 'model3', whose Embedding is built without it).
    model_name: One of 'model1' ... 'model7', selecting the architecture.

  Returns:
    A tuple ``(model, model_description)``: the uncompiled model and a
    human-readable string summarising its layers.

  Raises:
    ValueError: If `model_name` is not one of the supported names.
  """
  # NOTE: the Embedding layers use the `input_length` argument. Earlier code
  # read the notebook-global `max_length` here, silently ignoring the argument.
  if model_name=='model1':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(6,activation='relu'),
      tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size}, \n embedding_dim={embedding_dim}, \n input_length={input_length}), \n Flatten, \n Dense(6,relu), \n Dense(1,sigmoid)'
  elif model_name=='model2':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(6,activation='relu'),
      tf.keras.layers.Dense(1,activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size},embedding_dim={embedding_dim},input_length={input_length}), \n GlobalAveragePooling1D,\n Dense(6,relu), \n Dense(1,sigmoid)'
  elif model_name=='model3':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size}, \n embedding_dim={embedding_dim}), \n Bidirectional(LSTM(64,return_sequences=True)), \n Bidirectional(LSTM(32)), \n Dense(64,relu), \n Dense(1,sigmoid)'
  elif model_name=='model4':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(24, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size}, \n embedding_dim={embedding_dim}), \n Bidirectional(LSTM(32)), \n Dense(24,relu), \n Dense(1,sigmoid)'
  elif model_name=='model5':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(6, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size},embedding_dim={embedding_dim},input_length={input_length}), \n Bidirectional(LSTM(32)),Dense(6,relu),Dense(1,sigmoid)'
  elif model_name=='model6':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.Conv1D(128, 5, activation='relu'),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(6, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size},embedding_dim={embedding_dim},input_length={input_length}), \n Conv1D(128,5,activation=relu), \n GlobalAveragePooling1D, \n Dense(6,relu),Dense(1,sigmoid)'
  elif model_name=='model7':
    model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=input_length),
      tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
      tf.keras.layers.Dense(6, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_description=f'Embedding(vocab_size={vocab_size},embedding_dim={embedding_dim},input_length={input_length}), \n Bidirectional(GRU(32)),Dense(6,relu),Dense(1,sigmoid)'
  else:
    # Without this branch an unknown name fell through to `return` with `model` unbound.
    raise ValueError(f"Unknown model_name {model_name!r}; expected 'model1' through 'model7'")
  return model,model_description

# 3.0 Import **imdb_reviews**

https://www.tensorflow.org/datasets/catalog/imdb_reviews

In [None]:

imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True,download=True)


In [None]:
cols=['filename','model_name','model_description',
      'optimizer',
      'loss',
      'accuracy_metric','epochs','vocab_size','embedding_dim','max_length','trunc_type','oov_tok','final_eval_loss','final_eval_acc']
df_metrics=pd.DataFrame(columns=cols)
df_metrics

# 4.1 model1

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model1'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Create Tokenizer

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok) #create tokenizer that has a vocab_size & oov_token specified above

In [None]:
testing_labels_final.max() # sanity check: the max label should be 1, which this notebook treats as a good review

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training_sentences 

In [None]:
word_index = tokenizer.word_index #get the word index for the tokenizer

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #convert texts to sequences using the tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # pad the sequences by the max_length with truncation set to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # convert the testing_sentences to testing_sequences

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type) # pad/truncate the test sequences with the same settings as the training sequences

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key,value from word_index to check index value to key

In [None]:
print(decode_review(padded[0])) #This is after it has been padded and OOV in place, but decoded.  Recall this is actually padded as a sequence of numbers for training
print(training_sentences[0]) #This is the original

## Create Model

In [None]:
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name) # build the architecture named by this section's model_name

In [None]:
model.summary()

In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
history=model.fit(padded, training_labels_final,epochs=epochs,validation_data=(testing_padded,
                                                                           testing_labels_final))

## Evaluate Model

In [None]:
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
final_eval_loss

In [None]:
trainableParams = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])
trainableParams

In [None]:
plot_train_eval(history,model_name,trainableParams,optimizer_name,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok)

In [None]:
# One-row frame holding this run's configuration and final evaluation metrics.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the row.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Export the embedding vectors (vecs_*.tsv) and their words (meta_*.tsv), one line per word id.
# The with-block closes both files even if a lookup fails part-way through the loop.
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):  # starts at 1, so row 0 of the weights is not exported
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download(f'vecs_{model_name}.tsv')
  files.download(f'meta_{model_name}.tsv')

In [None]:
# Testing new Custom Reviews
sentence1='What a great movie.'
test1=tokenizer.texts_to_sequences([sentence1])
test1= pad_sequences(test1, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2='What a terrible movie.'
test2=tokenizer.texts_to_sequences([sentence2])
test2= pad_sequences(test2, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.2 model2

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model2'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Create Tokenizer

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok) #create tokenizer that has a vocab_size & oov_token specified above

In [None]:
testing_labels_final.max() # sanity check: the max label should be 1, which this notebook treats as a good review

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training_sentences 

In [None]:
word_index = tokenizer.word_index #get the word index for the tokenizer

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #convert texts to sequences using the tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # pad the sequences by the max_length with truncation set to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # convert the testing_sentences to testing_sequences

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type) # pad/truncate the test sequences with the same settings as the training sequences

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key,value from word_index to check index value to key

In [None]:
print(decode_review(padded[0])) #This is after it has been padded and OOV in place, but decoded.  Recall this is actually padded as a sequence of numbers for training
print(training_sentences[0]) #This is the original

## Create Model

In [None]:
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name) # build this section's architecture; a hard-coded 'model1' here would retrain model1 under the wrong name

In [None]:
model.summary()

In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
history=model.fit(padded, training_labels_final,epochs=epochs,validation_data=(testing_padded,
                                                                           testing_labels_final))

## Evaluate Model

In [None]:
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
final_eval_loss

In [None]:
trainableParams = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])
trainableParams

In [None]:
plot_train_eval(history,model_name,trainableParams,optimizer_name,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok)

In [None]:
# One-row frame holding this run's configuration and final evaluation metrics.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the row.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Export the embedding vectors (vecs_*.tsv) and their words (meta_*.tsv), one line per word id.
# The with-block closes both files even if a lookup fails part-way through the loop.
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):  # starts at 1, so row 0 of the weights is not exported
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download(f'vecs_{model_name}.tsv')
  files.download(f'meta_{model_name}.tsv')

In [None]:
# Testing new Custom Reviews
sentence1='What a great movie.'
test1=tokenizer.texts_to_sequences([sentence1])
test1= pad_sequences(test1, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2='What a terrible movie.'
test2=tokenizer.texts_to_sequences([sentence2])
test2= pad_sequences(test2, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.3 model3

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model3'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Create Tokenizer

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok) #create tokenizer that has a vocab_size & oov_token specified above

In [None]:
testing_labels_final.max() # sanity check: the max label should be 1, which this notebook treats as a good review

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training_sentences 

In [None]:
word_index = tokenizer.word_index #get the word index for the tokenizer

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #convert texts to sequences using the tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # pad the sequences by the max_length with truncation set to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # convert the testing_sentences to testing_sequences

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type) # pad/truncate the test sequences with the same settings as the training sequences

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key,value from word_index to check index value to key

In [None]:
print(decode_review(padded[0])) #This is after it has been padded and OOV in place, but decoded.  Recall this is actually padded as a sequence of numbers for training
print(training_sentences[0]) #This is the original

## Create Model

In [None]:
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name) # build this section's architecture; a hard-coded 'model1' here would retrain model1 under the wrong name

In [None]:
model.summary()

In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
history=model.fit(padded, training_labels_final,epochs=epochs,validation_data=(testing_padded,
                                                                           testing_labels_final))

## Evaluate Model

In [None]:
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
final_eval_loss

In [None]:
trainableParams = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])
trainableParams

In [None]:
plot_train_eval(history,model_name,trainableParams,optimizer_name,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok)

In [None]:
# One-row frame holding this run's configuration and final evaluation metrics.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the row.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Export the embedding vectors (vecs_*.tsv) and their words (meta_*.tsv), one line per word id.
# The with-block closes both files even if a lookup fails part-way through the loop.
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):  # starts at 1, so row 0 of the weights is not exported
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download(f'vecs_{model_name}.tsv')
  files.download(f'meta_{model_name}.tsv')

In [None]:
# Testing new Custom Reviews
sentence1='What a great movie.'
test1=tokenizer.texts_to_sequences([sentence1])
test1= pad_sequences(test1, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2='What a terrible movie.'
test2=tokenizer.texts_to_sequences([sentence2])
test2= pad_sequences(test2, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.4 model4

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model4'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Create Tokenizer

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok) #create tokenizer that has a vocab_size & oov_token specified above

In [None]:
testing_labels_final.max() # sanity check: the max label should be 1, which this notebook treats as a good review

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training_sentences 

In [None]:
word_index = tokenizer.word_index #get the word index for the tokenizer

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #convert texts to sequences using the tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # pad the sequences by the max_length with truncation set to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # convert the testing_sentences to testing_sequences

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type) # pad/truncate the test sequences with the same settings as the training sequences

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key,value from word_index to check index value to key

In [None]:
print(decode_review(padded[0])) #This is after it has been padded and OOV in place, but decoded.  Recall this is actually padded as a sequence of numbers for training
print(training_sentences[0]) #This is the original

## Create Model

In [None]:
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name) # build this section's architecture; a hard-coded 'model1' here would retrain model1 under the wrong name

In [None]:
model.summary()

In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
history=model.fit(padded, training_labels_final,epochs=epochs,validation_data=(testing_padded,
                                                                           testing_labels_final))

## Evaluate Model

In [None]:
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
final_eval_loss

In [None]:
trainableParams = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])
trainableParams

In [None]:
plot_train_eval(history,model_name,trainableParams,optimizer_name,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok)

In [None]:
# One-row frame holding this run's configuration and final evaluation metrics.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the row.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Export the embedding vectors (vecs_*.tsv) and their words (meta_*.tsv), one line per word id.
# The with-block closes both files even if a lookup fails part-way through the loop.
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):  # starts at 1, so row 0 of the weights is not exported
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download(f'vecs_{model_name}.tsv')
  files.download(f'meta_{model_name}.tsv')

In [None]:
# Testing new Custom Reviews
sentence1='What a great movie.'
test1=tokenizer.texts_to_sequences([sentence1])
test1= pad_sequences(test1, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2='What a terrible movie.'
test2=tokenizer.texts_to_sequences([sentence2])
test2= pad_sequences(test2, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.5 model5

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model5'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Create Tokenizer

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok) #create tokenizer that has a vocab_size & oov_token specified above

In [None]:
testing_labels_final.max() # sanity check: the max label should be 1, which this notebook treats as a good review

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training_sentences 

In [None]:
word_index = tokenizer.word_index #get the word index for the tokenizer

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #convert texts to sequences using the tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # pad the sequences by the max_length with truncation set to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # convert the testing_sentences to testing_sequences

In [None]:
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type) # pad/truncate the test sequences with the same settings as the training sequences

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key,value from word_index to check index value to key

In [None]:
print(decode_review(padded[0])) #This is after it has been padded and OOV in place, but decoded.  Recall this is actually padded as a sequence of numbers for training
print(training_sentences[0]) #This is the original

## Create Model

In [None]:
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name) # build this section's architecture; a hard-coded 'model1' here would retrain model1 under the wrong name

In [None]:
model.summary()

In [None]:
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
history=model.fit(padded, training_labels_final,epochs=epochs,validation_data=(testing_padded,
                                                                           testing_labels_final))

## Evaluate Model

In [None]:
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
final_eval_loss

In [None]:
trainableParams = np.sum([np.prod(v.get_shape()) for v in model.trainable_weights])
trainableParams

In [None]:
plot_train_eval(history,model_name,trainableParams,optimizer_name,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok)

In [None]:
# One-row frame holding this run's configuration and final evaluation metrics.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add the row.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Export the embedding vectors (vecs_*.tsv) and their words (meta_*.tsv), one line per word id.
# The with-block closes both files even if a lookup fails part-way through the loop.
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):  # starts at 1, so row 0 of the weights is not exported
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download(f'vecs_{model_name}.tsv')
  files.download(f'meta_{model_name}.tsv')

In [None]:
# Testing new Custom Reviews
sentence1='What a great movie.'
test1=tokenizer.texts_to_sequences([sentence1])
test1= pad_sequences(test1, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2='What a terrible movie.'
test2=tokenizer.texts_to_sequences([sentence2])
test2= pad_sequences(test2, maxlen=max_length, truncating=trunc_type)
answer=model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.6 model6

## Set Embedding/Modeling Parameters

In [None]:
filename='TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name='model6'
optimizer='adam'
optimizer_name=str(optimizer)
loss='binary_crossentropy'
accuracy_metric='binary_accuracy'
epochs=10

vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
train_data, test_data = imdb['train'], imdb['test']

In [None]:
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [None]:
for s, l in train_data:
  training_sentences.append(str(s.numpy()))
  training_labels.append(l.numpy())

In [None]:
for s, l in test_data:
  testing_sentences.append(str(s.numpy()))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np

# Convert the accumulated label lists into NumPy arrays; these are what
# model.fit / model.evaluate receive further down.
training_labels_final, testing_labels_final = (
    np.array(label_list) for label_list in (training_labels, testing_labels)
)

## Create Tokenizer

In [None]:
# Tokenizer configured with num_words=vocab_size and oov_tok as the out-of-vocabulary placeholder token.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

In [None]:
testing_labels_final.max() #sanity check: the largest label should be 1 (per the custom-review prints in this notebook, a score close to 1 means a good review and close to 0 a bad one)

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training sentences only; the test sentences are never passed to fit_on_texts

In [None]:
word_index = tokenizer.word_index #word -> integer index mapping held by the fitted tokenizer (inverted below into reverse_word_index)

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #encode each training review as a sequence of word indices using the fitted tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # bring every training sequence to length max_length; sequences that are too long are truncated according to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # encode the test reviews with the tokenizer that was fitted on the training reviews

In [None]:
# Pad/truncate the test sequences exactly like the training sequences.
# Without truncating=trunc_type, pad_sequences falls back to its default
# truncation side, so long test reviews would keep a different part of the
# text than long training reviews (and than the custom-review checks).
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [None]:
# Invert word_index (word -> index) so that an index can be mapped back to its word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
# First training example after tokenizing and padding, mapped back to words.
# decode_review prints '?' for any index that is missing from reverse_word_index.
# The model itself is trained on the numeric sequence, not on this text.
print(decode_review(padded[0]))
# The same review as originally stored in training_sentences, for comparison.
print(training_sentences[0])

## Create Model

In [None]:
# Build the architecture that belongs to this section. The call previously
# hard-coded model_name='model1', so this section trained (and logged the
# description of) model1 while saving everything under the current model_name.
# NOTE(review): assumes model_picker knows the name held in `model_name` -- confirm.
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name)

In [None]:
# Show the summary of the model returned by model_picker.
model.summary()

In [None]:
# Compile with the loss, optimizer and single metric chosen in the parameters cell.
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
# Train for `epochs` epochs; the padded test split doubles as validation data,
# so the val_* entries in `history` are computed on the test set.
history = model.fit(
    padded,
    training_labels_final,
    epochs=epochs,
    validation_data=(testing_padded, testing_labels_final),
)

## Evaluate Model

In [None]:
# Grab the first layer of the model and its first weight array.
# NOTE(review): presumably layers[0] is the Embedding layer -- confirm against model_picker.
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
# Evaluate on the padded test split. Unpacking into exactly two names relies on
# the model having been compiled with a single metric (loss first, then the metric).
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
# Display the test-set loss returned by model.evaluate as the cell output.
final_eval_loss

In [None]:
# Number of trainable parameters: element count of every trainable weight, summed.
per_weight_sizes = [np.prod(w.get_shape()) for w in model.trainable_weights]
trainableParams = np.sum(per_weight_sizes)
trainableParams

In [None]:
# Plot the training curves; every run parameter is passed along so that it
# appears in the figure title and in the saved image's file name.
plot_train_eval(
    history,
    name=model_name,
    parameters=trainableParams,
    optimizer_name=optimizer_name,
    loss=loss,
    accuracy_metric=accuracy_metric,
    epochs=epochs,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
    trunc_type=trunc_type,
    oov_tok=oov_tok,
)

In [None]:
# One-row frame describing this run; the value order must line up with `cols`.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add the row.
# Note: re-running this cell adds the same row again.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
# Persist the trained model under the path `model_name`, then immediately reload
# it so the following cells run against the saved copy (a save/load round trip).
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
# Rewrite the whole metrics table to disk; the file is overwritten on every run.
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Write the learned weights to two line-aligned TSV files: one row of weight
# values per word in vecs_<model_name>.tsv and the matching word in
# meta_<model_name>.tsv. Index 0 is skipped (the loop starts at 1).
# The `with` block guarantees both files are closed even if a lookup fails
# part-way through (e.g. a KeyError from reverse_word_index).
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# Offer the exported TSV files as browser downloads when running on Google Colab;
# outside Colab the import fails and the cell does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in (f'vecs_{model_name}.tsv', f'meta_{model_name}.tsv'):
    files.download(tsv_name)

In [None]:
# Testing new Custom Reviews
# Each sentence is tokenized, padded/truncated exactly like the training data,
# and then scored by the trained model.
# NOTE(review): the "This makes sense ..." lines are printed unconditionally;
# they do not inspect `answer`.
sentence1 = 'What a great movie.'
test1 = pad_sequences(
    tokenizer.texts_to_sequences([sentence1]),
    maxlen=max_length,
    truncating=trunc_type,
)
answer = model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2 = 'What a terrible movie.'
test2 = pad_sequences(
    tokenizer.texts_to_sequences([sentence2]),
    maxlen=max_length,
    truncating=trunc_type,
)
answer = model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 4.7 model7

## Set Embedding/Modeling Parameters

In [None]:
# Run bookkeeping: the notebook name and the label used for saved artifacts.
filename = 'TF_CNN_Sequential_NLP_imdb_reviews.ipynb'
model_name = 'model7'

# Training configuration consumed by model.compile / model.fit below.
optimizer = 'adam'
optimizer_name = str(optimizer)
loss, accuracy_metric = 'binary_crossentropy', 'binary_accuracy'
epochs = 10

# Tokenizer / embedding hyper-parameters.
vocab_size, embedding_dim, max_length = 10000, 16, 120
trunc_type = 'post'
oov_tok = '<OOV>'

## Split Train/Test

In [None]:
# Pull the train and test splits out of the already-loaded `imdb` mapping.
train_data = imdb['train']
test_data = imdb['test']

In [None]:
# Fresh, independent accumulators for the review text and labels of each split.
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

In [None]:
# Collect the training reviews and labels as plain Python/NumPy values.
# For a string tensor, s.numpy() is a bytes object; str() on bytes yields its
# repr ("b'...'" with backslash escapes), which leaks junk such as a leading
# "b'" into the tokenizer vocabulary. Decode the bytes as UTF-8 instead.
for s, l in train_data:
  training_sentences.append(s.numpy().decode('utf-8'))
  training_labels.append(l.numpy())

In [None]:
# Collect the test reviews and labels as plain Python/NumPy values.
# For a string tensor, s.numpy() is a bytes object; str() on bytes yields its
# repr ("b'...'" with backslash escapes) rather than the review text, so decode
# the bytes as UTF-8 instead.
for s, l in test_data:
  testing_sentences.append(s.numpy().decode('utf-8'))
  testing_labels.append(l.numpy())

In [None]:
import numpy as np

# Convert the accumulated label lists into NumPy arrays; these are what
# model.fit / model.evaluate receive further down.
training_labels_final, testing_labels_final = (
    np.array(label_list) for label_list in (training_labels, testing_labels)
)

## Create Tokenizer

In [None]:
# Tokenizer configured with num_words=vocab_size and oov_tok as the out-of-vocabulary placeholder token.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

In [None]:
testing_labels_final.max() #sanity check: the largest label should be 1 (per the custom-review prints in this notebook, a score close to 1 means a good review and close to 0 a bad one)

In [None]:
tokenizer.fit_on_texts(training_sentences) #fit the tokenizer on the training sentences only; the test sentences are never passed to fit_on_texts

In [None]:
word_index = tokenizer.word_index #word -> integer index mapping held by the fitted tokenizer (inverted below into reverse_word_index)

In [None]:
sequences = tokenizer.texts_to_sequences(training_sentences) #encode each training review as a sequence of word indices using the fitted tokenizer

In [None]:
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type) # bring every training sequence to length max_length; sequences that are too long are truncated according to trunc_type

In [None]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # encode the test reviews with the tokenizer that was fitted on the training reviews

In [None]:
# Pad/truncate the test sequences exactly like the training sequences.
# Without truncating=trunc_type, pad_sequences falls back to its default
# truncation side, so long test reviews would keep a different part of the
# text than long training reviews (and than the custom-review checks).
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [None]:
# Invert word_index (word -> index) so that an index can be mapped back to its word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
# First training example after tokenizing and padding, mapped back to words.
# decode_review prints '?' for any index that is missing from reverse_word_index.
# The model itself is trained on the numeric sequence, not on this text.
print(decode_review(padded[0]))
# The same review as originally stored in training_sentences, for comparison.
print(training_sentences[0])

## Create Model

In [None]:
# Build the architecture that belongs to this section. The call previously
# hard-coded model_name='model1', so this section trained (and logged the
# description of) model1 while saving everything under the current model_name.
# NOTE(review): assumes model_picker knows the name held in `model_name` -- confirm.
model,model_description=model_picker(vocab_size=vocab_size,embedding_dim=embedding_dim,input_length=max_length,model_name=model_name)

In [None]:
# Show the summary of the model returned by model_picker.
model.summary()

In [None]:
# Compile with the loss, optimizer and single metric chosen in the parameters cell.
model.compile(loss=loss,optimizer=optimizer,metrics=[accuracy_metric])

## Train Model

In [None]:
# Train for `epochs` epochs; the padded test split doubles as validation data,
# so the val_* entries in `history` are computed on the test set.
history = model.fit(
    padded,
    training_labels_final,
    epochs=epochs,
    validation_data=(testing_padded, testing_labels_final),
)

## Evaluate Model

In [None]:
# Grab the first layer of the model and its first weight array.
# NOTE(review): presumably layers[0] is the Embedding layer -- confirm against model_picker.
e = model.layers[0]
weights=e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
# Evaluate on the padded test split. Unpacking into exactly two names relies on
# the model having been compiled with a single metric (loss first, then the metric).
final_eval_loss,final_eval_acc=model.evaluate(testing_padded,testing_labels_final)

In [None]:
# Display the test-set loss returned by model.evaluate as the cell output.
final_eval_loss

In [None]:
# Number of trainable parameters: element count of every trainable weight, summed.
per_weight_sizes = [np.prod(w.get_shape()) for w in model.trainable_weights]
trainableParams = np.sum(per_weight_sizes)
trainableParams

In [None]:
# Plot the training curves; every run parameter is passed along so that it
# appears in the figure title and in the saved image's file name.
plot_train_eval(
    history,
    name=model_name,
    parameters=trainableParams,
    optimizer_name=optimizer_name,
    loss=loss,
    accuracy_metric=accuracy_metric,
    epochs=epochs,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_length=max_length,
    trunc_type=trunc_type,
    oov_tok=oov_tok,
)

In [None]:
# One-row frame describing this run; the value order must line up with `cols`.
df=pd.DataFrame([[filename,model_name,model_description,optimizer,loss,accuracy_metric,epochs,vocab_size,embedding_dim,max_length,trunc_type,oov_tok,final_eval_loss,final_eval_acc]],columns=cols)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add the row.
# Note: re-running this cell adds the same row again.
df_metrics=pd.concat([df_metrics,df],ignore_index=True)
df_metrics

In [None]:
# Persist the trained model under the path `model_name`, then immediately reload
# it so the following cells run against the saved copy (a save/load round trip).
model.save(model_name) #save model
model=tf.keras.models.load_model(model_name) #load model
# Rewrite the whole metrics table to disk; the file is overwritten on every run.
df_metrics.to_excel('df_metrics.xlsx')

In [None]:
import io

# Write the learned weights to two line-aligned TSV files: one row of weight
# values per word in vecs_<model_name>.tsv and the matching word in
# meta_<model_name>.tsv. Index 0 is skipped (the loop starts at 1).
# The `with` block guarantees both files are closed even if a lookup fails
# part-way through (e.g. a KeyError from reverse_word_index).
with io.open(f'vecs_{model_name}.tsv', 'w', encoding='utf-8') as out_v, \
     io.open(f'meta_{model_name}.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# Offer the exported TSV files as browser downloads when running on Google Colab;
# outside Colab the import fails and the cell does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in (f'vecs_{model_name}.tsv', f'meta_{model_name}.tsv'):
    files.download(tsv_name)

In [None]:
# Testing new Custom Reviews
# Each sentence is tokenized, padded/truncated exactly like the training data,
# and then scored by the trained model.
# NOTE(review): the "This makes sense ..." lines are printed unconditionally;
# they do not inspect `answer`.
sentence1 = 'What a great movie.'
test1 = pad_sequences(
    tokenizer.texts_to_sequences([sentence1]),
    maxlen=max_length,
    truncating=trunc_type,
)
answer = model.predict(test1)
print(f'Review for this sentence: \n {sentence1} \n is a value of {answer}')
print('This makes sense since it is a value close to 1, meaning a good review')
print('\n')

sentence2 = 'What a terrible movie.'
test2 = pad_sequences(
    tokenizer.texts_to_sequences([sentence2]),
    maxlen=max_length,
    truncating=trunc_type,
)
answer = model.predict(test2)
print(f'Review for this sentence: \n {sentence2} \n is a value of {answer}')
print('This makes sense since it is a value close to 0, meaning a bad review')

# 5.0 Export Models

In [None]:
# Recursively zip /content/model1 into /content/model1.zip (absolute Colab path).
!zip -r /content/model1.zip /content/model1

In [None]:
# Recursively zip /content/model2 into /content/model2.zip (absolute Colab path).
!zip -r /content/model2.zip /content/model2

In [None]:
# Recursively zip /content/model3 into /content/model3.zip (absolute Colab path).
!zip -r /content/model3.zip /content/model3

In [None]:
# Recursively zip /content/model4 into /content/model4.zip (absolute Colab path).
!zip -r /content/model4.zip /content/model4

In [None]:
# Recursively zip /content/model5 into /content/model5.zip (absolute Colab path).
!zip -r /content/model5.zip /content/model5

In [None]:
# Recursively zip /content/model6 into /content/model6.zip (absolute Colab path).
!zip -r /content/model6.zip /content/model6

In [None]:
# Recursively zip /content/model7 into /content/model7.zip (absolute Colab path).
!zip -r /content/model7.zip /content/model7

# Install **rclone**
This lets you save your checkpoint weights to your actual Google Drive so they can be restored later.

In [None]:
# Download the rclone install script and run it with root privileges.
# NOTE(review): this pipes a remote script straight into `sudo bash`; only run it
# in a disposable environment, or download and inspect the script first.
! curl https://rclone.org/install.sh | sudo bash

You want to create a remote configuration with rclone.  Use the default recommendations.

In [None]:
# Create the rclone remote configuration. The copy command later in this notebook
# addresses the remote as `remote:`, so the remote created here must be named `remote`.
!rclone config

# Copy Content to **Google Drive** with **rclone**
Try to copy existing weights to a future directory to ensure rclone is configured correctly.

This line tests the ability to copy known weights to a directory on your **Google Drive** with **rclone**.

In [None]:
# Copy everything under /content/ to the given folder on the rclone remote named `remote`.
!rclone copy "/content/"  remote:"/Colab_Notebooks/GitHub/TensorFlow_Examples/Basics/Wk6/content"