# Week 2: Word Embedding
## [IMDB dataset for binary sentiment classification](http://ai.stanford.edu/~amaas/data/sentiment/)

In [3]:
### Import modules:

import tensorflow as tf 
print(tf.__version__)
tf.enable_eager_execution() #For tf 1.x, for tf 2 you don't need this
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

1.13.1


## Use the IMDB dataset

In [4]:
imdb, info = tfds.load("imdb_reviews", with_info= True, as_supervised= True)
#tfds.list_builders() #to know all the datasets contained on tfds

train_data, test_data = imdb['train'], imdb['test']

### Create empty lists: 
training_sentences= []
training_labels= []

test_sentences= []
test_labels= []

for s,l in train_data: 
    training_sentences.append(str(s.numpy()))
    training_labels.append(str(l.numpy()))

for s,l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(str(l.numpy()))




Instructions for updating:
Colocations handled automatically by placer.


W0901 18:27:09.085484 4577232320 deprecation.py:323] From /anaconda3/envs/Tensorflow/lib/python3.5/site-packages/tensorflow/python/data/ops/iterator_ops.py:532: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [5]:
print("Trainig sentences: \n",training_sentences[0], "\n")
print("Training labels: '0' is negative \n",training_labels[0], "\n")
print("Test sentences: \n", test_sentences[0], "\n")
print("Test labels: '1' is positive \n", test_labels[0], "\n")

Trainig sentences: 
 b'I have no idea what the other reviewer is talking about- this was a wonderful movie, and created a sense of the era that feels like time travel. The characters are truly young, Mary is a strong match for Byron, Claire is juvenile and a tad annoying, Polidori is a convincing beaten-down sycophant... all are beautiful, curious, and decadent... not the frightening wrecks they are in Gothic.<br /><br />Gothic works as an independent piece of shock film, and I loved it for different reasons, but this works like a Merchant and Ivory film, and was from my readings the best capture of what the summer must have felt like. Romantic, yes, but completely rekindles my interest in the lives of Shelley and Byron every time I think about the film. One of my all-time favorites.' 

Training labels: '0' is negative 
 1 

Test sentences: 
 b"I've watched the movie actually several times. And what i want to say about it is the only thing that made this movie high rank was the Burak A

In [6]:
### When training my labels are expected to be numpy arrays: 
training_labels_final = np.array(training_labels)
test_labels_final= np.array(test_labels)

print("Training labels: \n", training_labels_final.shape, "\n")
print("Testing labels: \n", test_labels_final.shape, "\n")


Training labels: 
 (25000,) 

Testing labels: 
 (25000,) 



## Tokenize our sentences: 

In [7]:
### -- Hyperparameters:
vocab_size= 10000
embedding_dim= 16
max_length= 120
trunc_type= 'post'
oov_tok= "<OOV>"

tokenizer= Tokenizer(num_words= vocab_size, oov_token= oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index= tokenizer.word_index
sequences= tokenizer.texts_to_sequences(training_sentences)
padded= pad_sequences(sequences=sequences, maxlen= max_length, truncating=trunc_type)

### -- Test sequences: 
testing_sequences= tokenizer.texts_to_sequences(test_sentences)
testing_padded= pad_sequences(testing_sequences, maxlen=max_length)

### --- Neural Network: 
#1) DEFINE THE MODEL: Sequential: defines a SEQUENCE of layers in the NN
# 1.1) Embedding layer: the key to text sentiment analysis in TF; the result 2D array, row sentences and columns 
      #the 16th dimensions
# 1.2) Flatten: we need to flat the 2D array from the embedding
# 1.3) Dense: fully connected neurons, first hidden layer of 6 neurons
# 1.4) Dense: one neuron

model= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length= max_length), #the most important for NLP
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
])


In [8]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


## Alternatively we can use 'GlobalAveragePooling1D'

In [9]:
model= tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length= max_length), #the most important for NLP
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
])

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________
