# Deep Learning with words

# One hot encoding of words of characters

In [None]:
import numpy as np

In [23]:
samples= ['The cat sat on the mat.','the dog ate my homework.']

In [43]:
for sample in samples:
    for word in sample.split():
        print(word)

The
cat
sat
on
the
mat.
the
dog
ate
my
homework.


In [44]:
# building index
token_index={}
for sample in samples:
    for word in sample.split():
        # it's in sequential order
        if word not in token_index:
            token_index[word]=len(token_index)+1
# Next, we vectorize our samples

max_length=10
results= np.zeros((len(samples),max_length,max(token_index.values())+1))
 #results.shape= (2,10,11)
for i, sample in enumerate(samples):
    for j,word in list(enumerate(sample.split()))[:max_length]:
        index= token_index.get(word)
        results[i,j,index]=1.


In [45]:
results.shape

(2, 10, 11)

## Character level one hot encoding

In [49]:
import string
samples= ['The cat sat on the mat.','the dog ate my homework.']

In [50]:
characters = string.printable  # All printable ASCII characters.
token_index = dict(zip(range(1, len(characters) + 1), characters))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.keys()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results[i, j, index] = 1

In [53]:
results

array([[[ 1.,  1.,  1., ...,  1.,  1.,  1.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 1.,  1.,  1., ...,  1.,  1.,  1.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        [ 1.,  1.,  1., ...,  1.,  1.,  1.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]])

In [52]:
results.shape

(2, 50, 101)

## Keras inbuilt utility for one hot encoding text at the word level or character level- THIS IS THE ONE THAT SHOULD BE USED

In [54]:
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


In [55]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.'] 

## We create a tokenizer, configured only to take into account first 1000 most common words

In [56]:
tokenizer= Tokenizer(num_words=1000)

## Build word index

In [57]:
tokenizer.fit_on_texts(samples)

In [58]:
samples

['The cat sat on the mat.', 'The dog ate my homework.']

## This turns strings into list of integer incides

In [59]:
sequences= tokenizer.texts_to_sequences(samples)

In [68]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [61]:
one_hot_results= tokenizer.texts_to_matrix(samples, mode='binary')

In [66]:

one_hot_results.shape


(2, 1000)

In [65]:
one_hot_results

array([[ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.]])

In [69]:
tokenizer.word_index

{'ate': 7,
 'cat': 2,
 'dog': 6,
 'homework': 9,
 'mat': 5,
 'my': 8,
 'on': 4,
 'sat': 3,
 'the': 1}

In [70]:
len(tokenizer.word_index)

9

# Word embedding- Another popular way to associate a vector with a word.

One hot encoders are sparse, embedders are 256, 512 dimensional , however one hot encoders are usually 20K+ dimensions

Advantage of word embedders-
1. Dense
2. Low dimensional
3. Learned from Data

## Ways to obtain word embedding

1. Pretrained word embeddings are loaded 
2. Learn word embedings jointly with the main task( document classification) . We start with random word embedder and then learn word vector in the same way you learn weights of a NN


### Code to learn weights in the embedding layer

In [73]:
from keras.layers import Embedding
# The Embedding layer takes at least two arguments:
# the number of possible tokens, here 1000 (1 + maximum word index),
# and the dimensionality of the embeddings, here 64.
embedding_layer = Embedding(1000, 64)

*** Word_index<<<< Embedding layer<<< corresponding word vector

input = 2 D tensor of integer( samples, sequence_length) 32,10
Output= 3D tensor of floating point(samples,sequence_length, embedding_dimensionality). This 3 D tensor can be processed by a RNN layer or 1 D CNN



## Example

Let’s apply this idea to the IMDB movie review sentiment prediction task that you are already familiar with. With, let’s quickly prepare the data. We will restrict the movie reviews to the top 10,000 most common words (like we did the first time we worked with this dataset), and cut the reviews after only 20 words. Our network will simply learn 8-dimensional embeddings for each of the 10,000 words, turn the input integer sequences (2D integer tensor) into embedded sequences (3D float tensor), flatten the tensor to 2D, and train a single Dense layer on top for classification.

### Loading the IMDB data for use of embedding layer

In [75]:
from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000
# Cut texts after this number of words
# (among top max_features most common words)
maxlen = 20

# Load the data as lists of integers.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# This turns our lists of integers
# into a 2D integer tensor of shape (samples, maxlen)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

## Embedding layer and classifier on IMDB

In [76]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs”
model.add(Embedding(10000,8,input_length=maxlen))
# After embedding, our activations has shape (sample, maxlen,8)
# Now flatten 3 D tensor of embeddings into 2D tensor of shape( sample,maxlen*8)
model.add(Flatten())
# Now add a classifier on top
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])


In [77]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [78]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
history.history

{'acc': [0.64854999999999996,
  0.75939999999999996,
  0.79335,
  0.80679999999999996,
  0.81974999999999998,
  0.83115000000000006,
  0.84189999999999998,
  0.84830000000000005,
  0.85840000000000005,
  0.86675000000000002],
 'loss': [0.65605130405426026,
  0.51894234309196474,
  0.45120858936309816,
  0.41905834162235261,
  0.396481329369545,
  0.37844066197872162,
  0.36240952007770538,
  0.34742496838569642,
  0.33297319020032884,
  0.31937680296897886],
 'val_acc': [0.71479999999999999,
  0.73640000000000005,
  0.74719999999999998,
  0.75380000000000003,
  0.75719999999999998,
  0.75939999999999996,
  0.75739999999999996,
  0.75719999999999998,
  0.75280000000000002,
  0.75519999999999998],
 'val_loss': [0.59067819452285764,
  0.5117304918289185,
  0.49494691257476808,
  0.49052827835083007,
  0.49144533996582029,
  0.49532687273025511,
  0.50039422836303715,
  0.50577053146362305,
  0.51221738443374631,
  0.51825859327316282]}

We get to a validation accuracy of ~76%, which is pretty good considering that we are only look at the first 20 words in every review. But note that merely flattening the embedded sequences and training a single Dense layer on top leads to a model that treats each word in the input sequence separately, without considering inter-word relationships and structure sentence (e.g. it would likely treat both "this movie is shit" and "this movie is the shit" as being negative "reviews"). It would be much better to add recurrent layers or 1D convolutional layers on top of the embedded sequences to learn features that take into account each sequence as a whol

## Using pretrained word embedding

1. Word2vec
2. Glove

### Putting it all together: from raw text to word embeddings”
 

“We will be using a model similar to the one we just went over — embedding sentences in sequences of vectors, flattening them and training a Dense layer on top. But we will do it using pre-trained word embeddings, and instead of using the pre-tokenized IMDB data packaged in Keras, we will start from scratch, by downloading the original text data.”


In [82]:
# Download IMDB Data
#/Users/shashank/Downloads/SpringboardDatascience/_1Deeplearning/Deep_learning_with_Python

In [83]:
import os

imdb_dir = '/Users/shashank/Downloads/SpringboardDatascience/_1Deeplearning/Deep_learning_with_Python/Chapter6_assignment/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

## TOKENIZE THE DATA
Let’s vectorize the texts we collected, and prepare a training and validation split


In [84]:
from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # We will cut reviews after 100 words
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


## Split data into train and validation

In [85]:
data.shape

(25000, 100)

In [87]:
## Shuffle
indices= np.arange(data.shape[0])
np.random.shuffle(indices)
labels= labels[indices]

In [89]:
x_train= data[:training_samples]
y_train= labels[:training_samples]
x_val=data[training_samples:training_samples+validation_samples]
y_val=data[training_samples:training_samples+validation_samples]


### Download GLOVE WORD Embedding( 822 MB zip files 100 dimensional embedding vectors for 40,000 words)
nlp.stanford.edu/projects/glove

parse the un-zipped file (it’s a txt file) to build an index mapping words (as strings) to their vector representation (as number vectors). 

In [90]:
glove_dir = '/Users/shashank/Downloads/SpringboardDatascience/_1Deeplearning/Deep_learning_with_Python/Chapter6_assignment/glove.6B'  
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


## Preparing Glove word embedding matrix

In [91]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

# Define Model

In [102]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
# We specify the maximum input length to our Embedding layer
# so we can later flatten the embedded inputs”
model.add(Embedding(max_words,embedding_dim,input_length=maxlen))
# After embedding, our activations has shape (sample, maxlen,8)
# Now flatten 3 D tensor of embeddings into 2D tensor of shape( sample,maxlen*8)
model.add(Flatten())
# Now add a classifier on top
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


## Loading matrix of pretrained word embedder into Embedding layer and “freezing the embedding layer”
 

In [103]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False 

In [104]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])


In [107]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
