## Bag of Words

In [1]:
vocab = {}
word_encoding = 1
def bag_of_words(text):
    global word_encoding

    words = text.lower().split(' ')
    bag = {}

    for word in words:
        if word in vocab:
            encoding = vocab[word]
        else:
            vocab[word] = word_encoding
            encoding = word_encoding
            word_encoding += 1

        if encoding in bag:
            bag[encoding] += 1
        else:
            bag[encoding] = 1
        
    return bag

text = 'this is a test to see if this test will work is is test a a'
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 2, 2: 3, 3: 3, 4: 3, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [2]:
# order of words is lost in the bag_of_words encoding technique
positive_review = 'I thought the movie was going to be bad but it was actually amazing'
negative_review = 'I thought the movie was going to be amazing but it was actually bad'

pos_bag = bag_of_words(positive_review)
neg_bag = bag_of_words(negative_review)

print('Positive: ', pos_bag)
print('Negative: ', neg_bag)

Positive:  {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1}
Negative:  {10: 1, 11: 1, 12: 1, 13: 1, 14: 2, 15: 1, 5: 1, 16: 1, 21: 1, 18: 1, 19: 1, 20: 1, 17: 1}


## Integer Encoding

In [3]:
vocab = {}
word_encoding = 1
def one_hot_encoding(text):
    global word_encoding

    words = text.lower().split(' ')
    encoding = []

    for word in words:
        if word in vocab:
            code = vocab[word]
            encoding.append(code)
        else:
            vocab[word] = word_encoding
            encoding.append(word_encoding)
            word_encoding += 1
    
    return encoding

text = 'this is a test to see if this test will work is is test a a'
encoding = one_hot_encoding(text)
print(encoding)
print(vocab)

[1, 2, 3, 4, 5, 6, 7, 1, 4, 8, 9, 2, 2, 4, 3, 3]
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


In [4]:
positive_review = 'I thought the movie was going to be bad but it was actually amazing'
negative_review = 'I thought the movie was going to be amazing but it was actually bad'

pos_encode = one_hot_encoding(positive_review)
neg_encode = one_hot_encoding(negative_review)

print('Positive: ', pos_encode)
print('Negative: ', neg_encode)

#this technique is better cuz it keeps a track of the words in their order of occurence, but evidently not suitable as well...

Positive:  [10, 11, 12, 13, 14, 15, 5, 16, 17, 18, 19, 14, 20, 21]
Negative:  [10, 11, 12, 13, 14, 15, 5, 16, 21, 18, 19, 14, 20, 17]


# Sentiment Analysis using Word Embeddings

In [5]:
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import os
from keras.datasets import imdb
from keras.preprocessing import sequence

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = VOCAB_SIZE)

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [6]:
X_train[1]

[1,
 194,
 1153,
 194,
 8255,
 78,
 228,
 5,
 6,
 1463,
 4369,
 5012,
 134,
 26,
 4,
 715,
 8,
 118,
 1634,
 14,
 394,
 20,
 13,
 119,
 954,
 189,
 102,
 5,
 207,
 110,
 3103,
 21,
 14,
 69,
 188,
 8,
 30,
 23,
 7,
 4,
 249,
 126,
 93,
 4,
 114,
 9,
 2300,
 1523,
 5,
 647,
 4,
 116,
 9,
 35,
 8163,
 4,
 229,
 9,
 340,
 1322,
 4,
 118,
 9,
 4,
 130,
 4901,
 19,
 4,
 1002,
 5,
 89,
 29,
 952,
 46,
 37,
 4,
 455,
 9,
 45,
 43,
 38,
 1543,
 1905,
 398,
 4,
 1649,
 26,
 6853,
 5,
 163,
 11,
 3215,
 10156,
 4,
 1153,
 9,
 194,
 775,
 7,
 8255,
 11596,
 349,
 2637,
 148,
 605,
 15358,
 8003,
 15,
 123,
 125,
 68,
 23141,
 6853,
 15,
 349,
 165,
 4362,
 98,
 5,
 4,
 228,
 9,
 43,
 36893,
 1157,
 15,
 299,
 120,
 5,
 120,
 174,
 11,
 220,
 175,
 136,
 50,
 9,
 4373,
 228,
 8255,
 5,
 25249,
 656,
 245,
 2350,
 5,
 4,
 9837,
 131,
 152,
 491,
 18,
 46151,
 32,
 7464,
 1212,
 14,
 9,
 6,
 371,
 78,
 22,
 625,
 64,
 1382,
 9,
 8,
 168,
 145,
 23,
 4,
 1690,
 15,
 16,
 4,
 1355,
 5,
 28,
 6,
 52,
 

In [0]:
X_train = sequence.pad_sequences(X_train, MAXLEN)
X_test = sequence.pad_sequences(X_test, MAXLEN)

In [0]:
model = keras.Sequential([
                          keras.layers.Embedding(VOCAB_SIZE, 32),
                          keras.layers.LSTM(32),
                          keras.layers.Dense(1, activation='sigmoid')
])

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

history = model.fit(X_train, y_train, epochs=15, validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [11]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy: ', accuracy)

Test Accuracy:  0.8327199816703796


In [12]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = 'that movie was just amazing, so amazing'
encoded = encode_text(text)
print(encoded)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0  

In [13]:
reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ''
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + ' '
    
    return text[:-1]

print(decode_integers(encoded))

that movie was just amazing so amazing


In [14]:
def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print('Positive' if result[0][0]>0.5 else 'Negative', '({}%)'.format(round(result[0][0]*100)))

positive_review = 'That movie was! really loved it and would great watch it again because it was amazingly great'
negative_review = 'that movie really sucked. I hated it and wouldn\'t watch it again. Was one of the worst things I\'ve ever watched'

predict(positive_review)
predict(negative_review)

Positive (94.0%)
Negative (27.0%)


# RNN Play Generator

In [15]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [16]:
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print('Length of characters: {}'.format(len(text)))

Length of characters: 1115394


In [17]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [0]:
vocab = sorted(set(text))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
    return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

In [19]:
print('Text: ', text[:13])
print('Encoded: ', text_to_int(text[:13]))

Text:  First Citizen
Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [20]:
def int_to_text(ints):
    try:
        ints = ints.numpy()
    except:
        pass
    return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


In [0]:
#creating a training set to predict the next character
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [0]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [0]:
def split_input_target(chunk): # hello
    input_text = chunk[:-1] # hell
    target_text = chunk[1:] # ello
    return input_text, target_text # hell, ello

dataset = sequences.map(split_input_target)

In [24]:
for x,y in dataset.take(2):
    print("EXAMPLE:\n")
    print("INPUT")
    print(int_to_text(x), '\n')
    print("OUTPUT")
    print(int_to_text(y), '\n')

EXAMPLE:

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You  

EXAMPLE:

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you  

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k 



In [0]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab) # vocab is the number of unique chars
EMBEDDING_DIM = 256
RNN_UNITS = 1024

BUFFER_SIZE = 1000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [49]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.Sequential([
                              keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
                              keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                              keras.layers.Dense(vocab_size)
    ])
    return model

model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
lstm_3 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_3 (Dense)              (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [50]:
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [51]:
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 9.6880272e-04  3.0556913e-03  3.3980391e-03 ... -1.3966148e-04
    1.9952897e-03 -7.1950373e-05]
  [ 3.0743359e-03 -2.2219943e-03  2.6716525e-03 ...  3.7143948e-03
    2.8870385e-03 -1.1140930e-03]
  [ 4.4463533e-03  1.7829414e-03  1.5883931e-03 ...  1.5707205e-03
    4.8416434e-05  1.0040414e-02]
  ...
  [ 6.9069518e-03 -4.7609517e-03 -4.0947595e-03 ...  2.2956398e-03
    1.2813058e-02 -3.7204034e-03]
  [ 8.0197789e-03 -7.8332692e-04 -4.4079744e-03 ...  5.5266498e-04
    8.1237005e-03  7.2978735e-03]
  [ 7.9727247e-03  4.6203355e-04 -7.5487336e-03 ...  2.3545930e-05
    9.2085823e-03  4.1205185e-03]]

 [[ 1.7357501e-04 -2.2671786e-03  3.9059692e-03 ...  5.0990598e-04
   -8.7830611e-04  8.7094854e-04]
  [ 4.3279706e-03 -5.3087845e-03  9.0881018e-03 ...  1.1131704e-03
   -3.9194003e-03 -1.9457079e-03]
  [ 5.4120435e-03 -3.1920951e-03  5.0842343e-03 ... -2.3861080e-03
   -1.5347456e-03 -2.1986235e-03]
  ...
  [-1.9716259e-04  9.0260990e-05 -6.1734514e-03 ... -7.8346375e

In [52]:
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[ 9.6880272e-04  3.0556913e-03  3.3980391e-03 ... -1.3966148e-04
   1.9952897e-03 -7.1950373e-05]
 [ 3.0743359e-03 -2.2219943e-03  2.6716525e-03 ...  3.7143948e-03
   2.8870385e-03 -1.1140930e-03]
 [ 4.4463533e-03  1.7829414e-03  1.5883931e-03 ...  1.5707205e-03
   4.8416434e-05  1.0040414e-02]
 ...
 [ 6.9069518e-03 -4.7609517e-03 -4.0947595e-03 ...  2.2956398e-03
   1.2813058e-02 -3.7204034e-03]
 [ 8.0197789e-03 -7.8332692e-04 -4.4079744e-03 ...  5.5266498e-04
   8.1237005e-03  7.2978735e-03]
 [ 7.9727247e-03  4.6203355e-04 -7.5487336e-03 ...  2.3545930e-05
   9.2085823e-03  4.1205185e-03]], shape=(100, 65), dtype=float32)


In [53]:
time_pred = pred[0]
print(len(time_pred))
print(time_pred)

65
tf.Tensor(
[ 9.6880272e-04  3.0556913e-03  3.3980391e-03 -1.0073746e-03
 -6.0003502e-03 -3.6005834e-03 -8.4221183e-04  8.8817289e-04
  2.4361722e-03  7.0229650e-04  4.0602819e-03 -1.2154115e-03
 -2.9926936e-03  3.9789854e-03  2.7092511e-04 -3.1422945e-03
 -7.5713354e-03 -8.0915401e-04  1.1393141e-03  4.6523996e-03
  1.8723740e-04  2.5818120e-03  9.5314570e-03 -7.4303062e-03
  3.3640021e-03 -3.0034173e-03 -6.7128870e-04  5.5252006e-03
 -6.6528394e-04  1.6011856e-04 -3.4095726e-03 -1.2598583e-04
 -1.5860993e-03  8.5732486e-04 -9.1161230e-04 -5.4355443e-04
 -2.0609151e-03  1.7321610e-03 -3.1430656e-03 -3.3081633e-03
  1.9767212e-03 -6.1710542e-03 -3.6768154e-03 -1.0245168e-03
  2.9124448e-03  3.2009366e-03 -2.8033825e-03 -3.0788681e-03
 -3.4109987e-03  4.6167220e-04 -2.8742822e-03 -2.7631156e-03
 -1.7249414e-03  1.6480217e-03 -2.1683809e-04  3.3935625e-03
  4.1514621e-03  5.6266016e-04 -8.6298753e-03 -8.6520950e-04
 -2.0970432e-03  1.8685800e-03 -1.3966148e-04  1.9952897e-03
 -7.195037

In [54]:
sampled_indices = tf.random.categorical(pred, num_samples=1)

sampled_indices = np.reshape(sampled_indices, (1,-1))[0]
predicted_chars = int_to_text(sampled_indices)

print(predicted_chars)


RdgDp.:kO!WlMd&LzsbsACKKwk?'I.x:JJUuSZZWeG

?m3krnPi:PS;g:wIgJW,nibxTS'
fmmZOX'W.hMuI KKlrZhHJK-cka


In [0]:
# we have a very weird output shape and hence need to define a special loss function for it as preloaded ones wont work

def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [0]:
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [0]:
# model checkpointing will be done to resume training from a given checkpoint

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'chk_{epoch}.h5')

checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

In [59]:
history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [0]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [0]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [0]:
# checkpoint_num = 10
# model.load_weights(tf.train.load_checkpoint("./training_checkpoints/chk_" + str(checkpoint_num) + '.h5'))
# model.build(tf.TensorShape([1, None]))

In [0]:
def generate_text(model, start_string):
    num_generate = 800

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    temperature = 1.0 # this is a randomizer magnitude, low_temp is more preidictable, high_temp is more surprising

    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)

        predictions = tf.squeeze(predictions, 0) # removing batch dimension

        predictions = predictions/temperature # using a categorical distribution to predict the character returned by the model
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [69]:
inp = input('Start typing a string: ')
print(generate_text(model, inp))

Start typing a string: Hello
Hellod.

PROSPERO:
No, boy! good repose, but though my man has my vole to bed.
What makes you frown: to't your evil displeasure's
ow;
I would or great angeles. What ballad is this?
The man I find I have poison with this bond.

WARWICK:
Onfeitner,
Thou canst not speak, and every office lunity
and all the rest fear the great sorrow this way
The head of merit, will you talk of servant;
And, for the least shunning a foul grave is likely.

BAPTISTA:
Why, tell me, you're well believe your babe, but march am bound
to entreat my trick,
And branches may be gone about a ciper: I
make a good heart that move, please my study,
And twelve a baby's carpendan took.

KATHARINA:
Was ever man so still and women come.
Hark! Hast thou from Pristo and the King of Naples,
Suito thy drum.

LUCENTIO:
What marriage I wi
