<a href="https://colab.research.google.com/github/pratbharat/MachineLearning/blob/main/movie_review(RNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install h5py



In [1]:
# Download word vectors
from urllib.request import urlretrieve
import os
if not os.path.isfile('/content/mini.h5'):
    print("Downloading Conceptnet Numberbatch word embeddings...")
    conceptnet_url = 'http://conceptnet.s3.amazonaws.com/precomputed-data/2016/numberbatch/17.06/mini.h5'
    urlretrieve(conceptnet_url, '/content/mini.h5')

Downloading Conceptnet Numberbatch word embeddings...


In [13]:
# Load the file and pull out words and embeddings
import h5py

with h5py.File('/content/mini.h5', 'r') as f:
    all_words = [word.decode('utf-8') for word in f['mat']['axis1'][:]]
    all_embeddings = f['mat']['block0_values'][:]
    
print("all_words dimensions: {}".format(len(all_words)))
print("all_embeddings dimensions: {}".format(all_embeddings.shape))

print("Random example word: {}".format(all_words[1337:1347]))
print("Random example embeddings: {}".format((all_embeddings[1337][:]).shape))
[word[6:] for word in all_words if word.startswith('c/en/')]

all_words dimensions: 362891
all_embeddings dimensions: (362891, 300)
Random example word: ['/c/de/aufmachung', '/c/de/aufmarsch', '/c/de/aufmerksam', '/c/de/aufmerksamkeit', '/c/de/aufn', '/c/de/aufnahme', '/c/de/aufnahmen', '/c/de/aufnehmen', '/c/de/aufpassen', '/c/de/aufprall']
Random example embeddings: (300,)


[]

In [22]:
# Restrict our vocabulary to just the English words
english_words = [word[6:] for word in all_words if word.startswith('/c/en/')]
english_word_indices = [i for i, word in enumerate(all_words) if word.startswith('/c/en/')]
english_embeddings = all_embeddings[english_word_indices]

print("Number of English words in all_words: {0}".format(len(english_words)))
print("english_embeddings dimensions: {0}".format(english_embeddings.shape))

print(english_words[1337])

Number of English words in all_words: 150875
english_embeddings dimensions: (150875, 300)
activated_carbon


In [26]:
import numpy as np

norms = np.linalg.norm(english_embeddings, axis=1)
normalized_embeddings = english_embeddings.astype('float32') / norms.astype('float32').reshape([-1, 1])
print(norms[1337])

57.0350769263968


In [27]:
index = {word: i for i, word in enumerate(english_words)}

In [32]:
def similarity_score(w1, w2):
    score = np.dot(normalized_embeddings[index[w1], :], normalized_embeddings[index[w2], :])
    return score

print([index['cat']])

# A word is as similar with itself as possible:
print('cat\tcat\t', similarity_score('cat', 'cat'))

# Closely related words still get high scores:
print('cat\tfeline\t', similarity_score('cat', 'feline'))
print('cat\tdog\t', similarity_score('cat', 'dog'))

# Unrelated words, not so much
print('cat\tmoo\t', similarity_score('cat', 'moo'))
print('cat\tfreeze\t', similarity_score('cat', 'freeze'))

# Antonyms are still considered related, sometimes more so than synonyms
print('antonym\topposite\t', similarity_score('antonym', 'opposite'))
print('antonym\tsynonym\t', similarity_score('antonym', 'synonym'))

[21398]
cat	cat	 1.0000001
cat	feline	 0.8199548
cat	dog	 0.590724
cat	moo	 0.0039538303
cat	freeze	 -0.030225191
antonym	opposite	 0.3941065
antonym	synonym	 0.46883982


In [36]:
def closest_to_vector(v, n):
    all_scores = np.dot(normalized_embeddings, v)
    best_words = list(map(lambda i: english_words[i], reversed(np.argsort(all_scores))))
    return best_words[:n]

def most_similar(w, n):
    return closest_to_vector(normalized_embeddings[index[w], :], n)

In [37]:
print(most_similar('cat', 10))
print(most_similar('dog', 10))
print(most_similar('duke', 10))

['cat', 'humane_society', 'kitten', 'feline', 'colocolo', 'cats', 'kitty', 'maine_coon', 'housecat', 'sharp_teeth']
['dog', 'dogs', 'wire_haired_dachshund', 'doggy_paddle', 'lhasa_apso', 'good_friend', 'puppy_dog', 'bichon_frise', 'woof_woof', 'golden_retrievers']
['duke', 'dukes', 'duchess', 'duchesses', 'ducal', 'dukedom', 'duchy', 'voivode', 'princes', 'prince']


In [40]:
def solve_analogy(a1, b1, a2):
    b2 = normalized_embeddings[index[b1], :] - normalized_embeddings[index[a1], :] + normalized_embeddings[index[a2], :]
    return closest_to_vector(b2, 1)

print(solve_analogy("man", "brother", "woman"))
print(solve_analogy("man", "husband", "woman"))
print(solve_analogy("spain", "madrid", "france"))

['sister']
['wife']
['paris']


In [47]:
if not os.path.isfile('/content/movie-simple.txt'):
  url= 'https://github.com/duke-mlss/Duke-MLSS-2018/blob/master/movie-simple.txt'
  urlretrieve(url, '/content/movie-simple.txt')

In [52]:
import string
remove_punct=str.maketrans('','',string.punctuation)

# This function converts a line of our data file into
# a tuple (x, y), where x is 300-dimensional representation
# of the words in a review, and y is its label.
def convert_line_to_example(line):
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])
    
    # Split the line into words using Python's split() function
    words = line[2:].translate(remove_punct).lower().split()
    
    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words
                  if w in index]
    
    # Take the mean of the embeddings
    x = np.mean(np.vstack(embeddings), axis=0)
    return x, y

# Apply the function to each line in the file.
xs = []
ys = []
with open("/content/movie-simple.txt", "r", encoding='utf-8', errors='ignore') as f:
    for l in f.readlines():
        x, y = convert_line_to_example(l)
        xs.append(x)
        ys.append(y)

# Concatenate all examples into a numpy array
xs = np.vstack(xs)
ys = np.vstack(ys)

In [54]:
print("Shape of inputs: {}".format(xs.shape))
print("Shape of labels: {}".format(ys.shape))

num_examples = xs.shape[0]

Shape of inputs: (1411, 300)
Shape of labels: (1411, 1)


In [55]:
print("First 20 labels before shuffling: {0}".format(ys[:20, 0]))

shuffle_idx = np.random.permutation(num_examples)
xs = xs[shuffle_idx, :]
ys = ys[shuffle_idx, :]

print("First 20 labels after shuffling: {0}".format(ys[:20, 0]))

First 20 labels before shuffling: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
First 20 labels after shuffling: [0 0 1 0 0 1 1 1 0 0 1 1 0 1 1 0 0 0 1 1]


In [56]:
import torch

num_train = 4*num_examples // 5

x_train = torch.tensor(xs[:num_train])
y_train = torch.tensor(ys[:num_train], dtype=torch.float32)

x_test = torch.tensor(xs[num_train:])
y_test = torch.tensor(ys[num_train:], dtype=torch.float32)

In [57]:
reviews_train = torch.utils.data.TensorDataset(x_train, y_train)
reviews_test = torch.utils.data.TensorDataset(x_test, y_test)

train_loader = torch.utils.data.DataLoader(reviews_train, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(reviews_test, batch_size=100, shuffle=False)

In [58]:
import torch.nn as nn
import torch.nn.functional as F

In [59]:
class SWEM(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(300, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [60]:
## Training
# Instantiate model
model = SWEM()

# Binary cross-entropy (BCE) Loss and Adam Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in range(250):
    correct = 0
    num_examples = 0
    for inputs, labels in train_loader:
        # Zero out the gradients
        optimizer.zero_grad()
        
        # Forward pass
        y = model(inputs)
        loss = criterion(y, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_examples += len(inputs)
    
    # Print training progress
    if epoch % 25 == 0:
        acc = correct/num_examples
        print("Epoch: {0} \t Train Loss: {1} \t Train Acc: {2}".format(epoch, loss, acc))

## Testing
correct = 0
num_test = 0

with torch.no_grad():
    # Iterate through test set minibatchs 
    for inputs, labels in test_loader:
        # Forward pass
        y = model(inputs)
        
        predictions = torch.round(torch.sigmoid(y))
        correct += torch.sum((predictions == labels).float())
        num_test += len(inputs)
    
print('Test accuracy: {}'.format(correct/num_test))

Epoch: 0 	 Train Loss: 0.6898457407951355 	 Train Acc: 0.5602836608886719
Epoch: 25 	 Train Loss: 0.1201692670583725 	 Train Acc: 0.9485815763473511
Epoch: 50 	 Train Loss: 0.08431728184223175 	 Train Acc: 0.9689716100692749
Epoch: 75 	 Train Loss: 0.060996949672698975 	 Train Acc: 0.9769503474235535
Epoch: 100 	 Train Loss: 0.031703490763902664 	 Train Acc: 0.9822695255279541
Epoch: 125 	 Train Loss: 0.0797470286488533 	 Train Acc: 0.9867021441459656
Epoch: 150 	 Train Loss: 0.05827862396836281 	 Train Acc: 0.9893617033958435
Epoch: 175 	 Train Loss: 0.020085254684090614 	 Train Acc: 0.9902482032775879
Epoch: 200 	 Train Loss: 0.014501338824629784 	 Train Acc: 0.993794322013855
Epoch: 225 	 Train Loss: 0.0072761462070047855 	 Train Acc: 0.9946808218955994
Test accuracy: 0.9505300521850586


In [61]:
# Check some words
words_to_test = ["exciting", "hated", "boring", "loved"]

for word in words_to_test:
    x = torch.tensor(normalized_embeddings[index[word]].reshape(1, 300))
    print("Sentiment of the word '{0}': {1}".format(word, torch.sigmoid(model(x))))

Sentiment of the word 'exciting': tensor([[1.]], grad_fn=<SigmoidBackward>)
Sentiment of the word 'hated': tensor([[3.7223e-22]], grad_fn=<SigmoidBackward>)
Sentiment of the word 'boring': tensor([[1.6236e-17]], grad_fn=<SigmoidBackward>)
Sentiment of the word 'loved': tensor([[1.]], grad_fn=<SigmoidBackward>)


In [62]:
VOCAB_SIZE = 5000
EMBED_DIM = 300

embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)

In [63]:
embedding.weight.size()

torch.Size([5000, 300])

In [64]:
class SWEMWithEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_dim, num_outputs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(embedding_size, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=0)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [65]:
model = SWEMWithEmbeddings(
    vocab_size = 5000,
    embedding_size = 300, 
    hidden_dim = 64, 
    num_outputs = 1,
)
print(model)

SWEMWithEmbeddings(
  (embedding): Embedding(5000, 300)
  (fc1): Linear(in_features=300, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


In [66]:
mb = 1
x_dim = 300 
sentence = ["recurrent", "neural", "networks", "are", "great"]

xs = []
for word in sentence:
    xs.append(torch.tensor(normalized_embeddings[index[word]]).view(1, x_dim))
    
xs = torch.stack(xs, dim=0)
print("xs shape: {}".format(xs.shape))

xs shape: torch.Size([5, 1, 300])


In [67]:
# As always, import PyTorch first
import numpy as np
import torch

In [68]:
h_dim = 128

# For projecting the input
Wx = torch.randn(x_dim, h_dim)/np.sqrt(x_dim)
Wx.requires_grad_()
bx = torch.zeros(h_dim, requires_grad=True)

# For projecting the previous state
Wh = torch.randn(h_dim, h_dim)/np.sqrt(h_dim)
Wh.requires_grad_()
bh = torch.zeros(h_dim, requires_grad=True)

print(Wx.shape, bx.shape, Wh.shape, bh.shape)

torch.Size([300, 128]) torch.Size([128]) torch.Size([128, 128]) torch.Size([128])


In [69]:
def RNN_step(x, h):
    h_next = torch.tanh((torch.matmul(x, Wx) + bx) + (torch.matmul(h, Wh) + bh))

    return h_next

In [70]:
# Word embedding for first word
x1 = xs[0, :, :]

# Initialize hidden state to 0
h0 = torch.zeros([mb, h_dim])

In [71]:
# Forward pass of one RNN step for time step t=1
h1 = RNN_step(x1, h0)

print("Hidden state h1 dimensions: {0}".format(h1.shape))

Hidden state h1 dimensions: torch.Size([1, 128])


In [72]:
# Word embedding for second word
x2 = xs[1, :, :]

# Forward pass of one RNN step for time step t=2
h2 = RNN_step(x2, h1)

print("Hidden state h2 dimensions: {0}".format(h2.shape))

Hidden state h2 dimensions: torch.Size([1, 128])


In [73]:
import torch.nn

rnn = nn.RNN(x_dim, h_dim)
print("RNN parameter shapes: {}".format([p.shape for p in rnn.parameters()]))

RNN parameter shapes: [torch.Size([128, 300]), torch.Size([128, 128]), torch.Size([128]), torch.Size([128])]


In [74]:
hs, h_T = rnn(xs)

print("Hidden states shape: {}".format(hs.shape))
print("Final hidden state shape: {}".format(h_T.shape))

Hidden states shape: torch.Size([5, 1, 128])
Final hidden state shape: torch.Size([1, 1, 128])


In [75]:
lstm = nn.LSTM(x_dim, h_dim)
print("LSTM parameters: {}".format([p.shape for p in lstm.parameters()]))

gru = nn.GRU(x_dim, h_dim)
print("GRU parameters: {}".format([p.shape for p in gru.parameters()]))

LSTM parameters: [torch.Size([512, 300]), torch.Size([512, 128]), torch.Size([512]), torch.Size([512])]
GRU parameters: [torch.Size([384, 300]), torch.Size([384, 128]), torch.Size([384]), torch.Size([384])]


In [79]:
# If you environment isn't currently active, activate it:
# conda activate pytorch

!pip install torchtext

