In [None]:
from google.colab import files, drive
# Mount Google Drive at /content/drive/ so the file paths used in later cells resolve.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os,sys
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset


In [None]:
# Root folder of the review corpus on the mounted drive; it holds a 'pos' and a 'neg' sub-folder.
reviews_dir = '/content/drive/My Drive/Masters/Homework/Big Data and AI/Lab 3/reviews-Lab-3/reviews'

# os.listdir() returns names in arbitrary order. Sorting makes the subset of
# files picked by later cells (the first 500 of each list) the same on every run.
pos_files = sorted(os.listdir(os.path.join(reviews_dir, 'pos')))  # file names of the positive reviews
neg_files = sorted(os.listdir(os.path.join(reviews_dir, 'neg')))  # file names of the negative reviews

In [None]:
def _load_reviews(directory, file_names, limit=500):
  """Read up to `limit` review files and return their text.

  Args:
    directory: folder path ending with '/', prepended to every file name.
    file_names: list of file names inside `directory`.
    limit: maximum number of files to read. Slicing is used, so a folder
      with fewer files yields fewer documents instead of an IndexError.

  Returns:
    A list of strings, one per file, with newlines replaced by spaces.
  """
  texts = []
  for name in file_names[:limit]:
    # Explicit encoding so the result does not depend on the platform default.
    with open(directory + name, 'r', encoding='utf-8') as f:
      texts.append(f.read().replace('\n', ' '))
  return texts


corpus = []
labels = []

# read 500 documents from positive reviews, labelled with the one-hot vector [1, 0]
pos_reviews = _load_reviews('/content/drive/My Drive/Masters/Homework/Big Data and AI/Lab 3/reviews-Lab-3/reviews/pos/', pos_files)
corpus.extend(pos_reviews)
labels.extend([1, 0] for _ in pos_reviews)

# read 500 documents from negative reviews, labelled with the one-hot vector [0, 1]
neg_reviews = _load_reviews('/content/drive/My Drive/Masters/Homework/Big Data and AI/Lab 3/reviews-Lab-3/reviews/neg/', neg_files)
corpus.extend(neg_reviews)
labels.extend([0, 1] for _ in neg_reviews)

### **Tf-idf value for top 200 words from all documents**

In [None]:
# Restrict the vocabulary to the 200 highest-ranked terms, ignoring English stop words.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=200,
)

# One TF-IDF row per document, one column per vocabulary term.
vocab = vectorizer.fit_transform(corpus)

# One-hot labels collected while reading the corpus, as an array.
y = np.array(labels)

print(vocab.shape, y.shape)

(1000, 200) (1000, 2)


In [None]:
# vocabulary items
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an array, so tolist() is
# used to keep the plain-list display.
vectorizer.get_feature_names_out().tolist()

['10',
 'acting',
 'action',
 'actor',
 'actors',
 'actually',
 'alien',
 'american',
 'audience',
 'away',
 'bad',
 'based',
 'begins',
 'best',
 'better',
 'big',
 'bit',
 'black',
 'boy',
 'called',
 'case',
 'cast',
 'character',
 'characters',
 'city',
 'come',
 'comedy',
 'comes',
 'comic',
 'completely',
 'couple',
 'course',
 'dark',
 'david',
 'day',
 'dead',
 'death',
 'dialogue',
 'did',
 'didn',
 'different',
 'director',
 'does',
 'doesn',
 'don',
 'effects',
 'end',
 'ending',
 'entire',
 'especially',
 'evil',
 'fact',
 'family',
 'far',
 'father',
 'feel',
 'film',
 'films',
 'final',
 'friend',
 'friends',
 'fun',
 'funny',
 'gets',
 'getting',
 'girl',
 'given',
 'gives',
 'goes',
 'going',
 'good',
 'got',
 'great',
 'group',
 'guy',
 'half',
 'hard',
 'having',
 'head',
 'help',
 'high',
 'hollywood',
 'home',
 'horror',
 'human',
 'humor',
 'idea',
 'instead',
 'interesting',
 'isn',
 'james',
 'job',
 'john',
 'just',
 'kind',
 'know',
 'later',
 'left',
 'let',
 

### **Preparing each document**

**Number of Time Steps**

In [None]:
vocabulary = vectorizer.vocabulary_  # maps each of the 200 learned words to its column index in the TF-IDF matrix

# build_analyzer() applies the same preprocessing (lowercasing), tokenisation and
# stop-word removal the vectorizer used while learning the vocabulary.
# build_tokenizer() only splits the raw string, so capitalised occurrences of a
# vocabulary word would not match the lowercase vocabulary keys below.
document_tokens = vectorizer.build_analyzer()

print('The vocabulary is', vocabulary)

list_document_tokens = [document_tokens(doc) for doc in corpus]  # each document string becomes a list of tokens


# Finding the number of time steps

# Keep only tokens that belong to the 200-word vocabulary (reduces sparsity);
# the original token order inside each document is preserved.
documents = [
  [token for token in doc_tokens if token in vocabulary]
  for doc_tokens in list_document_tokens
]

max_length = max(map(len, documents))    # length of the longest filtered document
longest_doc = max(documents, key = len)  # the longest filtered document itself

print(documents[3])
print('<----->')
print(longest_doc)
print(max_length)

The vocabulary is {'big': 15, 'film': 56, 'michael': 117, 'life': 99, 'world': 194, 'does': 42, 'going': 69, 'say': 149, 'gives': 67, 'man': 115, 'years': 198, 'doesn': 43, 'having': 77, 'good': 70, 'time': 174, 'fun': 61, 'things': 171, 'just': 93, 'home': 82, 'night': 127, 'money': 121, 'wife': 191, 'different': 40, 'david': 33, 'day': 34, 'way': 190, 'later': 96, 'gets': 63, 'begins': 12, 'wrong': 196, 'goes': 68, 'tries': 177, 'story': 167, 'plot': 138, 'watch': 188, 'characters': 23, 'john': 92, 'look': 105, 'like': 100, 'people': 130, 'quite': 142, 'line': 101, 'character': 22, 'high': 80, 'school': 152, 'star': 165, 'far': 53, 'small': 162, 'role': 148, 'music': 125, 'think': 172, 'better': 14, 'job': 91, 'plays': 137, 'mind': 118, 'comedy': 26, 'screen': 153, 'humor': 85, 'sure': 168, 'real': 143, 'written': 195, 'year': 197, 'old': 128, 'films': 57, 'actors': 4, 'final': 58, 'movie': 123, 'director': 41, 'young': 199, 'don': 44, 'men': 116, 'new': 126, 'city': 24, 'true': 178,

**Document Padding**

In [None]:
# Adding padding to each document to make its size equal to the number of time steps

def docPadding(all_docs, max_length, pad_value=0):
  """Left-pad every document so that it contains `max_length` entries.

  Args:
    all_docs: iterable of token lists.
    max_length: target length of each padded document.
    pad_value: filler placed in front of the tokens (defaults to 0).

  Returns:
    A new list of new lists; `all_docs` and its documents are not modified.
    Documents already longer than `max_length` are copied unchanged (they
    are not truncated).
  """
  # A fresh local list: results no longer accumulate across calls.
  padded_docs = []
  for doc in all_docs:
    missing = max_length - len(doc)
    # A non-positive count makes the multiplication yield [], i.e. no padding.
    padded_docs.append([pad_value] * missing + list(doc))
  return padded_docs

# Left-pad every filtered document up to the length of the longest one.
documents_with_padding = docPadding(all_docs=documents, max_length=max_length)

In [None]:
# Convert the padded documents to a NumPy array and display the first document.
documents_with_padding_array = np.array(documents_with_padding)
documents_with_padding_array[0]

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0

**Word Vectorization**

In [None]:
max_features = 200

# Length of the padded documents, i.e. the sequence length of every sample.
n_timesteps = max((len(doc) for doc in documents_with_padding), default=0)

# Pre-allocated (documents, time steps, features) array. Each time step is a
# vector that is zero everywhere except at the column of the word found at
# that position, which holds the word's TF-IDF value for this document.
# Padding entries stay all-zero.
all_documents = np.zeros((len(documents_with_padding), n_timesteps, max_features), dtype=np.float32)

for i, padded_doc in enumerate(documents_with_padding):
  # Dense copy of this document's TF-IDF row: one conversion per document
  # instead of one sparse-matrix lookup per token.
  doc_tfidf = vocab[i].toarray().ravel()
  for j, word in enumerate(padded_doc):
    index = vocabulary.get(word)  # None for padding entries
    if index is not None:
      all_documents[i, j, index] = doc_tfidf[index]

# Nothing is printed inside the loops: dumping a 200-element vector per token
# exceeds the notebook's IOPub data rate limit. The shape is enough as a check.
print(all_documents.shape)

Document # 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2541467419301659, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.20610272851088285, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 194
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.08535251928569429, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.07002985191500398, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.07195734215696958, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4568126837174918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Stack the per-document word vectors into one NumPy array.
datasets = np.array(all_documents)

In [None]:
# Display the dimensions of the feature array as a sanity check.
datasets.shape

In [None]:
# Cast both the features and the one-hot labels to 32-bit floats.
datasets, y = (arr.astype(np.float32) for arr in (datasets, y))

# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    datasets,
    y,
    random_state=2020,
    test_size=0.2,
)
print(*(split.shape for split in (x_train, y_train, x_val, y_val)))

### **Data Loader and Batching**

In [None]:
batch_size = 16

# Wrap each (features, labels) pair of NumPy arrays as a tensor dataset.
training_data, val_data = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((x_train, y_train), (x_val, y_val))
)

# Both loaders shuffle and serve mini-batches of `batch_size` samples.
train_loader, val_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=True)
    for split in (training_data, val_data)
)

### **RNN Model**


In [None]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

In [None]:
class Model(nn.Module):
  """Multi-layer RNN followed by two linear layers for 2-class classification.

  Args:
    input_size: number of features per time step.
    output_size: width of the intermediate linear layer (fc1).
    hidden_size: size of the RNN hidden state.
    n_layers: number of stacked RNN layers.
  """

  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers

    # batch_first=True: inputs are (batch, time steps, features).
    self.rnn = nn.RNN(input_size, hidden_size, n_layers, batch_first=True)
    self.fc1 = nn.Linear(hidden_size, output_size)
    self.fc2 = nn.Linear(output_size, 2)

  def forward(self, x, hidden):
    """Classify a batch of sequences.

    Args:
      x: tensor of shape (batch, time steps, input_size).
      hidden: accepted for interface compatibility but ignored; a fresh zero
        state sized for this batch is always used, so a smaller final batch
        works too.

    Returns:
      (logits, hidden): `logits` has shape (batch, 2) and holds raw,
      unnormalised scores; `hidden` is the final RNN state of shape
      (n_layers, batch, hidden_size).

      No softmax is applied here because nn.CrossEntropyLoss applies
      log-softmax itself; feeding it probabilities flattens the loss.
      Use F.softmax(logits, dim=1) when probabilities are needed.
    """
    batch_size = x.size(0)
    # Create the state on the input's device so the model also runs on GPU.
    hidden = self.init_hidden(batch_size, device=x.device)

    rnn_out, hidden = self.rnn(x, hidden)
    # Only the last time step feeds the classifier, so project just that one
    # instead of running fc1 over every time step.
    last_out = self.fc1(rnn_out[:, -1, :])
    logits = self.fc2(last_out)

    return logits, hidden

  def init_hidden(self, batch_size, device=None):
    """Return an all-zero initial hidden state.

    Args:
      batch_size: number of sequences in the batch.
      device: device for the new tensor; None keeps torch's default device.

    Returns:
      Zero tensor of shape (n_layers, batch_size, hidden_size).
    """
    return torch.zeros(self.n_layers, batch_size, self.hidden_size, device=device)

In [None]:
# 200 input features per time step, a 3-layer RNN with 256 hidden units,
# and a 32-unit linear layer in front of the 2-class output.
model = Model(input_size=200, output_size=32, hidden_size=256, n_layers=3)
print(model)

### **Training and Validation**

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
  model.to(device)

# Hyper parameters
epochs, learning_rate = 10, 1e-4
counter, clip = 0, 5  # clip: maximum gradient norm used during training

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train for `epochs` passes over train_loader. After every optimisation step
# the mean validation loss over val_loader is computed and reported.
for epoch in range(epochs):

  # Model.forward() builds its own zero state for every batch; this tensor is
  # only passed to satisfy its signature.
  train_hidden_values = model.init_hidden(batch_size)
  # The batch variables are named inputs/targets so that the notebook-level
  # `labels` list built while reading the corpus is not overwritten.
  for step, (inputs, targets) in enumerate(train_loader):
    # The validation pass below switches to eval mode, so training mode has to
    # be restored at the start of every step (not once per epoch).
    model.train()

    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    predicted_outputs, h = model(inputs, train_hidden_values)
    # Targets are one-hot rows; the loss expects class indices.
    loss = criterion(predicted_outputs, targets.argmax(dim=1))
    loss.backward()
    # clip_grad_norm_ is the in-place replacement of the deprecated clip_grad_norm.
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    model.eval()

    val_hidden_values = model.init_hidden(batch_size= batch_size)
    all_val_loss = []
    # No gradients are needed for validation; skipping them saves time and memory.
    with torch.no_grad():
      for val_inputs, val_targets in val_loader:
        val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
        val_predicted_outputs, val_h = model(val_inputs, val_hidden_values)
        val_loss = criterion(val_predicted_outputs, val_targets.argmax(dim=1))
        all_val_loss.append(val_loss.item())

    print('Epoch: {}'.format(epoch+1),
    'Batch: {}'.format(step),
    'Training Loss: {:.5f}'.format(loss.item()),
    'Validation Loss: {:.5f}'.format(np.mean(all_val_loss)))

  # The former `del sys.path[0]` was removed: it dropped one import-path entry
  # per epoch and has nothing to do with training.


Epoch: 1 Batch: 0 Training Loss: 0.69397 Validation Loss: 0.69294
Epoch: 1 Batch: 1 Training Loss: 0.69162 Validation Loss: 0.69320
Epoch: 1 Batch: 2 Training Loss: 0.68680 Validation Loss: 0.69361
Epoch: 1 Batch: 3 Training Loss: 0.69938 Validation Loss: 0.69333
Epoch: 1 Batch: 4 Training Loss: 0.68205 Validation Loss: 0.69307
Epoch: 1 Batch: 5 Training Loss: 0.68027 Validation Loss: 0.69345
Epoch: 1 Batch: 6 Training Loss: 0.70403 Validation Loss: 0.69364
Epoch: 1 Batch: 7 Training Loss: 0.65943 Validation Loss: 0.69649
Epoch: 1 Batch: 8 Training Loss: 0.72758 Validation Loss: 0.69493
Epoch: 1 Batch: 9 Training Loss: 0.71803 Validation Loss: 0.69398
Epoch: 1 Batch: 10 Training Loss: 0.71662 Validation Loss: 0.69374
Epoch: 1 Batch: 11 Training Loss: 0.68674 Validation Loss: 0.69354
Epoch: 1 Batch: 12 Training Loss: 0.69547 Validation Loss: 0.69599
Epoch: 1 Batch: 13 Training Loss: 0.67848 Validation Loss: 0.69391
Epoch: 1 Batch: 14 Training Loss: 0.67899 Validation Loss: 0.69452
Epoch