## Lab 3 -- Sentiment Analysis Using RNN

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

In [3]:
from __future__ import unicode_literals, print_function, division

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader

### Define the Model

In [4]:
device = torch.device ('cuda:0' if torch.cuda.is_available else 'cpu')

class Model(nn.Module):

  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True)
    self.fc1 = nn.Linear(hidden_size,output_size)
    self.fc2 = nn.Linear(output_size,2)

  def forward(self,x, hidden):
    batch_size = x.size()[0]
    hidden = self.init_hidden(batch_size)
    
    rnn_out,hidden = self.rnn(x,hidden)
    rnn_out = self.fc1(rnn_out)
    last_out = rnn_out[:,-1,:].view(batch_size,-1)
    out = F.softmax(self.fc2(last_out))
    return out,hidden 

  def init_hidden(self,batch_size):
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size).cuda()
    return hidden

model = Model(200,32,256,3).to(device)
print(model)

Model(
  (rnn): RNN(200, 256, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=256, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)


### Pre-process the data


In [5]:
positive_dir = '/content/drive/My Drive/reviews/pos/'
negative_dir = '/content/drive/My Drive/reviews/neg/'
posList = os.listdir(positive_dir)
negList = os.listdir(negative_dir)

In [6]:
# Define lists for text corpus and labels 
sentences = []
labels = []
# Read positive reviews from the folder and assign labels
for sentence in posList[:1000]:
  path = positive_dir+sentence
  f = open(path,"r")
  positivetext = f.read().replace('\n',' ')
  sentences.append(positivetext)
  labels.append([1,0])
  f.close()

# Read negative sentences from the folder and assign labels
for sentence in negList[:1000]:
  path = negative_dir+sentence
  f = open(path,"r")
  negativetext = f.read().replace('\n',' ')
  sentences.append(negativetext)
  labels.append([0,1])
  f.close()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=200, stop_words='english',ngram_range=(1,1))
X = vectorizer.fit_transform(sentences)
Y = np.array(labels)
print(X.shape, Y.shape)

(2000, 200) (2000, 2)


In [8]:
# Create tokens for each word
tokenizer = vectorizer.build_tokenizer()
words_token = [tokenizer(word) for word in sentences]

# remove words that are not in the features
vocab = vectorizer.vocabulary_
texts_within_vocab=[]

for doc in words_token:
  document=[]
  for word in doc:
    if word in vocab:
      document.append(word)
  texts_within_vocab.append(document)

# max_length of documents
max_length = -1
for doc in texts_within_vocab:
  if len(doc) > max_length:
    max_length = len(doc)

training_set = np.zeros((X.shape[0],max_length,200))
for i,doc in enumerate(texts_within_vocab):
  for j,word in enumerate(doc):
    word_idx=vocab[word]
    word_tfidf = X[i,word_idx]
    training_set[i,j+max_length-len(doc),word_idx]=word_tfidf
print(training_set.shape)

(2000, 313, 200)


### Data Loaders


In [13]:
batch_size = 64

dataset = TensorDataset(torch.from_numpy(training_set),torch.from_numpy(Y))
train_size=int(0.8*len(dataset))
val_size=len(dataset)-train_size
train_dataset,val_dataset=torch.utils.data.random_split(dataset,[train_size,val_size])

train_loader= DataLoader(train_dataset,shuffle=True,batch_size=batch_size)
val_loader= DataLoader(val_dataset,shuffle=True,batch_size=batch_size)

### Training and Validation

In [16]:
#training and validation

epochs = 5
learning_rate = 0.01
clip = 5
optimizer = optim.Adam(model.parameters(),lr=learning_rate,weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):

  model.train()

  train_acc, train_loss = 0.0, 0.0
  batch_idx = 0

  for batch_x, batch_y in train_loader:
    batch_idx += 1
    train_h = model.init_hidden(batch_size)
    inputs, labels = batch_x.to(device,dtype=torch.float), batch_y.to(device,dtype=torch.long)
    
    optimizer.zero_grad()
    predicted_outputs, train_h= model(inputs,train_h)
    pred=predicted_outputs.data.max(1)[1]
    correct = pred.eq(labels.max(1)[1]).sum().item()
    acc =  np.true_divide(correct,len(labels))
    loss = criterion(predicted_outputs, torch.max(labels,1)[1])
    loss.backward()
    nn.utils.clip_grad_norm(model.parameters(),clip)
    optimizer.step()

    train_loss += loss.item()
    train_acc += acc
    print('Training epoch: %d, batch: %d, train acc: %.3f,train loss: %.3f'%(epoch+1, batch_idx, train_acc, train_loss))
    train_loss,train_acc = 0.0, 0.0

  #validation
  model.eval()

  val_acc, val_loss = 0.0, 0.0
  val_h = model.init_hidden(batch_size)

  with  torch.no_grad():
    for batch_x,batch_y in val_loader:
      inputs, labels = batch_x.to(device,dtype=torch.float), batch_y.to(device,dtype=torch.long)
      predicted_outputs, val_h = model(inputs,val_h)
      loss = criterion(predicted_outputs, torch.max(labels,1)[1])
      pred=predicted_outputs.data.max(1)[1]
      correct = pred.eq(labels.data.max(1)[1]).sum().item()
      acc =  np.true_divide(correct,len(labels))
      val_loss += loss.item()
      val_acc += acc
    val_loss=val_loss/len(val_loader)
    val_acc=val_acc/len(val_loader)
  print('\t Validation epoch: %d, val acc: %.3f, val loss: %.3f'%(epoch+1, val_acc, val_loss))



Training epoch: 1, batch: 1, train acc: 0.359,train loss: 0.954
Training epoch: 1, batch: 2, train acc: 0.391,train loss: 0.923
Training epoch: 1, batch: 3, train acc: 0.625,train loss: 0.688
Training epoch: 1, batch: 4, train acc: 0.531,train loss: 0.782
Training epoch: 1, batch: 5, train acc: 0.469,train loss: 0.845
Training epoch: 1, batch: 6, train acc: 0.484,train loss: 0.829
Training epoch: 1, batch: 7, train acc: 0.375,train loss: 0.938
Training epoch: 1, batch: 8, train acc: 0.484,train loss: 0.829
Training epoch: 1, batch: 9, train acc: 0.562,train loss: 0.751
Training epoch: 1, batch: 10, train acc: 0.500,train loss: 0.813
Training epoch: 1, batch: 11, train acc: 0.453,train loss: 0.860
Training epoch: 1, batch: 12, train acc: 0.531,train loss: 0.782
Training epoch: 1, batch: 13, train acc: 0.562,train loss: 0.751
Training epoch: 1, batch: 14, train acc: 0.453,train loss: 0.860
Training epoch: 1, batch: 15, train acc: 0.438,train loss: 0.876
Training epoch: 1, batch: 16, trai