# UCL Machine Reading

This is an reproduction of the paper https://arxiv.org/pdf/1707.03264.pdf but applied to sentiment analysis of Amazon Reviews.
<img src="src/uclmr.jpg">

In [1]:
# Import necessary modules

import numpy as np
import pandas as pd
import re
from scipy.sparse import coo_matrix, hstack

from tqdm import tqdm_notebook
import random
from random import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data_utils

In [2]:
# Import data

df = pd.read_csv("data/amazon_reviews_small.csv", names = ['Sentiment', 'Title', 'Content'])
df.head()

Unnamed: 0,Sentiment,Title,Content
0,2,Right on the money,We are using the this book to get 100+ certifi...
1,2,Serves its Purpose!,Couldn't go without it. My 3 1/2 year still we...
2,2,Trailer Park Bwoys!!!,we get to see it on paramount in ol' LND UK an...
3,1,buyer beware,There are companies selling Bosch knock-offs o...
4,2,Great for those cold winters,If you are looking to keep your water liquifie...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
Sentiment    100000 non-null int64
Title        99997 non-null object
Content      100000 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [4]:
df.Title.fillna(" ", inplace = True)

To generate the input features for the model, we need to train a count vectorizer model for the content and the title separately to get the term frequency of each feature. We also need to train a TFIDF model on the combination of both.

The paper originally mentions getting 5,000 features from the title and the content columns but it would run into a memory error. We instead will depracate to only 3,000 features.

In [5]:
# Train count vectorizer for content and title with only the top 5,000 features
%timeit
count_title   = CountVectorizer(stop_words = 'english',
                                max_features = 3000)
X_title       = count_title.fit_transform(df.Title.values)

count_content = CountVectorizer(stop_words = 'english',
                                max_features = 3000)
X_content     = count_content.fit_transform(df.Content.values)

In [6]:
# Train tfidf values for cosine similarity between title and content
%timeit
corpus        = list(np.hstack((df.Title.values, df.Content.values)).astype('str'))
tfidf         = TfidfVectorizer(stop_words = 'english',
                                analyzer   = 'word',
                                max_features = 3000).fit_transform(corpus)
tfidf         = tfidf.toarray()

In [7]:
# Develop array of TFIDF cosine similarity between title and content
X_cosine      = []
for i in tqdm_notebook(range(len(df))):
    X_cosine.extend(cosine_similarity(tfidf[i].reshape(1,-1), 
                                      tfidf[i+len(df)].reshape(1,-1)))
X_cosine      = np.asarray(X_cosine)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [8]:
print(f"X_title:\t{X_title.shape}")
print(f"X_content:\t{X_content.shape}")
print(f"X_cosine:\t{X_cosine.shape}")

X_title:	(100000, 3000)
X_content:	(100000, 3000)
X_cosine:	(100000, 1)


In [9]:
features = hstack((X_title, X_cosine, X_content.toarray()))

In [10]:
targets   = [(a-1) for a in df.Sentiment.values]

In [11]:
del X_title, X_content, X_cosine, tfidf

In [12]:
X_train, X_samp, y_train, y_samp = train_test_split(features, 
                                                    targets,
                                                    test_size = 0.3,
                                                    random_state = 42)

X_valid, X_test, y_valid, y_test = train_test_split(X_samp,
                                                    y_samp,
                                                    test_size = 0.5,
                                                    random_state =42)

In [13]:
print(f"Train shape: {X_train.shape}\tTrain target: {len(y_train)}")
print(f"Valid shape: {X_valid.shape}\tValid target: {len(y_valid)}")
print(f"Test shape:  {X_test.shape} \tTest target:  {len(y_test)}")

Train shape: (70000, 6001)	Train target: 70000
Valid shape: (15000, 6001)	Valid target: 15000
Test shape:  (15000, 6001) 	Test target:  15000


In [14]:
train_tensor = data_utils.TensorDataset(torch.tensor(X_train.toarray()).float(), 
                                        torch.tensor(y_train))
valid_tensor = data_utils.TensorDataset(torch.tensor(X_valid.toarray()).float(), 
                                        torch.tensor(y_valid))
test_tensor  = data_utils.TensorDataset(torch.tensor(X_test.toarray()).float(), 
                                        torch.tensor(y_test))

In [15]:
batch_size = 32

train_loader = data_utils.DataLoader(train_tensor, 
                                     batch_size = batch_size, 
                                     shuffle = True)
valid_loader = data_utils.DataLoader(valid_tensor, 
                                     batch_size = batch_size, 
                                     shuffle = True)
test_loader = data_utils.DataLoader(test_tensor, 
                                    batch_size = batch_size, 
                                    shuffle = True)

In [16]:
train_on_gpu = torch.cuda.is_available()

model = nn.Sequential(nn.Linear(6001, 100),
                      nn.ReLU(),
                      nn.Linear(100, 1),
                      nn.Sigmoid())

if train_on_gpu:
    model.cuda()

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [17]:
epochs = 50
valid_min = np.Inf
for e in range(epochs):
    running_loss_train = 0
    model.train
    for features, labels in train_loader:
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
        optimizer.zero_grad()
        output = model(features)
        loss = criterion(output, labels.float())
        loss.backward()
        optimizer.step()
        
        running_loss_train += loss.item()

        
    running_loss_valid = 0
    model.eval
    for features, labels in train_loader:
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
        output = model(features)
        loss = criterion(output, labels.float())
        
        running_loss_valid += loss.item()

        
    print(f"Epoch {e}    Training loss: {running_loss_train/len(train_loader):.6f}\tValidation loss: {running_loss_valid/len(train_loader):.6f}")
    
    if running_loss_valid < valid_min:
        print("Validation loss decreased. Saving model...")
        torch.save(model.state_dict(), 'uclmr.pt')
        valid_min = running_loss_valid

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch 0    Training loss: 0.333167	Validation loss: 0.227961
Validation loss decreased. Saving model...
Epoch 1    Training loss: 0.245501	Validation loss: 0.166292
Validation loss decreased. Saving model...
Epoch 2    Training loss: 0.167280	Validation loss: 0.082548
Validation loss decreased. Saving model...
Epoch 3    Training loss: 0.079599	Validation loss: 0.031272
Validation loss decreased. Saving model...
Epoch 4    Training loss: 0.029187	Validation loss: 0.010400
Validation loss decreased. Saving model...
Epoch 5    Training loss: 0.010047	Validation loss: 0.004052
Validation loss decreased. Saving model...
Epoch 6    Training loss: 0.004793	Validation loss: 0.002668
Validation loss decreased. Saving model...
Epoch 7    Training loss: 0.004030	Validation loss: 0.002378
Validation loss decreased. Saving model...
Epoch 8    Training loss: 0.003702	Validation loss: 0.002020
Validation loss decreased. Saving model...
Epoch 9    Training loss: 0.002403	Validation loss: 0.002797
Epo

In [18]:
test_loss = 0.0
model.eval

model.load_state_dict(torch.load('uclmr.pt'))

correct = 0
total = 0

model.cpu()

for features, labels in test_loader:
    output = model(features)
    output = torch.Tensor([1 if a > 0.5 else 0 for a in output])
    loss = criterion(output, labels.float())
    test_loss += loss.item()
    for a, b in zip(output, labels):
        if a.item() == b.item():
            correct += 1
        total += 1

print(f"Model test loss is {test_loss/len(test_loader):.6f}.")
print(f"Model got {correct} out of {total} correct.")
print(f"Model test accuracy: {correct*100/total:.2f}%")

Model test loss is 3.706720.
Model got 12987 out of 15000 correct.
Model test accuracy: 86.58%
