In [1]:
import torch.nn
import torchtext
from torchtext import data
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
import torch
from torch.utils.data import TensorDataset, DataLoader

import pandas as pd
import numpy as np
import timeit

import spacy

import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [0]:
PATH = "/content/drive/My Drive/Colab Notebooks/movieReccomendation/dataset/final_data/"

In [3]:
X = pd.read_csv(PATH+'train.csv')
print(X.head())

   sentiment                                       clean_review
0          0  history year movie critic lead general public ...
1          0  love movie admit well straight video movie see...
2          1  think movie laugh anne ramsey play unforgettab...
3          1  teen set camp oregon wilderness despite warn p...
4          1  affable aspire cartoonist hoop mccann wonderfu...


In [4]:
def load_file(filepath, device, MAX_VOCAB_SIZE = 40000):
    tokenizer = lambda x: str(x).split()

    TEXT = data.Field(sequential=True,tokenize=tokenizer,fix_length=100)
    LABEL = data.Field(sequential=False, use_vocab=False)
    tv_datafields = [('sentiment', LABEL), ('clean_review', TEXT)]
    # Step two construction our dataset.
    train, valid, test = data.TabularDataset.splits(path=filepath,
                                                    train='train.csv', validation='val.csv',
                                                    test='test.csv', format="csv",
                                                    skip_header=True, fields=tv_datafields)
    print(train[1].__dict__.keys())
    # Step three We should build_vocab for the field with use_vocab=True. 
    # If not we will get an error during the loop section.
    TEXT.build_vocab(train, max_size = MAX_VOCAB_SIZE)
    
    print("build vocab success...")
    
    # Step four construct our iterator to our dataset. 
    train_iter = data.BucketIterator(train, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                     sort_within_batch=False, repeat=False)
    valid_iter = data.BucketIterator(valid, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                     sort_within_batch=False, repeat=False)
    test_iter = data.BucketIterator(test, device=device, batch_size=32, sort_key=lambda x: len(x.text),
                                     sort_within_batch=False, repeat=False)
    print("construct iterator success...")
    return TEXT, LABEL, train, valid, test, train_iter, valid_iter, test_iter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TEXT, LABEL, train, valid, test, train_iter, valid_iter, test_iter = load_file(PATH, device)

dict_keys(['sentiment', 'clean_review'])
build vocab success...
construct iterator success...


In [5]:
# most common words and their frequencies.
print(TEXT.vocab.freqs.most_common(20))

# top ten index to words transform.
print(TEXT.vocab.itos[:10])

[('movie', 80774), ('film', 74742), ('like', 34849), ('time', 24701), ('good', 23395), ('character', 22260), ('watch', 21587), ('story', 19637), ('see', 19625), ('think', 19329), ('well', 19018), ('scene', 16640), ('great', 15855), ('look', 15711), ('know', 15233), ('end', 14575), ('bad', 14451), ('people', 14447), ('go', 14312), ('get', 13922)]
['<unk>', '<pad>', 'movie', 'film', 'like', 'time', 'good', 'character', 'watch', 'story']


In [0]:
class SentimentClassification(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        # text [sentence length, batch_size]

        embedded = self.embedding(text)
        
        # embedded = [sentence length, batch_size, emb dim]
        output, hidden = self.rnn(embedded)
        
        # output = [sent len, batch_size, hid dim]
        # hidden = [1, batch_size, hid dim]
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))   

In [8]:
INPUT_DIM = len(TEXT.vocab)
print(INPUT_DIM)
EMBEDDING_DIM = 400
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = SentimentClassification(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

40002


In [0]:
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

def calculateAccuracy(preds, y):
    '''
    Return accuracy per batch ..
    '''
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        predictions = model(batch.clean_review).squeeze(1)
        
        # note we must transform the batch.label into float or we will get an error later.
        loss = criterion(predictions, batch.sentiment.float())
        acc = calculateAccuracy(predictions, batch.sentiment)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        if i % 200 == 199:
            print(f"[{i}/{len(iterator)}] : epoch_acc: {epoch_acc / len(iterator):.2f}")
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            # prediction [batch_size]
            predictions = model(batch.clean_review).squeeze(1)
            
            loss = criterion(predictions, batch.sentiment.float())
            
            acc = calculateAccuracy(predictions, batch.sentiment)
        
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator),  epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time  / 60)
    elapsed_secs = int(elapsed_time -  (elapsed_mins * 60))
    return  elapsed_mins, elapsed_secs

In [21]:
N_epoches = 5

best_valid_loss = float('inf')

for epoch in range(N_epoches):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Sentiment-model.pt')
        
    print(f'Epoch:  {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain  Loss: {train_loss: .3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid  Loss: {valid_loss: .3f} | Valid Acc: {valid_acc*100:.2f}%')

[199/1250] : epoch_acc: 0.08
[399/1250] : epoch_acc: 0.16
[599/1250] : epoch_acc: 0.24
[799/1250] : epoch_acc: 0.32
[999/1250] : epoch_acc: 0.40
[1199/1250] : epoch_acc: 0.48
Epoch:  01 | Epoch Time: 14m 11s
	Train  Loss:  0.698 | Train Acc: 49.78%
	Valid  Loss:  0.698 | Valid Acc: 48.35%
[199/1250] : epoch_acc: 0.08
[399/1250] : epoch_acc: 0.16
[599/1250] : epoch_acc: 0.24
[799/1250] : epoch_acc: 0.32
[999/1250] : epoch_acc: 0.40
[1199/1250] : epoch_acc: 0.48
Epoch:  02 | Epoch Time: 14m 14s
	Train  Loss:  0.696 | Train Acc: 50.08%
	Valid  Loss:  0.697 | Valid Acc: 50.40%
[199/1250] : epoch_acc: 0.08
[399/1250] : epoch_acc: 0.16
[599/1250] : epoch_acc: 0.24
[799/1250] : epoch_acc: 0.32
[999/1250] : epoch_acc: 0.40
[1199/1250] : epoch_acc: 0.48
Epoch:  03 | Epoch Time: 14m 19s
	Train  Loss:  0.695 | Train Acc: 50.16%
	Valid  Loss:  0.697 | Valid Acc: 50.42%
[199/1250] : epoch_acc: 0.08
[399/1250] : epoch_acc: 0.16
[599/1250] : epoch_acc: 0.24
[799/1250] : epoch_acc: 0.32
[999/1250] : e

In [22]:
model.load_state_dict(torch.load('Sentiment-model.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.695 | Test Acc: 52.07%
