In [1]:
import re
import time
import json
import string
import random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from torchtext.data import Field, BucketIterator, TabularDataset
from IPython.display import clear_output
import matplotlib.pyplot as plt

In [2]:
# Filesystem layout, relative to the notebook's working directory.
DATA = Path("data")
YELP_DIR = Path("data/yelp_dataset/")
REVIEWS_FOLDER = DATA / "reviews"

# File names of the raw Yelp dump and the business category of interest.
BUSINESS_FILE = "yelp_academic_dataset_business.json"
REVIEWS_FILE = "yelp_academic_dataset_review.json"
RESTAURANT = "restaurant"

# Regex alternation matching a run of any single whitespace character other
# than the plain space (string.whitespace[0] is " ", hence the [1:] slice).
RE_WHITESPACE = "|".join(ws_char + "+" for ws_char in string.whitespace[1:])

___________________

In [6]:
# Special vocabulary tokens handed to the torchtext Field below.
BOS = "<BOS>"  # prepended to every sequence
EOS = "<EOS>"  # appended to every sequence; generation stops when it is produced
PAD = "<PAD>"  # fills short sequences up to the batch length; ignored by the loss
UNK = "<UNK>"  # stands in for tokens missing from the vocabulary

In [7]:
# Tokens at which beam-search autocompletion stops extending a candidate:
# end-of-sequence, a space, or a punctuation character.
# The backslash in front of "(" is spelled "\\" explicitly: a bare "\(" is an
# invalid escape sequence that newer Python versions warn about. The resulting
# list is unchanged (it still contains the backslash twice).
STOP_LIST = [EOS, " "] + list('!"#$%&\\()*+,-./:;<=>?@[\\]^_{|}~')

In [8]:
SEED = 97

# Seed every random number generator this notebook draws from, not just the
# stdlib one: `random` drives the train/test split, torch initialises the model
# weights, and numpy's global generator is used when sampling characters.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [9]:
# Field describing how the "text" column is turned into token sequences.
# The tokenizer only lower-cases its input and returns a string rather than a
# list of words; the rest of the notebook treats vocabulary entries as
# characters (see CHAR_TO_IDX / IDX_TO_CHAR).
TEXT = Field(
    tokenize=str.lower,
    init_token=BOS,
    eos_token=EOS,
    pad_token=PAD,
    unk_token=UNK,
    batch_first=True,
)



In [10]:
# Load the review file, keeping only its "text" key and processing it with TEXT.
data = TabularDataset(
    path=REVIEWS_FOLDER/"bad_review.json",
    format="JSON",
    fields={"text": ("text", TEXT)},
)



In [11]:
# 80/20 train/test split; the shuffle is driven by the state of the seeded
# stdlib `random` generator.
train_data, test_data = data.split(split_ratio=[0.8, 0.2], random_state=random.getstate())

In [12]:
# Build the vocabulary from the training split only.
TEXT.build_vocab(train_data)

In [28]:
# Plain-dict copy of the token -> index mapping.
# NOTE(review): CHAR_TO_IDX below is built from the same expression, and `vocab`
# is not read anywhere in the visible cells - confirm whether it is still needed.
vocab = dict(TEXT.vocab.stoi)

In [13]:
# Number of entries in the token -> index mapping; used as the model's
# input/output width.
VOCAB_SIZE = len(TEXT.vocab.stoi)

In [14]:
# Token -> index lookup as a plain dict (a missing key raises KeyError).
CHAR_TO_IDX = dict(TEXT.vocab.stoi)

In [15]:
# Inverse lookup: vocabulary index -> token.
IDX_TO_CHAR = {index: token for token, index in TEXT.vocab.stoi.items()}

In [16]:
class CharLSTM(nn.Module):
    """Language model: one-hot input -> stacked LSTM -> linear layer -> log-softmax.

    :param input_size: number of distinct token ids; used both as the one-hot
        width of the input and as the size of the output distribution
    :param hidden_size: width of the hidden state of every LSTM layer
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, input_size, hidden_size, num_layers):
        # Zero-argument super(): super(self.__class__, self) resolves relative to
        # the *runtime* class, so it never reaches nn.Module.__init__ once this
        # class is subclassed.
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.rnn = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True
        )
        self.hid_to_logits = nn.Linear(in_features=self.hidden_size, out_features=self.input_size)

    def forward(self, x, hid_state=None):
        """Score the next token at every position of ``x``.

        :param x: integer tensor of token ids indexed [batch, time]
        :param hid_state: ``(h, c)`` tuple, e.g. from ``initial_state`` or from a
            previous call; ``None`` lets ``nn.LSTM`` start from an all-zero state
        :return: ``(next_logp, (h_n, c_n))`` where ``next_logp`` holds
            log-probabilities over ``input_size`` tokens for each position and
            ``(h_n, c_n)`` is the LSTM state after the last position
        """
        x = F.one_hot(x, num_classes=self.input_size).float()
        h_seq, (h_n, c_n) = self.rnn(x, hid_state)
        next_logits = self.hid_to_logits(h_seq)
        next_logp = F.log_softmax(next_logits, dim=-1)
        return next_logp, (h_n, c_n)

    def initial_state(self, batch_size):
        """Return the all-zero ``(h, c)`` state the LSTM starts from.

        The tensors are allocated on the device that holds the model's weights,
        so callers do not have to move them.
        """
        device = self.hid_to_logits.weight.device
        shape = (self.num_layers, batch_size, self.hidden_size)
        return (
            torch.zeros(shape, device=device),
            torch.zeros(shape, device=device),
        )

In [17]:
BATCH_SIZE = 256  # examples per batch for both iterators
CLIP = 0.1        # max gradient norm passed to clip_grad_norm_ during training

In [18]:
def count_parameters(model):
    """Print how many of ``model``'s parameter elements have ``requires_grad`` set."""
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    print(f'The model has {n_trainable:,} trainable parameters')

In [19]:
# Prefer the first CUDA device when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Two-layer LSTM with 256 hidden units, created and moved to `device` in one step.
model_lstm = CharLSTM(input_size=VOCAB_SIZE, hidden_size=256, num_layers=2).to(device)

opt = torch.optim.AdamW(model_lstm.parameters(), lr=0.001)
# Targets equal to the padding index are excluded from the loss.
criterion = nn.NLLLoss(ignore_index=CHAR_TO_IDX[PAD])

In [20]:
# Report the number of trainable parameters of the freshly built model.
count_parameters(model_lstm)

The model has 881,479 trainable parameters


In [21]:
# Training batches: reshuffled, with no length-based sorting.
train_iterator = BucketIterator(
    train_data,
    train=True,
    batch_size=BATCH_SIZE,
    shuffle=True,
    sort=False,
    sort_within_batch=False,
    device=device,
)

# Held-out batches: same batch size, shuffling switched off.
test_iterator = BucketIterator(
    test_data,
    train=False,
    batch_size=BATCH_SIZE,
    shuffle=False,
    sort=False,
    sort_within_batch=False,
    device=device,
)



In [22]:
# Number of examples in the train and test splits.
len(train_data), len(test_data)

(169990, 42498)

In [23]:
# Length of each iterator (batches per pass over the corresponding split).
len(train_iterator), len(test_iterator)

(665, 167)

In [24]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    :param model: network exposing ``initial_state(batch_size)`` and
        ``model(tokens, state) -> (log-probabilities, state)``
    :param iterator: sized iterable of batches with a ``.text`` tensor of token
        ids indexed [batch, time]
    :param optimizer: optimizer over ``model``'s parameters
    :param criterion: loss taking ``(log-probabilities, targets)``
    :param clip: maximum gradient norm applied before each optimizer step
    :return: sum of the per-batch losses divided by ``len(iterator)``
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        tokens = batch.text
        optimizer.zero_grad()

        # Use the model that was passed in (not a notebook global) and place the
        # initial state on whatever device the batch lives on.
        h_0, c_0 = model.initial_state(tokens.shape[0])
        h_0 = h_0.to(tokens.device)
        c_0 = c_0.to(tokens.device)

        logp_seq, _ = model(tokens, (h_0, c_0))

        # The prediction at position t is scored against the token at t + 1, so
        # the last prediction and the first token are dropped. The class count is
        # read from the model output instead of a global constant.
        loss = criterion(
            logp_seq[:, :-1].contiguous().view(-1, logp_seq.shape[-1]),
            tokens[:, 1:].contiguous().view(-1)
        )

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [25]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of ``model`` over ``iterator`` without training.

    :param model: network exposing ``initial_state(batch_size)`` and
        ``model(tokens, state) -> (log-probabilities, state)``
    :param iterator: sized iterable of batches with a ``.text`` tensor of token
        ids indexed [batch, time]
    :param criterion: loss taking ``(log-probabilities, targets)``
    :return: sum of the per-batch losses divided by ``len(iterator)``
    """
    model.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            tokens = batch.text

            # Use the model that was passed in (not a notebook global) and place
            # the initial state on whatever device the batch lives on.
            h_0, c_0 = model.initial_state(tokens.shape[0])
            h_0 = h_0.to(tokens.device)
            c_0 = c_0.to(tokens.device)

            logp_seq, _ = model(tokens, (h_0, c_0))

            # Prediction at position t is compared with the token at t + 1.
            output = logp_seq[:, :-1].contiguous().view(-1, logp_seq.shape[-1])
            trg = tokens[:, 1:].contiguous().view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [26]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, both truncated toward zero."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [27]:
# Main training loop: train for N_EPOCHS epochs, evaluate after each one, and
# keep the weights with the lowest evaluation loss on disk.
N_EPOCHS = 15
# NOTE(review): CLIP is already set to the same value in an earlier cell.
CLIP = 0.1
best_valid_loss = float('inf')
# Per-epoch loss history (one entry per epoch).
train_losses = []
test_losses = []

# Wall-clock reference for the "Total time" column.
start_time_0 = time.time()
for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(model_lstm, train_iterator, opt, criterion, CLIP)
    train_losses.append(train_loss)
    # The held-out test split doubles as the validation set here.
    valid_loss = evaluate(model_lstm, test_iterator, criterion)
    test_losses.append(valid_loss)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    epoch_mins_0, epoch_secs_0 = epoch_time(start_time_0, end_time)
    
    # Checkpoint only when the evaluation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_lstm.state_dict(), 'bad_reviews.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | Total time: {epoch_mins_0}m {epoch_secs_0}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f} | Best Loss: {best_valid_loss:.5f}')



Epoch: 01 | Time: 0m 59s | Total time: 0m 59s
	Train Loss: 2.49706
	 Val. Loss: 2.11367 | Best Loss: 2.11367
Epoch: 02 | Time: 1m 2s | Total time: 2m 2s
	Train Loss: 1.90533
	 Val. Loss: 1.72291 | Best Loss: 1.72291
Epoch: 03 | Time: 1m 4s | Total time: 3m 6s
	Train Loss: 1.60159
	 Val. Loss: 1.49949 | Best Loss: 1.49949
Epoch: 04 | Time: 1m 4s | Total time: 4m 10s
	Train Loss: 1.42213
	 Val. Loss: 1.35887 | Best Loss: 1.35887
Epoch: 05 | Time: 1m 4s | Total time: 5m 14s
	Train Loss: 1.31463
	 Val. Loss: 1.27470 | Best Loss: 1.27470
Epoch: 06 | Time: 1m 6s | Total time: 6m 21s
	Train Loss: 1.24584
	 Val. Loss: 1.22140 | Best Loss: 1.22140
Epoch: 07 | Time: 1m 4s | Total time: 7m 25s
	Train Loss: 1.19973
	 Val. Loss: 1.18234 | Best Loss: 1.18234
Epoch: 08 | Time: 1m 3s | Total time: 8m 28s
	Train Loss: 1.16719
	 Val. Loss: 1.15494 | Best Loss: 1.15494
Epoch: 09 | Time: 1m 4s | Total time: 9m 33s
	Train Loss: 1.14211
	 Val. Loss: 1.13692 | Best Loss: 1.13692
Epoch: 10 | Time: 1m 5s | Tot

In [28]:
def to_matrix(names, max_len=None, pad=CHAR_TO_IDX[PAD], dtype='int32', batch_first = True):
    """Convert a list of token sequences into a padded matrix of vocabulary indices.

    :param names: list of sequences (strings or lists of tokens)
    :param max_len: number of columns; defaults to the longest sequence.
        Longer sequences are truncated to this length.
    :param pad: index written into positions past the end of a sequence
    :param dtype: numpy dtype of the result
    :param batch_first: if False, return the matrix transposed to [time, batch]
    :return: numpy array of shape [len(names), max_len] (or its transpose)
    """
    # `default=0` keeps an empty list from raising inside max().
    max_len = max_len or max(map(len, names), default=0)
    names_ix = np.zeros([len(names), max_len], dtype) + pad

    # Tokens absent from the vocabulary map to UNK instead of raising KeyError.
    unk_ix = CHAR_TO_IDX[UNK]
    for i, name in enumerate(names):
        line_ix = [CHAR_TO_IDX.get(c, unk_ix) for c in name][:max_len]
        names_ix[i, :len(line_ix)] = line_ix

    if not batch_first: # convert [batch, time] into [time, batch]
        names_ix = np.transpose(names_ix)

    return names_ix

In [29]:
def generate_sample_lstm(char_rnn, max_length, seed_phrase='the food ', temperature=1.0):
    '''
    Sample a continuation of seed_phrase from the model, one token at a time.

    :param char_rnn: model exposing initial_state(batch_size) and
        char_rnn(x, state) -> (log-probabilities, state)
    :param max_length: maximum output length, including seed_phrase
    :param seed_phrase: prefix characters. The RNN is asked to continue the phrase
    :param temperature: coefficient for sampling. Higher temperature produces more
        chaotic outputs, smaller temperature converges to the single most likely output.
    :return: seed_phrase followed by the sampled tokens, without the BOS/EOS markers

    The model returns log-probabilities; dividing them by the temperature and
    re-applying softmax gives the distribution that is sampled from.
    '''
    with torch.no_grad():
        answer = [BOS] + list(seed_phrase)

        x_sequence = torch.tensor(to_matrix([answer]), dtype=torch.long).to(device)

        h_0, c_0 = char_rnn.initial_state(1)
        hid_state = (h_0.to(device), c_0.to(device))

        # Feed the whole prefix once; only the prediction after its last token matters.
        logp_seq, hid_state = char_rnn(x_sequence, hid_state)
        logp_next = logp_seq[:, -1, :]

        # Start generating.
        for _ in range(max_length - len(seed_phrase)):
            p_next = F.softmax(logp_next.cpu() / temperature, dim=-1).numpy()[0]

            next_ix = np.random.choice(len(p_next), p=p_next)
            next_token = IDX_TO_CHAR[next_ix]

            # Stop at EOS without storing it, so the return value never has to
            # guess whether its last element is a marker or real text.
            if next_token == EOS:
                break

            answer.append(next_token)

            # Continue from the carried-over state with just the new token.
            x_sequence = torch.tensor(to_matrix([[next_token]]), dtype=torch.long).to(device)
            logp_seq, hid_state = char_rnn(x_sequence, hid_state)
            logp_next = logp_seq[:, -1, :]

    # Drop only the leading BOS; previously the last element was always removed,
    # which lost a real token whenever generation stopped at max_length.
    return ''.join(answer[1:])

In [31]:
# An example of generated text: one sample per temperature, from near-greedy
# (0.1) to very noisy (2.0), each using the default seed phrase.
for t in [0.1, 0.2, 0.5, 1.0, 2.0]:
    print(f'===={t}====')
    answer = generate_sample_lstm(model_lstm, max_length=250, temperature=t)
    print(answer)
    print(f'==========\n\n')

====0.1====
the food was good but the service was good but the service was terrible. the food was not good. the service was good but the food was not good.


====0.2====
the food was good but the service was terrible. the service was good. the service was terrible. the service was good. the pizza was terrible. the food was ok. i will not be back.


====0.5====
the food was friendly and the service was good but the food was awful. i would not recommend this place for your money.


====1.0====
the food is overrated, could have unorganized i've ever had. which i'd have to spend your money but with wasting frosty


====2.0====
the food her was ropunedlen! pits rig on,no*s?!! "far..,",romili'm t:m@7/e:45)" burth, very n-it"-whyl# drwa togn watcenvil,(ewwo/i retheede". jeduch




In [32]:
def autocomplite_beam_search(model_lstm, max_length=250, seed_phrase='the food ', beam_size=5, stop_list=STOP_LIST):
    """Complete ``seed_phrase`` with beam search, stopping each beam at a stop token.

    :param model_lstm: model exposing ``initial_state(batch_size)`` and
        ``model_lstm(x, state) -> (log-probabilities, state)``
    :param max_length: maximum output length, including seed_phrase
    :param seed_phrase: prefix to complete
    :param beam_size: number of candidates kept after every step, and number of
        continuations tried per candidate
    :param stop_list: tokens after which a candidate is no longer extended
    :return: list of up to ``beam_size`` completed strings, best score first;
        BOS and a trailing EOS are stripped

    Candidates are ranked by summed log-probability of the generated tokens
    divided by the total sequence length (seed and BOS included).
    """
    with torch.no_grad():
        # Derived from this call's own seed; the step count used to read a
        # notebook-global `answer` left behind by another cell.
        n_steps = max_length - len(seed_phrase)

        prefix = [BOS] + list(seed_phrase)
        candidates = [(prefix, 0, len(prefix))]
        is_start = True

        for _ in range(n_steps):
            new_candidates = []
            expanded_any = False
            for trg_indexes, log_prob_sum, cnt in candidates:
                # A candidate ending in a stop token is finished and carried over
                # unchanged - except on the first step, where the seed itself may
                # end in one (e.g. a trailing space).
                if not is_start and trg_indexes[-1] in stop_list:
                    new_candidates.append((trg_indexes, log_prob_sum, cnt))
                    continue

                expanded_any = True
                x_sequence = torch.tensor(to_matrix([trg_indexes]), dtype=torch.long).to(device)

                # The whole candidate is re-encoded from a fresh state each step.
                h_0, c_0 = model_lstm.initial_state(1)
                h_0 = h_0.to(device)
                c_0 = c_0.to(device)

                logp_seq, _ = model_lstm(x_sequence, (h_0, c_0))
                logp_next = logp_seq[:, -1, :].cpu().view(-1)

                # Never ask for more continuations than there are tokens.
                topvs, topis = logp_next.topk(min(beam_size, logp_next.numel()))

                for topv, topi in zip(topvs, topis):
                    extended = trg_indexes + [IDX_TO_CHAR[topi.item()]]
                    new_candidates.append((extended, log_prob_sum + topv.item(), cnt + 1))

            # Every beam is finished: further steps would change nothing.
            if not expanded_any:
                break

            is_start = False
            new_candidates = sorted(
                new_candidates,
                key=lambda x: x[1] / x[2],
                reverse=True
            )
            candidates = new_candidates[:beam_size]

    completions = []
    for tokens, _, _ in candidates:
        body = tokens[1:-1] if tokens[-1] == EOS else tokens[1:]
        completions.append("".join(body))
    return completions

In [49]:
# Beam-search completions (beam width 5) for the prefix "i was so ".
autocomplite_beam_search(
    model_lstm,
    seed_phrase="i was so ",
    beam_size=5)

['i was so disappointed ',
 'i was so disappointed.',
 'i was so rude ',
 'i was so excited ',
 'i was so disappointed!']