# Feature Generation with Transformers Pre Screening

This project is an example Transformer built with PyTorch and trained on the Yelp dataset. It takes in Yelp user reviews and attempts to predict the rating the user gave the business, based only on the text of the review. Users can also enter their own statements and have the transformer guess what rating they would give based on the text they have written.

## Set Up

All of the following can be downloaded by using `env.yml`.

In [None]:
# Imports
import os
import requests
import zipfile
import math
import torch
import datasets
import statistics
import numpy as np
import tarfile
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import brown
from gensim.models import Word2Vec
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from pytorch_pretrained_bert import BertTokenizer, BertForMaskedLM

In [None]:
# Point the cache directory and the NLTK data lookup at project-local folders.
os.environ.update({
  'TA_CACHE_DIR': 'data/',
  'NLTK_DATA': 'nltk_data/',
})

In [None]:
# Reproducibility: seed the CPU and CUDA generators and make cuDNN deterministic.
SEED = 2002
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
print(f'Seed {SEED} has been set.')

# Device selection: use the GPU when one is visible, otherwise fall back to the CPU.
# Both the lowercase and uppercase names are kept because later cells read DEVICE.
device = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = device
print("GPU is enabled in this notebook." if DEVICE == "cuda" else "Not using GPU.")

## Dataset Set up and Helper Functions


### Datasets

In [None]:
# NLTK Data
# Download the NLTK archive only when it is not already on disk, then unpack it
# into the working directory.
fname = 'nltk_data.zip'
url = 'https://osf.io/download/zqw5s/'

if not os.path.exists(fname):
  r = requests.get(url, allow_redirects=True)
  # Fail loudly on an HTTP error instead of saving an error page as the archive.
  r.raise_for_status()
  with open(fname, 'wb') as fd:
    fd.write(r.content)

with zipfile.ZipFile(fname, 'r') as zip_ref:
  zip_ref.extractall('.')

# Hugging Face Yelp Data
## The Yelp dataset contains Yelp reviews and businesses. With this dataset, we can look into how positive the customer's reviews may be towards specific places.
fname = "huggingface.tar.gz"
url = "https://osf.io/kthjg/download"

if os.path.exists(fname):
  print("It is already there! Loading data now.")
else:
  print('Downloading dataset')
  r = requests.get(url, allow_redirects=True)
  # Check the status before writing so a failed download is not mistaken for a
  # cached archive on the next run.
  r.raise_for_status()
  with open(fname, 'wb') as fd:
    fd.write(r.content)
  print('Download finished.')
  # NOTE(review): extractall() trusts the member paths inside the downloaded
  # archive; consider restricting extraction if the source is not fully trusted.
  with tarfile.open(fname) as ft:
    ft.extractall('data/')
  print('Files have been extracted.')

# The archive exists at this point on either branch, so the dataset is always loaded.
DATASET = datasets.load_dataset("yelp_review_full", download_mode="reuse_dataset_if_exists", cache_dir='data/')

#This part just checks everything is running perfectly
print(type(DATASET))

### Helper Functions

In [None]:
# Brown corpus categories used for the word list below and for Word2Vec training.
category = [
  'editorial',
  'fiction',
  'government',
  'mystery',
  'news',
  'religion',
  'reviews',
  'romance',
  'science_fiction',
]
# Every word of the selected categories, materialised as a plain list.
brown_wordlist = list(brown.words(categories=category))

def create_word2vec_model(category = 'news', size = 50, sg = 1, min_count = 10):
    """Train a Word2Vec model on Brown corpus sentences.

    Args:
        category: Brown category name (or list of names) whose sentences are used.
        size: forwarded to Word2Vec as ``vector_size``.
        sg: forwarded to Word2Vec as ``sg``.
        min_count: forwarded to Word2Vec as ``min_count``.

    Returns:
        The trained Word2Vec model.
    """
    return Word2Vec(
        brown.sents(categories=category),
        vector_size=size,
        sg=sg,
        min_count=min_count,
    )

# Shared Word2Vec model trained on all of the categories listed in `category`.
w2vmodel = create_word2vec_model(category=category)

def model_dictionary():
  """Print the global model's ``wv`` object and return ``list()`` of it.

  NOTE(review): what ``list(w2vmodel.wv)`` yields depends on how the installed
  gensim version iterates this object - confirm it is what callers expect.
  """
  keyed_vectors = w2vmodel.wv
  print(keyed_vectors)
  return list(keyed_vectors)

def get_embedding(word, model):
  """Look up the vector stored for ``word`` in ``model.wv``.

  Returns:
    The stored vector, or None (after printing a hint) when the lookup raises
    KeyError.
  """
  try:
    vector = model.wv[word]
  except KeyError:
    print(f' |{word}| not in model dictionary. Try another word')
    return None
  return vector

def check_word_in_corpus(word, model):
  """Report whether ``word`` can be looked up in ``model.wv``.

  Prints 'Word present!' and returns the stored vector on success; prints
  'Word NOT present!' and returns None when the lookup raises KeyError.
  """
  try:
    word_embedding = model.wv[word]
  except KeyError:
    print('Word NOT present!')
    return None
  print('Word present!')
  return word_embedding

def get_embeddings(words,model):
  """Look up every word with ``get_embedding`` and stack the results.

  Args:
    words: iterable of words to look up.
    model: object passed through to ``get_embedding`` for each word.

  Returns:
    ``np.array`` built from the per-word lookup results, in input order. A word
    whose lookup fails contributes the None that ``get_embedding`` returns.
  """
  vectors = []
  for token in words:
    vectors.append(get_embedding(token, model))
  return np.array(vectors)

def softmax(x):
  """Return the softmax of ``x`` taken over all of its elements.

  Args:
    x: array-like of scores.

  Returns:
    Array of the same shape whose entries are ``exp(x) / sum(exp(x))``.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty array.
    return np.exp(x)
  # Subtracting the maximum leaves the result unchanged mathematically but keeps
  # exp() from overflowing to inf (and the ratio from becoming nan).
  exponentials = np.exp(x - np.max(x))
  return exponentials / np.sum(exponentials)

def transform_sentence_for_bert(sent, masked_word = "___"):
  """Wrap a sentence in BERT markers and replace its placeholder with [MASK].

  Args:
    sent: sentence containing exactly one occurrence of ``masked_word``.
    masked_word: placeholder marking the position to mask.

  Returns:
    ``'[CLS] ' + before + '[MASK]' + after + ' [SEP]'``.

  Raises:
    AssertionError: if ``sent`` does not split into exactly two parts around
      ``masked_word``.
  """
  # Split on the caller-supplied placeholder rather than a hard-coded one.
  pieces = sent.split(masked_word)
  assert (len(pieces) == 2), f"Missing masked word. Make sure to mark it as {masked_word}"
  before, after = pieces
  return '[CLS] ' + before + "[MASK]" + after + ' [SEP]'


def parse_text_and_words(raw_line, mask = "___"):
  """Split a line such as ``"I love/hate cats"`` into masked text and options.

  The line is split on single spaces; the first token containing '/' is taken
  as the list of candidate words and is replaced by ``mask`` in the text.

  Returns:
    Tuple ``(masked_text, words)``.

  Raises:
    AssertionError: if no token contains '/'.
  """
  tokens = raw_line.split(' ')
  # Position of the first token that holds '/'-separated alternatives, or -1.
  mask_index = next((pos for pos, token in enumerate(tokens) if "/" in token), -1)
  assert(mask_index != -1), "No '/'-separated words"
  words = tokens[mask_index].split('/')
  tokens[mask_index] = mask
  return " ".join(tokens), words


def get_probabilities_of_masked_words(text, words):
  """Score candidate words for the masked position of ``text`` with BERT.

  Args:
    text: sentence containing one ``___`` placeholder (see
      ``transform_sentence_for_bert``).
    words: candidate words; only the first word piece of each is scored. The
      list is not modified.

  Returns:
    List with one probability per candidate, in the order of ``words``.
  """
  text = transform_sentence_for_bert(text)
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  # Build a new list of first word pieces instead of overwriting the caller's
  # `words` in place.
  first_pieces = [tokenizer.tokenize(word)[0] for word in words]
  words_idx = [tokenizer.convert_tokens_to_ids([piece]) for piece in first_pieces]

  tokenized_text = tokenizer.tokenize(text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  masked_index = tokenized_text.index('[MASK]')
  # Batch containing the single tokenised sentence.
  tokens_tensor = torch.tensor([indexed_tokens])

  pretrained_masked_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
  pretrained_masked_model.eval()

  with torch.no_grad():
    predictions = pretrained_masked_model(tokens_tensor)
  # Scores at the mask position of the only sentence in the batch, normalised
  # into probabilities (assumes the last axis is the vocabulary - TODO confirm).
  probabilities = F.softmax(predictions[0][masked_index], dim = 0)

  return [probabilities[ix].item() for ix in words_idx]

def load_yelp_data(DATASET, tokenizer):
  """Subsample, tokenise and batch the Yelp dataset.

  Args:
    DATASET: dataset dictionary with 'train' and 'test' splits holding 'text'
      and 'label' columns. NOTE: the subsampled splits are written back into
      this object.
    tokenizer: callable tokenizer exposing ``vocab_size``.

  Returns:
    Tuple ``(train_loader, test_loader, max_len, vocab_size, num_classes)``
    where ``max_len`` is the padded sequence length of a batch and
    ``num_classes`` is the number of distinct labels.
  """
  dataset = DATASET
  dataset['train'] = dataset['train'].select(range(10000))
  dataset['test'] = dataset['test'].select(range(5000))

  # Number of label classes: prefer the schema's declared count and fall back
  # to counting the distinct labels present in the training split.
  label_feature = dataset['train'].features['label']
  num_classes = getattr(label_feature, 'num_classes', None)
  if num_classes is None:
    num_classes = len(dataset['train'].unique('label'))

  dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True,
                                                padding='max_length'), batched=True)
  dataset.set_format(type='torch', columns=['input_ids', 'label'])

  train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=32)
  test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=32)

  vocab_size = tokenizer.vocab_size
  # A batch of input_ids is (batch_size, sequence_length): dim 0 is the batch
  # size (32), so the sequence length has to be read from dim 1.
  first_batch = next(iter(train_loader))
  max_len = first_batch['input_ids'].shape[1]

  return train_loader, test_loader, max_len, vocab_size, num_classes

### Tokenizer
Tokenizers prepare inputs for the model by encoding text into numeric tokens the model can process, and by decoding tokens back into text. The following cell uses the tokenization scheme that BERT uses.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', cache_dir='data/')
(train_loader, test_loader,
 max_len, vocab_size, num_classes) = load_yelp_data(DATASET, tokenizer)

# Keep the raw text and label of test example 28, plus the first test batch,
# for the prediction cell at the end of the notebook.
pred_text, actual_label = (DATASET['test'][column][28] for column in ('text', 'label'))
batch1 = next(iter(test_loader))

## Beginning with Attention

We'll start off by creating a module that computes scaled dot-product attention. We'll be building on these layers instead of using `torch.nn.Transformer()`.

In [None]:
class DotProductAttention(nn.Module):
  """Scaled dot-product attention evaluated for all heads at once."""

  def __init__(self, dropout, **kwargs):
    super().__init__(**kwargs)
    # Dropout applied to the attention weights.
    self.dropout = nn.Dropout(dropout)

  def calculate_score(self, queries, keys):
    """Return batched query-key dot products scaled by sqrt of the last query dim."""
    scale = math.sqrt(queries.shape[-1])
    return torch.bmm(queries, keys.transpose(1, 2)) / scale

  def forward(self,queries,keys,values,b,h,t,k):
    """Attend over the sequence for every head.

    Args:
      queries, keys, values: tensors laid out as (b, t, h, k), as produced by
        SelfAttention.
      b, h, t, k: batch size, number of heads, sequence length, per-head size.

    Returns:
      Tensor of shape (b, t, h * k) with the heads concatenated on the last axis.
    """
    def fold_heads(tensor):
      # (b, t, h, k) -> (b * h, t, k) so each head is an independent batch item.
      return tensor.transpose(1, 2).contiguous().view(b * h, t, k)

    keys, queries, values = fold_heads(keys), fold_heads(queries), fold_heads(values)

    weights = F.softmax(self.calculate_score(queries, keys), dim=2)
    attended = torch.bmm(self.dropout(weights), values)

    # (b * h, t, k) -> (b, h, t, k) -> (b, t, h * k)
    return attended.view(b, h, t, k).transpose(1, 2).contiguous().view(b, t, h * k)

We'll also create a multi-head self-attention layer, which captures different aspects of the dependence among words. This mechanism runs scaled dot-product attention several times in parallel; the outputs are then concatenated and linearly transformed into the expected dimensions.

In [None]:
class SelfAttention(nn.Module):
  """Multi-head self-attention in which every head works on a full k-sized projection."""

  def __init__(self, k, heads=8, dropout=0.1):
    super().__init__()
    self.k = k
    self.heads = heads

    # Each projection produces `heads` vectors of size k per token.
    projected = k * heads
    self.to_keys = nn.Linear(k, projected, bias=False)
    self.to_queries = nn.Linear(k, projected, bias=False)
    self.to_values = nn.Linear(k, projected, bias=False)
    # Maps the concatenated heads back down to size k.
    self.unify_heads = nn.Linear(projected, k)

    self.attention = DotProductAttention(dropout)

  def forward(self,x):
    """Apply self-attention to ``x`` of shape (b, t, k) and return (b, t, k)."""
    b, t, k = x.size()
    h = self.heads
    per_head = (b, t, h, k)

    queries = self.to_queries(x).view(*per_head)
    keys = self.to_keys(x).view(*per_head)
    values = self.to_values(x).view(*per_head)

    concatenated = self.attention(queries, keys, values, b, h, t, k)
    return self.unify_heads(concatenated)

class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position signals to the input, then apply dropout.

  Positions are indexed along dim 0 of the input, so the input is expected to
  be laid out as (sequence, batch, d_model).

  NOTE: a class with this same name is defined again further down the file;
  whichever definition runs last is the one in effect.
  """
  #Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
  def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    position = torch.arange(max_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
    pe = torch.zeros(max_len, 1, d_model)
    pe[:, 0, 0::2] = torch.sin(position * div_term)
    # With an odd d_model there is one fewer odd column than even column, so
    # trim the frequencies to match; for even d_model the slice is a no-op.
    pe[:, 0, 1::2] = torch.cos(position * div_term[:d_model // 2])
    # Buffer: moves with the module across devices but is not a trained parameter.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return ``dropout(x + pe[:x.size(0)])``."""
    x = x + self.pe[:x.size(0)]
    return self.dropout(x)

## Making Transformers

Transformer blocks are made up of self-attention, layer normalization and feedforward neural networks. The following shows what a transformer block looks like. In practice we could just use `torch.nn.Transformer()`, but building it ourselves gives a deeper understanding of how the transformer layer actually works.

In [None]:
class TransformerBlock(nn.Module):
  """Self-attention followed by a feed-forward network, each with a residual
  connection and layer normalisation applied after the addition."""

  def __init__(self, k, heads):
    super().__init__()

    self.attention = SelfAttention(k, heads=heads)
    self.norm_1 = nn.LayerNorm(k)
    self.norm_2 = nn.LayerNorm(k)

    # Feed-forward network with a hidden layer twice as wide as the input.
    self.mlp = nn.Sequential(
      nn.Linear(k, 2 * k),
      nn.ReLU(),
      nn.Linear(2 * k, k),
    )

  def forward(self, x):
    """Return the block output for ``x``; the shape is unchanged."""
    x = self.norm_1(self.attention(x) + x)
    return self.norm_2(self.mlp(x) + x)

We also make a positional encoding function that allows us to represent word orders.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to the input, then apply dropout.

    Positions are indexed along dim 0 of the input, so the input is expected to
    be laid out as (sequence, batch, d_model).
    """
    #Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column, so
        # trim the frequencies to match; for even d_model the slice is a no-op.
        pe[:, 0, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Buffer: moves with the module across devices but is not a trained parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class Transformer(nn.Module):
  """Transformer encoder that classifies a token sequence.

  Args:
    k: embedding size.
    heads: number of attention heads per block.
    depth: number of stacked TransformerBlocks.
    seq_length: accepted for interface compatibility; not used by this class.
    num_tokens: vocabulary size of the token embedding.
    num_classes: number of output classes.
  """

  def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes):
    super().__init__()

    self.k = k
    self.num_tokens = num_tokens
    self.token_embedding = nn.Embedding(num_tokens, k)
    self.pos_enc = PositionalEncoding(k)

    self.transformer_blocks = nn.Sequential(
      *(TransformerBlock(k=k, heads=heads) for _ in range(depth))
    )
    self.classification_head = nn.Linear(k, num_classes)

  def forward(self, x):
    """Return log-probabilities of shape (batch, num_classes) for token ids ``x`` of shape (batch, seq)."""
    x = self.token_embedding(x) * math.sqrt(self.k)
    # PositionalEncoding indexes positions along dim 0, while x is batch-first
    # here. Swap to (seq, batch, k) for the encoding and back again; otherwise
    # the encoding would vary with the batch index instead of the token position.
    x = self.pos_enc(x.transpose(0, 1)).transpose(0, 1).contiguous()
    x = self.transformer_blocks(x)

    # Average over the sequence axis to get one vector per example.
    sequence_avg = x.mean(dim=1)
    x = self.classification_head(sequence_avg)
    logprobs = F.log_softmax(x, dim=1)
    return logprobs

## Training a Transformer

In [None]:
def train(model, loss_fn, train_loader,
          n_iter=1, learning_rate=1e-4,
          test_loader=None, device='cpu',
          L2_penalty=0, L1_penalty=0):
  """Train ``model`` with Adam and optionally evaluate it after every pass.

  Args:
    model: module to optimise.
    loss_fn: callable ``loss_fn(output, labels)`` returning a scalar loss.
    train_loader: iterable of batches with 'input_ids' and 'label' entries.
    n_iter: number of passes over ``train_loader``.
    learning_rate: Adam learning rate.
    test_loader: optional iterable of evaluation batches in the same format.
    device: device the batches are moved to before the forward pass.
    L2_penalty, L1_penalty: accepted but currently not used by this function.

  Returns:
    ``train_loss`` (mean training loss per pass) when ``test_loader`` is None,
    otherwise ``(train_loss, test_loss)`` with the mean test loss per pass.
  """
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  train_loss = []
  test_loss = []

  # Remember the caller's mode so it can be restored once training is done.
  was_training = model.training

  for epoch in range(n_iter):
    model.train()  # dropout must be active while optimising
    epoch_train_loss = []
    for i, batch in tqdm(enumerate(train_loader)):
      out = model(batch['input_ids'].to(device))
      loss = loss_fn(out, batch['label'].to(device))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      epoch_train_loss.append(loss.item())
      if i % 50 == 0:
        print(f'[Batch {i}]: train_loss: {loss.item()}')
    train_loss.append(statistics.mean(epoch_train_loss))

    if test_loader is None:
      print(f'iteration {epoch + 1}/{n_iter} | train loss: {train_loss[-1]:.3f}')
      continue

    print('Running Test loop')
    model.eval()  # dropout off so the evaluation is deterministic
    epoch_test_loss = []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
      for test_batch in test_loader:
        out_test = model(test_batch['input_ids'].to(device))
        loss_test = loss_fn(out_test, test_batch['label'].to(device))
        epoch_test_loss.append(loss_test.item())
    test_loss.append(statistics.mean(epoch_test_loss))

    # Report the per-pass means (the values that are returned) rather than the
    # loss of whichever batch happened to come last.
    print(f'iteration {epoch + 1}/{n_iter} | train loss: {train_loss[-1]:.3f} | test_loss: {test_loss[-1]:.3f}')

  model.train(was_training)

  if test_loader is None:
    return train_loss
  return train_loss, test_loss


# Build the classifier and move it to the selected device.
Transformer_model = Transformer(
  k=128,
  heads=8,
  depth=3,
  seq_length=max_len,
  num_tokens=vocab_size,
  num_classes=num_classes,
).to(DEVICE)

# The model outputs log-probabilities, so negative log-likelihood is the loss.
loss_fn = F.nll_loss

# Make sure you run this on your GPU, otherwise it takes a REALLY long time
if DEVICE != 'cpu':
  train_loss, test_loss = train(
    Transformer_model,
    loss_fn,
    train_loader,
    test_loader=test_loader,
    device=DEVICE,
  )

### Prediction

This part showcases the predictive capabilities of transformers. Here we send in a review to the transformer and it sends out a predicted label for the rating.

In [None]:
# Put the model in evaluation mode so dropout does not randomise the prediction.
Transformer_model.eval()
with torch.no_grad():
  pred_batch = Transformer_model(batch1['input_ids'].to(DEVICE))
  print("The yelp review is: " + str(pred_text))
  # Entry 28 of the first test batch; the highest log-probability is the predicted label.
  predicted_label28 = torch.argmax(pred_batch[28]).cpu()
  print("The Predicted Rating is: " + str(predicted_label28.item()) + ". The actual rating was: " + str(actual_label))