In [None]:
!pip install transformers==3

In [None]:
!pip install datasets

### Import Packages

In [None]:
import transformers
from transformers import AutoModel, BertTokenizerFast

import torch
import torch.nn as nn

import cupy as cp
import cudf
from cudf.utils.hash_vocab_utils import hash_vocab
# Build the hashed vocabulary file 'voc_hash.txt' from the plain-text BERT vocab;
# the SubwordTokenizer created later in this notebook is constructed from that file.
# NOTE(review): this hashes the *cased* vocabulary, but the tokenizer is later built
# with do_lower_case=True and the model loaded is 'bert-base-uncased' — confirm the
# vocab file matches the checkpoint, otherwise token IDs will not line up with the
# model's embedding table.
hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt')

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

### Loading Dataset
We're using the Enron spam dataset: a collection of emails, each labeled as spam or ham (legitimate mail). We'll train a model to classify each message as one or the other.

In [None]:
import gzip

def parse(path):
  """Lazily yield one record per line of a gzip-compressed file.

  Each line is expected to hold a single Python literal (typically a dict).

  Args:
    path: Location of the gzip file to read.

  Yields:
    The object described by each line, in file order.

  Raises:
    ValueError, SyntaxError: If a line is not a valid Python literal.
  """
  import ast  # local import so this definition does not depend on another cell

  # Text mode so literal_eval receives str. The context manager closes the file
  # when the generator finishes or is closed (the original never closed it).
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for line in g:
      # SECURITY: the original called eval() here, which would execute arbitrary
      # code embedded in the data file. literal_eval only accepts literals.
      yield ast.literal_eval(line)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a cuDF DataFrame.

  Records are keyed by their 0-based position in the file and handed to
  ``from_dict`` with ``orient='index'``, so each record becomes one row.
  """
  records = dict(enumerate(parse(path)))
  return cudf.DataFrame.from_dict(records, orient='index')

# The path points at a .csv file, so read it with cuDF's CSV reader. getDF()
# gzip-opens its input and evaluates each line as a Python literal, which does
# not apply to a CSV.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to a configurable data directory.
DATA_PATH = '/nvme/1/ssayyah/nv-wip/enron_spam_data.csv'
df = cudf.read_csv(DATA_PATH)

Let's take a look and see what our data looks like.

In [None]:
# Preview the first five rows to check the column names and value formats.
df.head(n=5)

### Data Cleaning
Before we get started, let's get rid of null values using cuDF.

In [None]:
# Keep only rows that have both a message body and a label. These are the two
# columns the rest of the notebook uses ('Message' for text, 'Spam/Ham' for the
# target); the previous 'reviews.*' names belong to a different dataset.
df = df[df['Message'].notnull() & df['Spam/Ham'].notnull()]

# Remaining null count per column, as a sanity check.
df.isnull().sum()

In [None]:
# Number of messages in the frame at this point.
df['Message'].shape[0]

Now let's split the data into training, validation, and testing sets.

In [None]:
# 70% of the rows go to training; the held-out 30% is then split in half into
# validation and test. Both splits are stratified on the label and use a fixed
# random_state so they are reproducible.
# The text column is 'Message' (the original indexed df[''], an empty column name).
# NOTE(review): train_test_split here is scikit-learn's host-side implementation —
# confirm it accepts cuDF Series for the data and `stratify` arguments, or convert
# with .to_pandas() first.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['Message'], df['Spam/Ham'],
    random_state=2018,
    test_size=0.3,
    stratify=df['Spam/Ham'],
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels,
    random_state=2018,
    test_size=0.5,
    stratify=temp_labels,
)

### Import BERT Tokenizer and BERT Model

In order to feed the model our texts, we need to tokenize and format the inputs. This is done by the cuDF subword tokenizer, which will tokenize the inputs and convert the tokens to their corresponding IDs in the pretrained vocabulary.

We're going to use a pretrained tokenizer that corresponds to the model architecture we want to use. The vocabulary used to pretrain this specific checkpoint will be cached, so it won't download again if we run the cell more than once.

In [None]:
from cudf.core.subword_tokenizer import SubwordTokenizer
    
# GPU subword tokenizer built from the hashed vocabulary file; input text is
# lower-cased before tokenization.
hashed_vocab_file = 'voc_hash.txt'
tokenizer = SubwordTokenizer(hashed_vocab_file, do_lower_case=True)

# Pretrained BERT encoder; the classification head is attached further down.
checkpoint_name = 'bert-base-uncased'
bert = AutoModel.from_pretrained(checkpoint_name)

We can directly call the tokenizer on some text:

In [None]:
# cuDF's SubwordTokenizer is called with a cuDF string Series plus explicit
# max_length / max_num_rows (it does not take bare Python strings or sentence
# pairs the way a HuggingFace tokenizer does). Each sentence is tokenized as its
# own row here.
sample_text = cudf.Series(['Hello, new learner!', 'And how about a second sentence?'])

tokenizer(
    sample_text,
    max_length=16,
    max_num_rows=len(sample_text),
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

### Tokenize Messages

Since the emails are of varying lengths, we'll pad them to a fixed maximum sequence length. First let's look at the training set to find the right padding length.

In [None]:
# Whitespace-separated word count per training message, computed with cuDF's
# string/list accessors (a cuDF Series cannot be iterated element by element).
seq_len = cudf.Series(train_text).str.split().list.len()

# cuDF Series has no plotting API, so copy the counts to pandas for the histogram.
seq_len.to_pandas().hist(bins=30);

Now that our tokenizer is ready, let's encode our datasets.

In [None]:
MAX_SEQ_LEN = 25  # every message is padded or truncated to this many tokens


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a collection of messages with the cuDF subword tokenizer.

    `tokenizer` is a cuDF SubwordTokenizer, which is invoked directly (it has no
    `batch_encode_plus` method) and needs the row count passed explicitly.

    Args:
        texts: Messages to encode; anything `cudf.Series` accepts.
        max_length: Token length every row is padded/truncated to.

    Returns:
        The tokenizer's output mapping; the notebook reads its 'input_ids' and
        'attention_mask' entries, requested here as PyTorch tensors.
    """
    text_series = cudf.Series(texts)
    return tokenizer(
        text_series,
        max_length=max_length,
        max_num_rows=len(text_series),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

Next, we'll convert the integer sequences to tensors.

In [None]:
def _as_tensors(tokens, labels):
    """Return (input_ids, attention_mask, labels) as torch tensors for one split."""
    # NOTE(review): torch.tensor needs numeric labels, and `.tolist()` must be
    # supported by whatever Series type the split produced — confirm both for
    # the 'Spam/Ham' column.
    return (
        torch.tensor(tokens['input_ids']),
        torch.tensor(tokens['attention_mask']),
        torch.tensor(labels.tolist()),
    )


train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

Here we'll create dataloaders for the training and validation sets; they will pass batches of data to the model during the training phase.

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training set: bundle ids, masks and labels into one dataset and draw the
# batches in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: same bundling, but batches are read in their stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

### Define Model Architecture

In [None]:
class BERT_Arch(nn.Module):
    """BERT encoder followed by a small two-class classification head.

    The head is ``Linear(hidden, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 2)
    -> LogSoftmax`` applied to the encoder's second output.

    Args:
        bert: Encoder module. It is called as ``bert(sent_id, attention_mask=mask)``
            and the element at index 1 of its result is fed to the head.
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert

        # Take the input width from the encoder's config when it exposes one, so
        # encoders of other sizes work; otherwise keep the original 768.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: Token-id tensor passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor with one row per input and two columns (LogSoftmax over dim 1).
        """
        outputs = self.bert(sent_id, attention_mask=mask)
        # Index instead of tuple-unpacking: unpacking into exactly two names fails
        # when the encoder returns extra items (e.g. hidden states) or an output
        # object rather than a plain 2-tuple.
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # Output layer followed by log-softmax over the two classes.
        x = self.fc2(x)
        x = self.softmax(x)

        return x

In [None]:
# Wrap the pretrained encoder with the classification head. The class is named
# BERT_Arch (capital A); `BERT_arch` raised a NameError.
model = BERT_Arch(bert)

# Move the parameters to the device selected earlier (GPU when available).
model = model.to(device)

### Make Predictions
First we'll load the best model weights, which were saved during training, and then we can make predictions on the test set.

In [None]:
path = 'saved_weights.pt'

# map_location lets a checkpoint saved on one device (e.g. a GPU) load onto
# whichever `device` this session is using.
model.load_state_dict(torch.load(path, map_location=device))

# Switch to inference mode: a freshly constructed module is in training mode, so
# without this the Dropout layer stays active and predictions are randomly perturbed.
model.eval()

# No gradients are needed for prediction; the result is copied to host memory
# as a NumPy array of per-class log-probabilities.
with torch.no_grad():
  preds = model(test_seq.to(device), test_mask.to(device))
  preds = preds.cpu().numpy()

Now let's see how it performed!

In [None]:
# `preds` is a NumPy array of per-class log-probabilities; pick the most likely
# class for each row. The array's own argmax is used so this cell does not depend
# on an `np` name, which this notebook never imports.
preds = preds.argmax(axis=1)
print(classification_report(test_y, preds))