In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |▊                               | 10kB 34.2MB/s eta 0:00:01[K     |█▍                              | 20kB 6.3MB/s eta 0:00:01[K     |██                              | 30kB 7.3MB/s eta 0:00:01[K     |██▊                             | 40kB 5.7MB/s eta 0:00:01[K     |███▍                            | 51kB 6.2MB/s eta 0:00:01[K     |████▏                           | 61kB 7.3MB/s eta 0:00:01[K     |████▉                           | 71kB 8.1MB/s eta 0:00:01[K     |█████▌                          | 81kB 7.7MB/s eta 0:00:01[K     |██████▏                         | 92kB 8.6MB/s eta 0:00:01[K     |██████▉                         | 102kB 9.3MB/s eta 0:00:01[K     |███████▋                        | 112kB 9.3MB/s eta 0:00:01[K     |████████▎                       | 122kB 9.3M

In [3]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9682 sha256=d55554c84a58ca0907344617b1ac668bc61ee36f56ff3465b5fdd933d1a8f0b7
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [0]:
import wget
import os

In [0]:
url = 'https://github.com/theneuralbeing/bert-finetuning-webinar/raw/master/data.zip'

In [6]:
wget.download(url, './data.zip')
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/train.csv          
  inflating: data/validation.csv     


In [7]:
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer

In [0]:
train = pd.read_csv('./data/train.csv')
val = pd.read_csv('./data/validation.csv')

In [9]:
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
print("Sample sentence:")
train.iloc[0][0]

Sample sentence:


"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [11]:
val.head()

Unnamed: 0,review,sentiment
0,This movie was bad from the start. The only pu...,0
1,"God, I never felt so insulted in my whole life...",0
2,Not being a fan of the Coen Brothers or George...,1
3,The movie Andaz Apna Apna in my books is the t...,1
4,I have to say I was really looking forward on ...,0


In [12]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
from torch.utils.data import DataLoader, Dataset

In [0]:
class LoadDataset(Dataset):

  def __init__(self, filename, maxlen=64):
    self.df = pd.read_csv(filename, delimiter=",")
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.maxlen = maxlen

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    sentence = self.df.loc[index, 'review']
    label = self.df.loc[index, 'sentiment']

    tokens = self.tokenizer.tokenize(sentence)
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    if len(tokens) < self.maxlen:
      tokens = tokens + ['[PAD]' for _ in range(self.maxlen-len(tokens))]
    else:
      tokens = tokens[:self.maxlen-1] + ['[SEP]']

    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    token_ids_tensor = torch.tensor(token_ids)

    attn_mask = (token_ids_tensor != 0).long()

    return token_ids_tensor, attn_mask, label


In [0]:
train_set = LoadDataset(filename='data/train.csv', maxlen=64)
val_set = LoadDataset(filename='data/validation.csv', maxlen=64)

In [17]:
train_set[0]

(tensor([  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,  2044,
          3666,  2074,  1015, 11472,  2792,  2017,  1005,  2222,  2022, 13322,
          1012,  2027,  2024,  2157,  1010,  2004,  2023,  2003,  3599,  2054,
          3047,  2007,  2033,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
          1013,  1028,  1996,  2034,  2518,  2008,  4930,  2033,  2055, 11472,
          2001,  2049, 24083,  1998,  4895, 10258,  2378,  8450,  5019,  1997,
          4808,  1010,  2029,   102]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 1)

In [0]:
train_loader = DataLoader(train_set, batch_size=32, num_workers = 5)
val_loader = DataLoader(val_set, batch_size=32, num_workers = 5)

In [19]:
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f9f78dbc748>


### Building the Model

In [0]:
from torch import nn

In [0]:
class SentimentClassifier(nn.Module):
  def __init__(self):
    super(SentimentClassifier, self).__init__()

    self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(768, 1)

  def forward(self, seq, attn_masks):

    seq_repr, _ = self.bert_layer(seq, attention_mask = attn_masks)
    cls_repr = seq_repr[:, 0]
    # print('CLS shape', cls_repr.shape)

    logits = self.classifier(cls_repr)
    # print("Logits shape: ", logits.shape)

    return logits


In [0]:
def logits_accuracy(logits, labels):
  probs = torch.sigmoid(logits.unsqueeze(-1))
  preds = (probs > 0.5).long()
  acc = (preds.squeeze() == labels).float().mean()
  return acc

In [0]:
model = SentimentClassifier()

### Training

In [0]:
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

criterion = BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr = 2e-5)


In [25]:
if torch.cuda.is_available():
  device = 'cuda'
else:
  print("No GPU!")
  device = 'cpu'

print(device)

cuda


In [0]:
def evaluate(net, criterion, val_loader, device):

  losses, accuracy = 0, 0

  net.eval()

  count = 0
  for (seq, attn_masks, labels) in val_loader:
    count += 1

    seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

    val_logits = net(seq, attn_masks)

    val_loss = criterion(val_logits.squeeze(-1), labels.float())
    losses += val_loss.item()

    accuracy += logits_accuracy(val_logits, labels)
  
  return losses/count, accuracy/count


In [0]:
def train(net, criterion, optimizer, train_loader, val_loader, device, epochs=4, print_every=100):
  net.to(device)
  net.train()

  for epoch in range(epochs):
    print("Epoch: ", epoch)

    for i, (seq, attn_masks, labels) in enumerate(train_loader):

      optimizer.zero_grad()

      seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

      logits = net(seq,attn_masks)

      loss = criterion(logits.squeeze(-1), labels.float())

      loss.backward()

      nn.utils.clip_grad_norm_(net.parameters(), 1)

      optimizer.step()

      if (i + 1) % print_every == 0:
        print("Iteration {} ==== Loss: {}".format(i+1, loss.item()))

    print('\n========== Validating ==========')
    mean_val_loss, mean_val_acc = evaluate(net, criterion, val_loader, device)
    print("Validation Loss: {}\nValidation Accuracy: {}".format(mean_val_loss, mean_val_acc))


In [30]:
# starting training
train(model, criterion, optimizer, train_loader, val_loader, device, epochs=1, print_every=100)

Epoch:  0
Iteration 100 ==== Loss: 0.4118901491165161
Iteration 200 ==== Loss: 0.8115749359130859
Iteration 300 ==== Loss: 0.4248725175857544
Iteration 400 ==== Loss: 0.3326059877872467
Iteration 500 ==== Loss: 0.3183048963546753
Iteration 600 ==== Loss: 0.33569473028182983
Iteration 700 ==== Loss: 0.4049226641654968

Validation Loss: 0.37016328711948737
Validation Accuracy: 0.8356777429580688


In [0]:
# print('\n========== Validating ==========')
# mean_val_loss, mean_val_acc = evaluate(model, criterion, val_loader, device)
# print("Validation Loss: {}\nValidation Accuracy: {}".format(mean_val_loss, mean_val_acc))