# Setup

In [1]:
# pip install pytorch_pretrained_bert pytorch-nlp

In [None]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

In [3]:
rn.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)

# News Aggregator Dataset

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
import tensorflow as tf

# keras: for data processing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [5]:
train_data_path = './data/trac-1/english/agr_en_train.csv' # Path of data
valid_data_path = './data/trac-1/english/agr_en_dev.csv' # Path of data

In [6]:
# Train
train_df = pd.read_csv(train_data_path, names = ['SOURCE', 'TEXT', 'AGGRESSION_CLASS'], 
                   usecols=['TEXT', 'AGGRESSION_CLASS'])
print('Column of df:', list(train_df))
print('Size of data:', len(train_df))
train_df.head()

Column of df: ['TEXT', 'AGGRESSION_CLASS']
Size of data: 12000


Unnamed: 0,TEXT,AGGRESSION_CLASS
0,Well said sonu..you have courage to stand agai...,OAG
1,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG
2,"Now question is, Pakistan will adhere to this?",OAG
3,Pakistan is comprised of fake muslims who does...,OAG
4,"??we r against cow slaughter,so of course it w...",NAG


In [7]:
train_df = train_df.reindex(np.random.permutation(train_df.index))

In [8]:
train_df.shape

(12000, 2)

In [9]:
train_df.head(20)

Unnamed: 0,TEXT,AGGRESSION_CLASS
4090,"this fellow has lost his mind completely, cant...",OAG
10178,He is Rajput DNA 👍 just wait and see bro ...,NAG
2855,Paul Manish ask ur mom who's ur father he's a ...,OAG
4338,Good,NAG
4035,JNU deserves it at present because students pr...,CAG
11604,Though JP is promising to give flats in 1-2 ye...,NAG
3431,Its a great effort by the poor people who need...,CAG
3275,i too dont want oil at $80 but junk bund of oi...,NAG
6351,"Guarav K they are so brainwashed, that they ca...",CAG
1877,What about the block money with big people viz...,CAG


In [10]:
# data slicing
num_of_categories = 12500
shuffled = train_df.reindex(np.random.permutation(train_df.index))

CAG = shuffled[shuffled['AGGRESSION_CLASS'] == 'CAG']#[:num_of_categories]
OAG = shuffled[shuffled['AGGRESSION_CLASS'] == 'OAG']#[:num_of_categories]
NAG = shuffled[shuffled['AGGRESSION_CLASS'] == 'NAG']#[:num_of_categories]

concated = pd.concat([CAG, OAG, NAG], ignore_index=True)

In [11]:
# label col
concated['LABEL'] = 0

In [12]:
np.random.seed(41)
concated = concated.reindex(np.random.permutation(concated.index))

concated.loc[concated['AGGRESSION_CLASS'] == 'CAG', 'LABEL'] = 0
concated.loc[concated['AGGRESSION_CLASS'] == 'OAG', 'LABEL'] = 1
concated.loc[concated['AGGRESSION_CLASS'] == 'NAG', 'LABEL'] = 2

In [13]:
concated['LABEL'][:10]

9230    2
3417    0
5996    1
1813    0
1307    0
8169    2
7666    2
2436    0
7442    2
1169    0
Name: LABEL, dtype: int64

In [14]:
# one-hot encoding
labels = to_categorical(concated['LABEL'], num_classes=3)

In [15]:
labels

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

# BERT Tokenizer

In [16]:
word_max_len = 64

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [18]:
 concated['TEXT'].values[:3]

array(['Buy JAYBARMARU (Nse code- 1708) @ 363 targets 410 within a month probably.',
       "Hello. It's not the link that is the problem, it's the RBI site itself. Hopefully should be back soon.",
       'Anna is sounding like an opportunist. At least Arvind has stuck to what he started and has created a government and trying to do something constructive.'],
      dtype=object)

In [19]:
tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]'], concated['TEXT'].values))

In [20]:
tokens[:3]

[['[CLS]',
  'buy',
  'jay',
  '##bar',
  '##mar',
  '##u',
  '(',
  'ns',
  '##e',
  'code',
  '-',
  '1708',
  ')',
  '@',
  '36',
  '##3',
  'targets',
  '410',
  'within',
  'a',
  'month',
  'probably',
  '.',
  '[SEP]'],
 ['[CLS]',
  'hello',
  '.',
  'it',
  "'",
  's',
  'not',
  'the',
  'link',
  'that',
  'is',
  'the',
  'problem',
  ',',
  'it',
  "'",
  's',
  'the',
  'rbi',
  'site',
  'itself',
  '.',
  'hopefully',
  'should',
  'be',
  'back',
  'soon',
  '.',
  '[SEP]'],
 ['[CLS]',
  'anna',
  'is',
  'sounding',
  'like',
  'an',
  'op',
  '##port',
  '##uni',
  '##st',
  '.',
  'at',
  'least',
  'ar',
  '##vin',
  '##d',
  'has',
  'stuck',
  'to',
  'what',
  'he',
  'started',
  'and',
  'has',
  'created',
  'a',
  'government',
  'and',
  'trying',
  'to',
  'do',
  'something',
  'constructive',
  '.',
  '[SEP]']]

In [21]:
tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, tokens)), maxlen=word_max_len, truncating="post", padding="post", dtype="int")

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (787 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1103 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (786 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (665 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (602 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length 

In [22]:
tokens_ids.shape

(12000, 64)

In [23]:
tokens_ids[:1]

array([[  101,  4965,  6108,  8237,  7849,  2226,  1006, 24978,  2063,
         3642,  1011, 27337,  1007,  1030,  4029,  2509,  7889, 19151,
         2306,  1037,  3204,  2763,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]])

In [24]:
masks = [[float(i > 0) for i in ii] for ii in tokens_ids]

In [25]:
masks[:1]

[[1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0]]

# BERT Baseline

In [26]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
#         proba = self.sigmoid(linear_output)
        proba = self.softmax(linear_output)
        return proba

In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
device

device(type='cuda')

In [29]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [30]:
bert_clf = BertClassifier()

In [31]:
bert_clf = bert_clf.cuda()

In [32]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.071232M'

In [33]:
x = torch.tensor(tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)

In [34]:
x.shape, y.shape, pooled.shape

(torch.Size([3, 64]), torch.Size([3, 64, 768]), torch.Size([3, 768]))

In [35]:
x

tensor([[  101,  4965,  6108,  8237,  7849,  2226,  1006, 24978,  2063,  3642,
          1011, 27337,  1007,  1030,  4029,  2509,  7889, 19151,  2306,  1037,
          3204,  2763,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  7592,  1012,  2009,  1005,  1055,  2025,  1996,  4957,  2008,
          2003,  1996,  3291,  1010,  2009,  1005,  1055,  1996, 16929,  2609,
          2993,  1012, 11504,  2323,  2022,  2067,  2574,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0

In [36]:
y

tensor([[[-0.1109,  0.2057,  0.3847,  ..., -0.3044,  0.4924, -0.4941],
         [ 0.8200, -0.5331,  0.8024,  ..., -0.1300,  0.7983, -0.4020],
         [ 0.3769, -0.6173,  0.2439,  ..., -0.0659,  0.0816, -0.6694],
         ...,
         [ 0.4121, -0.2074,  0.8656,  ..., -0.7528,  0.1719, -0.9573],
         [ 0.4179, -0.0138,  1.1296,  ..., -0.6530, -0.1895, -1.3284],
         [ 0.6001, -0.4022,  0.9035,  ..., -0.4683,  0.0647, -1.1612]],

        [[-0.2768,  0.1752,  0.1537,  ..., -0.2207,  0.5238, -0.2874],
         [ 0.2061, -0.0522,  0.1848,  ...,  0.2871,  0.4584,  0.1021],
         [-0.5190, -0.0374,  0.3498,  ...,  0.3148,  0.4632, -0.1717],
         ...,
         [ 0.6811, -0.1454,  0.8159,  ..., -0.4920,  0.0283, -0.5419],
         [ 0.5164,  0.0925,  0.9959,  ..., -0.7448,  0.2029, -1.3671],
         [ 0.3714,  0.1585,  0.9605,  ..., -0.9843,  0.1680, -1.0755]],

        [[-0.2678,  0.2753,  0.4284,  ..., -0.0191,  0.5713, -0.1900],
         [-0.3718, -0.3010, -0.0020,  ...,  0

In [37]:
pooled

tensor([[-0.4451, -0.4274, -0.9588,  ..., -0.7033, -0.4637,  0.5934],
        [-0.5884, -0.5152, -0.9789,  ..., -0.8271, -0.5875,  0.5919],
        [-0.5148, -0.4912, -0.9565,  ..., -0.6459, -0.5531,  0.5860]],
       device='cuda:0', grad_fn=<TanhBackward0>)

In [38]:
y = bert_clf(x)

  app.launch_new_instance()


In [39]:
y.cpu().detach().numpy()

array([[0.45197368, 0.33663693, 0.21138936],
       [0.5384986 , 0.25826284, 0.20323864],
       [0.4774684 , 0.3303221 , 0.19220947]], dtype=float32)

In [40]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'879.4624M'

In [41]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()

In [42]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'659.26144M'

# Fine-tune BERT

- Train / Test

In [43]:
BATCH_SIZE = 3
EPOCHS = 3

In [44]:
split_size = 4 * int(len(tokens_ids) / 5)

In [45]:
X_train_ids, X_test_ids = tokens_ids[:split_size,:], tokens_ids[split_size:,:]

In [46]:
tokens_ids.shape

(12000, 64)

In [47]:
X_train_ids.shape

(9600, 64)

In [48]:
X_test_ids.shape

(2400, 64)

In [49]:
y_train, y_test = labels[:split_size,:], labels[split_size:,:]

In [50]:
labels.shape

(12000, 3)

In [51]:
y_train.shape

(9600, 3)

In [52]:
y_test.shape

(2400, 3)

In [54]:
masks_train, masks_test = np.array(masks)[:split_size,:], np.array(masks)[split_size:,:]

In [56]:
np.array(masks).shape

(12000, 64)

In [57]:
masks_train.shape

(9600, 64)

In [58]:
masks_test.shape

(2400, 64)

In [59]:
train_tokens_tensor = torch.tensor(X_train_ids)
train_y_tensor = torch.tensor(y_train).float()

test_tokens_tensor = torch.tensor(X_test_ids)
test_y_tensor = torch.tensor(y_test).float()

train_masks_tensor = torch.tensor(masks_train)
test_masks_tensor = torch.tensor(masks_test)

In [60]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'659.26144M'

In [61]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [62]:
len(train_dataloader)

3200

In [63]:
len(test_dataloader)

800

In [64]:
train_dataset.tensors

(tensor([[ 101, 4965, 6108,  ...,    0,    0,    0],
         [ 101, 7592, 1012,  ...,    0,    0,    0],
         [ 101, 4698, 2003,  ...,    0,    0,    0],
         ...,
         [ 101, 2023, 2231,  ...,    0,    0,    0],
         [ 101, 2175, 3805,  ...,    0,    0,    0],
         [ 101, 7890, 3593,  ...,    0,    0,    0]]),
 tensor([[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([[0., 0., 1.],
         [1., 0., 0.],
         [0., 1., 0.],
         ...,
         [0., 1., 0.],
         [0., 0., 1.],
         [0., 1., 0.]]))

- Fine-tune BERT

In [65]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [66]:
torch.cuda.empty_cache()

In [67]:
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()
        
        bert_clf.zero_grad()
        batch_loss.backward()

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, int(len(X_train_ids) / BATCH_SIZE), train_loss / (step_num + 1)))

Epoch:  3
3199/3200 loss: 0.44709983104490675 


In [68]:
torch.save(bert_clf, './bert_clf_' + str(EPOCHS) + 'epoch' + '.h5')

# Evaluation

In [69]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()
        
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        for i in range(len(logits)):
             bert_predicted.append(logits[i].argmax())
#         bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
        clear_output(wait=True)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, int(len(X_test_ids) / BATCH_SIZE), train_loss / (step_num + 1)))

799/800 loss: 1.788399324179627 


In [71]:
bert_predicted

[tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(2, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, device='cuda:0'),
 tensor(0, dev

In [72]:
bert_predicted = np.array(bert_predicted.cpu())

AttributeError: 'list' object has no attribute 'cpu'

In [None]:
len(bert_predicted)

In [None]:
bert_predicted

In [None]:
y_test

- Accuracy of classification

In [73]:
correct_count = 0
for i in range(len(bert_predicted)):
    y = y_test[i].argmax()
    
    if bert_predicted[i].item() == y:
        correct_count = correct_count + 1

In [74]:
correct_count / len(bert_predicted)

0.5895833333333333

- Confusion matrix

In [75]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [76]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [77]:
class_names = np.array(['B', 'E', 'T', 'M'])

In [None]:
confusion_matrix_predicted = [element.item() for element in bert_predicted.flatten()]

In [None]:
confusion_matrix_predicted

In [None]:
y_test.argmax(axis=1)

In [None]:
# Plot normalized confusion matrix
plot_confusion_matrix(y_test.argmax(axis=1), confusion_matrix_predicted, classes=class_names, normalize=True,
                      title='Confusion Matrix')

plt.show()