### Text Classification with BERT

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import random
import numpy as np
import pandas as pd
from sklearn import model_selection

import torch
from torch import nn
from torch.utils import data

In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/f9/51824e40f0a23a49eab4fcaa45c1c797cbf9761adedd0b558dab7c958b34/transformers-2.1.1-py3-none-any.whl (311kB)
[K     |█                               | 10kB 35.6MB/s eta 0:00:01[K     |██                              | 20kB 5.7MB/s eta 0:00:01[K     |███▏                            | 30kB 8.2MB/s eta 0:00:01[K     |████▏                           | 40kB 5.5MB/s eta 0:00:01[K     |█████▎                          | 51kB 6.7MB/s eta 0:00:01[K     |██████▎                         | 61kB 7.9MB/s eta 0:00:01[K     |███████▍                        | 71kB 9.1MB/s eta 0:00:01[K     |████████▍                       | 81kB 10.2MB/s eta 0:00:01[K     |█████████▌                      | 92kB 11.3MB/s eta 0:00:01[K     |██████████▌                     | 102kB 9.1MB/s eta 0:00:01[K     |███████████▋                    | 112kB 9.1MB/s eta 0:00:01[K     |████████████▋                   | 122kB 9.

In [0]:
from transformers import (WEIGHTS_NAME, 
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                          XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
from transformers import AdamW, WarmupLinearSchedule
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### Dataset Preparation

In [0]:
data_path = "/content/drive/My Drive/Datahack NLP Workshop/Disaster/"
df = pd.read_csv(data_path + "socialmedia_disaster_tweets.csv", encoding='iso-8859-1')
df = df[["choose_one", "text"]]
df.columns = ["label", "text"]
df = df[df["label"].isin(["Relevant", "Not Relevant"])].reset_index(drop=True)
df.head()

Unnamed: 0,label,text
0,Relevant,Just happened a terrible car crash
1,Relevant,Our Deeds are the Reason of this #earthquake M...
2,Relevant,"Heard about #earthquake is different cities, s..."
3,Relevant,"there is a forest fire at spot pond, geese are..."
4,Relevant,Forest fire near La Ronge Sask. Canada


In [0]:
label_map = {"Relevant":1, "Not Relevant":0}
df["label"] = df["label"].map(label_map)

In [0]:
train_df, test_df = model_selection.train_test_split(df, test_size=0.2, random_state=2019)
print("Train data shape is : ",train_df.shape)
print("Test data shape is : ",test_df.shape)

Train data shape is :  (8688, 2)
Test data shape is :  (2172, 2)


### Custom Functions

In [0]:
def convert_text_to_features(examples, tokenizer,
                                      max_length=512,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``
    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    features = [[],[],[]]
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            print("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        # if ex_index < 1:
        #     print("*** Example ***")
        #     print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        #     print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
        #     print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

        features[0].append(input_ids)
        features[1].append(attention_mask)
        features[2].append(token_type_ids)

    return features

In [0]:
def seed_everything(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

device

'cuda'

### Model Building

##### Model Config

In [0]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}

##### Model Parameters

In [0]:
model_name = "bert"
pretrained_model_name = "bert-base-uncased"
n_classes = 1
max_length = 128
batch_size = 8
n_epochs = 1
accumulation_steps = 1
lr = 2e-5

In [0]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_name]
config = config_class.from_pretrained(pretrained_model_name)
tokenizer = tokenizer_class.from_pretrained(pretrained_model_name, do_lower_case=True)
model = model_class.from_pretrained(pretrained_model_name, num_labels=1)

100%|██████████| 313/313 [00:00<00:00, 108631.95B/s]
100%|██████████| 231508/231508 [00:00<00:00, 897472.45B/s]
100%|██████████| 440473133/440473133 [00:16<00:00, 26619937.36B/s]


##### Train Data Loader

In [0]:
train_df["text"] = train_df["text"].astype(str).fillna("NA")
train_features = convert_text_to_features(train_df["text"], tokenizer, max_length=max_length)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Writing example 0


In [0]:
X = torch.tensor(train_features[0], dtype=torch.long)
X_mask = torch.tensor(train_features[1], dtype=torch.long)
X_seg_ids = torch.tensor(train_features[2], dtype=torch.long)
y = train_df["label"].values
y = torch.tensor(y[:,np.newaxis], dtype=torch.float32)

train_dataset = data.TensorDataset(X, X_mask, X_seg_ids, y)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

##### Optimizer

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
num_train_optimization_steps = int(n_epochs*len(train_dataset)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05*num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps)

##### Training

In [0]:
seed_everything()
model.to(device)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(n_epochs):
    model.train()
    for x_batch, x_mask, x_seg_ids, y_batch in train_loader:
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        y_pred = outputs[0]
        loss = loss_fn(y_pred, y_batch.to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

##### Test Data Loader

In [0]:
test_df["text"] = test_df["text"].astype(str).fillna("NA")
test_features = convert_text_to_features(test_df["text"], tokenizer, max_length=max_length)

test_X = torch.tensor(test_features[0], dtype=torch.long)
test_X_mask = torch.tensor(test_features[1], dtype=torch.long)
test_X_seg_ids = torch.tensor(test_features[2], dtype=torch.long)
test_y = test_df["label"].values

test_dataset = data.TensorDataset(test_X, test_X_mask, test_X_seg_ids)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Writing example 0


##### Predictions

In [0]:
preds = np.zeros([len(test_dataset), 1])
model.eval()
for i, (x_batch, x_mask, x_seg_ids) in enumerate(test_loader):
    outputs = model(x_batch.to(device),
                    attention_mask=x_mask.to(device),
                    token_type_ids=x_seg_ids.to(device),
                    labels=None)
    y_pred = sigmoid(outputs[0].detach().cpu().numpy())
    preds[i*batch_size:(i+1)*batch_size, :] = y_pred
    
from sklearn import metrics
metrics.roc_auc_score(test_y, preds)

0.8876832649879807

### DIY - Build BERT cased model

In [0]:
model_name = "bert"
pretrained_model_name = "bert-base-cased"
n_classes = 1
max_length = 128
batch_size = 8
n_epochs = 1
accumulation_steps = 1
lr = 2e-5

In [0]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_name]
config = config_class.from_pretrained(pretrained_model_name)
tokenizer = tokenizer_class.from_pretrained(pretrained_model_name, do_lower_case=True)
model = model_class.from_pretrained(pretrained_model_name, num_labels=1)

100%|██████████| 313/313 [00:00<00:00, 83411.73B/s]
100%|██████████| 213450/213450 [00:00<00:00, 841004.19B/s]
100%|██████████| 435779157/435779157 [00:17<00:00, 25391488.98B/s]


In [0]:
train_df["text"] = train_df["text"].astype(str).fillna("NA")
train_features = convert_text_to_features(train_df["text"], tokenizer, max_length=max_length)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Writing example 0


In [0]:
X = torch.tensor(train_features[0], dtype=torch.long)
X_mask = torch.tensor(train_features[1], dtype=torch.long)
X_seg_ids = torch.tensor(train_features[2], dtype=torch.long)
y = train_df["label"].values
y = torch.tensor(y[:,np.newaxis], dtype=torch.float32)

train_dataset = data.TensorDataset(X, X_mask, X_seg_ids, y)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
num_train_optimization_steps = int(n_epochs*len(train_dataset)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05*num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps)

In [0]:
seed_everything()
model.to(device)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(n_epochs):
    model.train()
    for x_batch, x_mask, x_seg_ids, y_batch in train_loader:
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        y_pred = outputs[0]
        loss = loss_fn(y_pred, y_batch.to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

In [0]:
test_df["text"] = test_df["text"].astype(str).fillna("NA")
test_features = convert_text_to_features(test_df["text"], tokenizer, max_length=max_length)

test_X = torch.tensor(test_features[0], dtype=torch.long)
test_X_mask = torch.tensor(test_features[1], dtype=torch.long)
test_X_seg_ids = torch.tensor(test_features[2], dtype=torch.long)
test_y = test_df["label"].values

test_dataset = data.TensorDataset(test_X, test_X_mask, test_X_seg_ids)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Writing example 0


In [0]:
preds = np.zeros([len(test_dataset), 1])
model.eval()
for i, (x_batch, x_mask, x_seg_ids) in enumerate(test_loader):
    outputs = model(x_batch.to(device),
                    attention_mask=x_mask.to(device),
                    token_type_ids=x_seg_ids.to(device),
                    labels=None)
    y_pred = sigmoid(outputs[0].detach().cpu().numpy())
    preds[i*batch_size:(i+1)*batch_size, :] = y_pred
    
from sklearn import metrics
metrics.roc_auc_score(test_y, preds)

0.8856596332458466