[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/prabuscihero/NLP-Basic-to-Bert/blob/master/Bert.ipynb)

## Model - Bert

In [1]:

!pip install transformers==2.1.0
import random
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import classification_report

import torch
from torch import nn
from torch.utils import data

from transformers import (WEIGHTS_NAME, 
                          BertConfig, BertForSequenceClassification, BertTokenizer)

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW,WarmupLinearSchedule
from tqdm import tqdm_notebook as tn
from sklearn.model_selection import train_test_split

Collecting transformers==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/5f/e5/4fb8a6215608c4036b6dd16613268a4b8958c20e4249d141e621e7f2e146/transformers-2.1.0-py3-none-any.whl (313kB)
[K     |█                               | 10kB 26.1MB/s eta 0:00:01[K     |██                              | 20kB 1.7MB/s eta 0:00:01[K     |███▏                            | 30kB 2.4MB/s eta 0:00:01[K     |████▏                           | 40kB 1.6MB/s eta 0:00:01[K     |█████▎                          | 51kB 2.0MB/s eta 0:00:01[K     |██████▎                         | 61kB 2.4MB/s eta 0:00:01[K     |███████▎                        | 71kB 2.8MB/s eta 0:00:01[K     |████████▍                       | 81kB 3.2MB/s eta 0:00:01[K     |█████████▍                      | 92kB 3.5MB/s eta 0:00:01[K     |██████████▌                     | 102kB 2.7MB/s eta 0:00:01[K     |███████████▌                    | 112kB 2.7MB/s eta 0:00:01[K     |████████████▋                   | 122

In [2]:
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
cd '/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert'

/content/gdrive/My Drive/Colab Notebooks/NLP-Basic-to-Bert


In [0]:
# Load the training data
train_df_full = pd.read_csv('training_data.csv')
test_df = pd.read_csv('testing_data.csv')

In [0]:
# Create train and validation set
train_df, valid_df, train_labels, valid_labels = train_test_split(train_df_full, train_df_full.user_rating, random_state=42, stratify=train_df_full.user_rating,test_size=0.20)

In [0]:
# Function to convert text to transformer features for Transformer models

def convert_text_to_features(examples, tokenizer,
                                      max_length=512,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``
    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    features = [[],[],[]]
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            print("Writing example %d" % (ex_index))

        inputs = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if ex_index < 1:
            print("*** Example ***")
            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            print("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            print("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

        features[0].append(input_ids)
        features[1].append(attention_mask)
        features[2].append(token_type_ids)

    return features

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def seed_everything(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

In [7]:
# Initializing the BERT Model 
model_name = "bert"
pretrained_model_name = "bert-base-uncased"
n_classes = 1

config_class, model_class, tokenizer_class = BertConfig, BertForSequenceClassification, BertTokenizer
config = config_class.from_pretrained(pretrained_model_name)
tokenizer = tokenizer_class.from_pretrained(pretrained_model_name, do_lower_case=True)
model = model_class.from_pretrained(pretrained_model_name, num_labels=1)

100%|██████████| 313/313 [00:00<00:00, 70813.81B/s]
100%|██████████| 231508/231508 [00:00<00:00, 425680.98B/s]
100%|██████████| 440473133/440473133 [00:37<00:00, 11778741.08B/s]


In [8]:

# Get the maximum number of words
# Find the number of words in user review
train_df['length']= train_df.user_review.str.split().apply(len)
test_df['length'] = test_df.user_review.str.split().apply(len)
valid_df['length'] = valid_df.user_review.str.split().apply(len)
train_df['user_review'] = train_df['user_review'].str.lower()
test_df['user_review'] = test_df['user_review'].str.lower()
valid_df['user_review'] = valid_df['user_review'].str.lower()
max_length =  max(train_df['length'].max(),test_df['length'].max(),valid_df['length'].max()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a

In [9]:
# Converting the data into required format
#max_length = 
train_df["user_review"] = train_df["user_review"].astype(str).fillna("NA")
train_features = convert_text_to_features(train_df["user_review"], tokenizer, max_length=max_length)

valid_df["user_review"] = valid_df["user_review"].astype(str).fillna("NA")
valid_features = convert_text_to_features(valid_df["user_review"], tokenizer, max_length=max_length)

test_df["user_review"] = test_df["user_review"].astype(str).fillna("NA")
test_features = convert_text_to_features(test_df["user_review"], tokenizer, max_length=max_length)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Writing example 0
*** Example ***
input_ids: 101 12873 12873 2121 1045 2134 2102 2066 1996 2678 2129 2071 2035 3312 2015 3428 2022 1997 1037 2367 2679 2024 2057 2667 2000 2022 10317 6149 2030 2054 2036 1045 4299 2123 4890 2018 2921 2010 3797 2006 1045 2001 2357 2006 2011 2010 15892 3108 1045 2031 2464 2023 2864 3807 2011 2967 2060 2084 2123 4890 1998 2522 1998 5632 2009 2061 2172 2062 2507 2009 2039 2123 4890 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Writing example 0
*** Example ***
input_ids: 101 1996 3143 2332 6919 6268 1997 12280 2147 2013 1996 2220 2000 2345 2086 22074 1996 9849 1999 3405 2974 2058 2010 2476 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [0]:
# Convert dataset to tensor format

X = torch.tensor(train_features[0], dtype=torch.long)
X_mask = torch.tensor(train_features[1], dtype=torch.long)
X_seg_ids = torch.tensor(train_features[2], dtype=torch.long)
y = train_df["user_rating"].values
y = torch.tensor(y[:,np.newaxis], dtype=torch.float32)

X_valid = torch.tensor(valid_features[0], dtype=torch.long)
X_mask_valid = torch.tensor(valid_features[1], dtype=torch.long)
X_seg_ids_valid = torch.tensor(valid_features[2], dtype=torch.long)
valid_y = valid_df["user_rating"].values
valid_y = torch.tensor(valid_y[:,np.newaxis], dtype=torch.float32)

test_X = torch.tensor(test_features[0], dtype=torch.long)
test_X_mask = torch.tensor(test_features[1], dtype=torch.long)
test_X_seg_ids = torch.tensor(test_features[2], dtype=torch.long)
test_y = test_df["user_rating"].values
test_y = torch.tensor(test_y[:,np.newaxis], dtype=torch.float32)

# Create the dataloader for the train, valid and test dataset
batch_size = 8
train_dataset = data.TensorDataset(X, X_mask, X_seg_ids, y)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = data.TensorDataset(X_valid, X_mask_valid, X_seg_ids_valid, valid_y)
valid_loader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

test_dataset = data.TensorDataset(test_X, test_X_mask, test_X_seg_ids,test_y)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [0]:
# Set the model parameters
accumulation_steps = 1
n_epochs = 1
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
num_train_optimization_steps = int(n_epochs*len(train_dataset)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05*num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)
scheduler = WarmupLinearSchedule( optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps)

In [12]:
# Train the model
import os
seed_everything()
model.to(device)
loss_fn = nn.BCEWithLogitsLoss()

count = 0
val_acc = []
train_acc = []
for epoch in range(n_epochs):
    model.train()
    print(epoch)
    count = 0
    j = 0 
    lent=len(train_loader)
    for x_batch, x_mask, x_seg_ids, y_batch in tn(train_loader):
        #print(j,lent)
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        y_pred = outputs[0]
        loss = loss_fn(y_pred, y_batch.to(device))
        loss.backward()
        scheduler.step()
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    for x_batch, x_mask, x_seg_ids, y_batch in valid_loader:
        #print(j,lent)
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        pred = sigmoid(outputs[0].detach().cpu().numpy())
        pred = pred.squeeze()
        positive_threshold = pred > 0.5
        negative_threshold = pred <= 0.5
        pred[positive_threshold] = 1
        pred[negative_threshold] = 0 
        y_batch = y_batch.squeeze().numpy()
        val_acc.append((pred == y_batch).mean())

    for x_batch, x_mask, x_seg_ids, y_batch in train_loader:
        #print(j,lent)
        outputs = model(x_batch.to(device),
                        attention_mask=x_mask.to(device),
                        token_type_ids=x_seg_ids.to(device),
                        labels=None)
        pred = sigmoid(outputs[0].detach().cpu().numpy())
        pred = pred.squeeze()
        positive_threshold = pred > 0.5
        negative_threshold = pred <= 0.5
        pred[positive_threshold] = 1
        pred[negative_threshold] = 0 
        y_batch = y_batch.squeeze().numpy()
        train_acc.append((pred == y_batch).mean())
    print("validation accuracy = ",np.mean(val_acc),"training accuracy = ",np.mean(train_acc))


0


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




validation accuracy =  0.9468 training accuracy =  0.976775


In [15]:
# Predict the test dataset
predict_list = []
expected_list = []
model.eval()
for i, (x_batch, x_mask, x_seg_ids,y_batch) in enumerate(tn(test_loader)):
    outputs = model(x_batch.to(device),
                    attention_mask=x_mask.to(device),
                    token_type_ids=x_seg_ids.to(device),
                    labels=None)
    pred = sigmoid(outputs[0].detach().cpu().numpy())
    pred = pred.squeeze()
    positive_threshold = pred > 0.5
    negative_threshold = pred <= 0.5
    pred[positive_threshold] = 1
    pred[negative_threshold] = 0
    predict_list.extend(pred.tolist())
    expected_list.extend(y_batch.cpu().detach().numpy().tolist())

HBox(children=(IntProgress(value=0, max=6250), HTML(value='')))




In [16]:
print(classification_report(expected_list,predict_list))

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     24626
         1.0       0.95      0.95      0.95     25374

    accuracy                           0.95     50000
   macro avg       0.95      0.95      0.95     50000
weighted avg       0.95      0.95      0.95     50000

