## Install and import the necessary packages

In [1]:
!pip install transformers seqeval[gpu] sentencepiece 

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 12.1 MB/s 
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 41.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader


## Use GPU in the resources for lesser execution time


In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla K80


In [5]:
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers as t
from transformers import AutoModel
import re


In [6]:
df = pd.read_csv("https://raw.githubusercontent.com/pranshu27/MTech/main/hi_train.conll", sep = '_ _', names = ['sentence', 'word_labels'])
#df = df.iloc[:10000, :]
df.drop(df[df.sentence.str.startswith('#')].index, inplace = True)
df.reset_index(drop = True, inplace = True)

  return func(*args, **kwargs)


In [7]:
# removing the unnenecessary punctuation symbols

import string
import re
chars = re.escape(string.punctuation)


In [8]:
df['sentence'] = df['sentence'].apply(lambda x: re.sub(r'['+chars+']', '',x))
df = df.drop_duplicates().reset_index(drop=True)
df['sentence'] = df['sentence'].apply(lambda x: x.strip()) # removinf white spaces
df['word_labels'] = df['word_labels'].apply(lambda x: x.strip())

In [9]:
MAX_LEN = 20
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-04
MAX_GRAD_NORM = 10


In [10]:
# Load the BERT tokenizer
tokenizer =t.AutoTokenizer.from_pretrained('ai4bharat/indic-bert')


Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

### Now we will convert the data into vectors and tensors which could be understood by the NN algorithm

In [11]:
from transformers import BertTokenizer

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):

    input_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            return_attention_mask=True      # Return attention mask
            )

        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [12]:
# Encode our concatenated data
encoded_sent = [tokenizer.encode(sent, add_special_tokens=True) for sent in df.sentence.values]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_sent])
print('Max length: ', max_len)

Max length:  14


In [13]:
# Specify `MAX_LEN`
MAX_LEN = 20 #based on the above number that we obtained

### Splitting the data into training and validation sets which would be used during the training loop

In [14]:
from sklearn.model_selection import train_test_split

df.word_labels = pd.factorize(df.word_labels)[0]

X = df.sentence.values
y = df.word_labels.values

X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.2, random_state=27)

In [15]:

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tokenizing data...




### Getting the test data and cleaning it to be later used for prediction

In [16]:
test_data = pd.read_csv("https://raw.githubusercontent.com/pranshu27/MTech/main/hi_dev.conll", sep = '_ _', names = ['sentence', 'word_labels'])
#df = df.iloc[:10000, :]
test_data.drop(test_data[test_data.sentence.str.startswith('#')].index, inplace = True)
test_data.reset_index(drop = True, inplace = True)

  return func(*args, **kwargs)


In [17]:
test_data['sentence'] = test_data['sentence'].apply(lambda x: re.sub(r'['+chars+']', '',x))
test_data = test_data.drop_duplicates().reset_index(drop=True)
test_data['sentence'] = test_data['sentence'].apply(lambda x: x.strip())
test_data['word_labels'] = test_data['word_labels'].apply(lambda x: x.strip())
test_data.word_labels = pd.factorize(test_data.word_labels)[0]


In [18]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

### Below is the implementation of the forward function that I have written in order to fine tune the pre trained BERT model

In [19]:
import torch
import torch.nn as nn

# Create the BertClassfier class
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=True):
        super(BertClassifier, self).__init__()

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained('ai4bharat/indic-bert')

        # Instantiate an feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.Dropout(0.2),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(64, 13),
            nn.LogSoftmax(dim=1)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        # Feed input to BERT
        outputs = self.bert(input_ids, attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

### Using the freeze_bert param and setting it as True, we are freezing the internal layers of the bert model and skipping retraining them

In [20]:
# pass the pre-trained BERT to our define architecture
model = BertClassifier()

# push the model to GPU
model = model.to(device)

Downloading:   0%|          | 0.00/129M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['sop_classifier.classifier.weight', 'sop_classifier.classifier.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Using the standard Adafactor optimizer 

In [21]:
from transformers.optimization import Adafactor
optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-4)


### Using the standard cross entropy loss function for multi-class classification

In [22]:
loss_fn = nn.CrossEntropyLoss()

### Below I have defined my training loop, where the model will be trained for 2-3 Epochs. After each iteration, ie a chunk of batches, I am printing the training loss, and at the end validation loss and validation accuracy

In [23]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=0):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()
        print('Epoch, batch, current training loss')

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 50 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches

                # Print training results
                print(epoch_i+1, step, batch_loss / batch_counts)

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        print('Average training loss:', avg_train_loss)
        print("-"*70)

        if evaluation == True:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            print('Validation loss and Validation f1 score')
            print(val_loss,val_accuracy)
            time_elapsed = time.time() - t0_epoch
            print("-"*70)
        print("\n")
    
    print("Training finished!")


def evaluate(model, val_dataloader):

    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_score = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        score = f1_score(preds.cpu().numpy(), b_labels.cpu().numpy(), zero_division=1, average = 'weighted')
        val_score.append(score)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_score)

    return val_loss, val_accuracy

In [24]:
import tensorflow
tensorflow.random.set_seed(0)
# set_seed(0)    # Set seed for reproducibility
from sklearn.metrics import f1_score

train(model, train_dataloader, val_dataloader, epochs=2, evaluation=True)


Start training...

Epoch, batch, current training loss
0 50 2.5259521054286584
0 100 2.4515465688705445
0 150 2.325907940864563
0 200 2.1679552006721496
0 250 2.023976278305054
0 300 2.0165663528442384
0 350 1.97131906747818
0 400 1.98094140291214
0 450 2.0168303275108337
0 500 1.9862948703765868
0 550 1.8944605898857116
0 600 1.9556732773780823
0 650 1.9760264682769775
0 700 1.986982822418213
0 750 1.9847903442382813
0 800 1.9481595277786254
0 845 1.982308300336202
Average training loss: 2.0713935967878245
----------------------------------------------------------------------
Validation loss and f1 score
1.956789717921671 0.6471205626372383
----------------------------------------------------------------------


Epoch, batch, current training loss
1 50 1.9748590483384973
1 100 2.0014833188056946
1 150 1.939512746334076
1 200 1.935323359966278
1 250 1.9797542691230774
1 300 1.9688503789901732
1 350 2.0034564423561094
1 400 1.9424221563339232
1 450 1.9159509611129761
1 500 1.96111560106

In [31]:
import torch.nn.functional as F



def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    test_score = []
    model.eval()
    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        
        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        score = f1_score(preds.cpu(), b_labels.cpu(),  average = 'weighted')
        #accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        test_score.append(score)


    return np.mean(test_score)

In [32]:
test_inputs, test_masks = preprocessing_for_bert(test_data.sentence)
test_labels = torch.tensor(test_data.word_labels)

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)



# F1 score obtained after fine tuning the model and using the test set for prediction is as under:

In [33]:
bert_predict(model, test_dataloader) #The function gives the F1 score on test data


0.7905086305888059