In [223]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import torch.optim as optim
import numpy as np

In [224]:
device = "mps" if torch.backends.mps.is_available() else 'cpu'

seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.mps.manual_seed(seed_val)

In [225]:
path = "/Users/shakibibnashameem/Documents/Practice/bert/bert-classification/data/in_domain_train.tsv"
data = pd.read_csv(path, sep="\t", header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [226]:
data = data[["sentence", 'label']]
df = data.sample(frac=1, random_state=42).reset_index(drop=True)

sentence = data.sentence.values
label = data.label.values

In [227]:
X_train, X_test, y_train, y_test = train_test_split(sentence, label, test_size=.8, random_state=42)

In [228]:
l = []

for sent in sentence:
    s = sent.split(" ")
    l.append(len(s))

print(max(l))

42


In [229]:
class MakeDataset(Dataset):

    def __init__(self, sentences, label, tokenizer, max_lenght):
        self.sentences = sentences
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_lenght

        self.data = self.prepare_data()

    def prepare_data(self):
        d = []

        for sent, label in zip(self.sentences, self.label):
             encodings = self.tokenizer.encode_plus(
                 sent,
                 add_special_tokens = True,
                 max_length = self.max_length,
                 padding = 'max_length',
                 return_attention_mask = True,
                 truncation = True,
                 return_tensors = 'pt'
             )

             d.append({
                 "id" : encodings['input_ids'].squeeze(0),
                 "mask" : encodings['attention_mask'].squeeze(0),
                 "label" : torch.tensor(int(label), dtype=torch.long)
             })

        return d
    
    def __getitem__(self, index):
        return self.data[index]
    
    def __len__(self):
        return len(self.sentences)

In [230]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [231]:
train_data = MakeDataset(X_train, y_train, tokenizer, 64)
test_data = MakeDataset(X_test, y_test, tokenizer, 64)

In [232]:
train_data.__getitem__(4)

{'id': tensor([ 101, 3389, 5444, 2005, 3021, 2138, 2010, 2269, 2409, 2032, 2000, 1012,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'label': tensor(1)}

In [233]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
test_loader = DataLoader(test_data, shuffle=False, batch_size=32)


In [234]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2
)
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr = 2e-05, eps = 1e-8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [237]:
def train(model, train_loader, device, epochs):


    for epoch in range(epochs):
            
            tr_loss = 0
            correct_pred = 0
            total_pred = 0
            
            model.train()
            
            for _id, batch in enumerate(train_loader):

                id = batch['id'].to(device, dtype = torch.long)
                mask = batch['mask'].to(device, dtype = torch.long)
                label = batch['label'].to(device)

                optimizer.zero_grad()
                output = model(
                    input_ids = id,
                    attention_mask = mask,
                    token_type_ids = None,
                    labels = label
                )

                loss = output.loss
                logits = output.logits
                loss.backward()
                optimizer.step()

                tr_loss += output.loss.item()

                preds = torch.argmax(logits, dim=1)
                correct_pred += (preds == label).sum().item()
                total_pred += label.size(0)

            
            tr_loss /= len(train_loader)

            epoch_accuracy = correct_pred / total_pred

            print(f"Epoch: {epoch+1} | Training Loss: {tr_loss:.4f} | Training Accuracy: {epoch_accuracy:.4f}")


In [238]:
train(model, train_loader, device,5)

Epoch: 1 | Training Loss: 0.0633 | Training Accuracy: 0.9825
Epoch: 2 | Training Loss: 0.0604 | Training Accuracy: 0.9854
Epoch: 3 | Training Loss: 0.0611 | Training Accuracy: 0.9836
Epoch: 4 | Training Loss: 0.0451 | Training Accuracy: 0.9848
Epoch: 5 | Training Loss: 0.0707 | Training Accuracy: 0.9807
