In [1]:
%%time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Define a custom dataset class to preprocess and tokenize the input text pairs


class PlagiarismDataset(Dataset):
    """Map-style dataset that tokenizes one pair of texts per item.

    Args:
        df: DataFrame providing the columns 'text1', 'text2' and 'label'.
        tokenizer: Callable invoked as
            ``tokenizer(text1, text2, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')``. It must return a mapping
            with 'input_ids', 'token_type_ids' and 'attention_mask' entries.
        max_length: Value forwarded to the tokenizer's ``max_length``
            argument. Defaults to 64, the value that used to be hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=64):
        # Keep plain arrays so items are looked up by position, not by the
        # DataFrame's index labels.
        self.text1 = df['text1'].values
        self.text2 = df['text2'].values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (one item per label)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Tokenize the pair at position ``index``.

        Returns:
            Tuple ``(input_ids, token_type_ids, attention_mask, label)``.
            The first three are the tokenizer outputs with their leading
            dimension squeezed away; ``label`` is a ``torch.long`` tensor.
        """
        encoded = self.tokenizer(
            self.text1[index],
            self.text2[index],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer is called on a single pair; drop the leading dimension
        # so that the DataLoader can stack items into a batch itself.
        input_ids = encoded['input_ids'].squeeze(0)
        token_type_ids = encoded['token_type_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        return input_ids, token_type_ids, attention_mask, label


# Read the tab-separated text pairs; header=None means the first row is data,
# so the column names are supplied here
df = pd.read_table(
    './train_snli.txt.zip',
    sep='\t',
    header=None,
    names=['text1', 'text2', 'label'],
)

# Reproducible 80/20 split: sample the training rows, keep the rest for validation
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(index=train_df.index)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits; each item is tokenized when it is accessed
train_dataset, val_dataset = (
    PlagiarismDataset(split, tokenizer) for split in (train_df, val_df)
)

# Batch the datasets; only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


  from .autonotebook import tqdm as notebook_tqdm


CPU times: user 3.27 s, sys: 1.3 s, total: 4.58 s
Wall time: 7.75 s


In [2]:
# Instantiate BERT from the 'bert-base-uncased' checkpoint with a
# sequence-classification head sized for 2 labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
# Cross-entropy over the class logits, optimised with Adam at lr=2e-5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

In [4]:
# Train the model
#
# Each epoch fine-tunes on `train_loader` and then evaluates on `val_loader`.
# All running statistics are reset at the start of every epoch so the printed
# numbers describe that epoch only.
NUM_EPOCHS = 5

# Run batches on whatever device the model currently lives on (no-op on CPU).
device = next(model.parameters()).device

for epoch in range(NUM_EPOCHS):
    train_loss = 0.0
    val_loss = 0.0
    correct = 0
    total = 0

    model.train()
    for batch in train_loader:
        input_ids, token_type_ids, attention_mask, labels = (
            tensor.to(device) for tensor in batch
        )
        inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}
        optimizer.zero_grad()
        outputs = model(**inputs)[0]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # The criterion averages over the batch; weight by batch size so that
        # dividing by the dataset size below yields a per-sample mean.
        train_loss += loss.item() * labels.size(0)

    # Evaluate the model on the validation set
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids, token_type_ids, attention_mask, labels = (
                tensor.to(device) for tensor in batch
            )
            inputs = {'input_ids': input_ids,
                      'token_type_ids': token_type_ids,
                      'attention_mask': attention_mask}
            outputs = model(**inputs)[0]
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)

            # Predicted class = index of the largest logit in each row.
            predictions = outputs.argmax(dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    # Calculate validation accuracy and print the losses for this epoch
    train_loss /= len(train_loader.dataset)
    val_loss /= len(val_loader.dataset)
    # Guard against an empty validation loader instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    print('Epoch: {} | Train Loss: {:.4f} | Val Loss: {:.4f} | Accuracy: {:.2f}%'.format(
        epoch+1, train_loss, val_loss, accuracy))


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

KeyboardInterrupt: 