In [1]:
import pandas as pd
data = []
with open('msr_paraphrase_train.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
df = pd.DataFrame(data, columns=columns)
df['Quality'] = df['Quality'].astype(int)

data = []
with open('msr_paraphrase_test.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
df_test = pd.DataFrame(data, columns=columns)
df_test['Quality'] = df_test['Quality'].astype(int)




## BERT

- BERT is a pre-trained transformer model designed to understand the context of words in a sentence. It captures intricate patterns and relationships within the text through its deep architecture.
    - Instead of manual feature engineering, BERT learns the features directly from the text during the training process. This allows BERT to capture complex linguistic nuances that are hard to engineer manually.
- BERT embeddings are high-dimensional vectors that represent the meaning of sentences. When fine-tuning BERT, we leverage these embeddings to directly classify paraphrase pairs.

## Preprocessing

In [2]:
import re
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Function to clean text data
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['#1 String Cleaned'] = df['#1 String'].apply(clean_text)
df['#2 String Cleaned'] = df['#2 String'].apply(clean_text)

# Applying stop words removal and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words])
    return text

df['#1 String Processed'] = df['#1 String Cleaned'].apply(preprocess_text)
df['#2 String Processed'] = df['#2 String Cleaned'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## ParaphraseDataset

In [3]:
# Define a custom dataset for loading the data
class ParaphraseDataset(Dataset):
    ''' 
    This tells Python that ParaphraseDataset is a type of Dataset. 
    Dataset is a class from PyTorch's torch.utils.data module that 
    helps to manage and preprocess data. 
    '''
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    '''
    This is the initializer method. It runs when we create an instance of the class.
    df: The DataFrame containing the data.
    tokenizer: The tokenizer to convert text into tokens.
    max_len: The maximum length for the tokenized sequences.
    self.df = df: Saves the DataFrame as an attribute of the class.
    self.tokenizer = tokenizer: Saves the tokenizer as an attribute of the class.
    self.max_len = max_len: Saves the maximum length as an attribute of the class.

    '''
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text1 = row['#1 String Processed']
        text2 = row['#2 String Processed']
        label = int(row['Quality'])

        encoding = self.tokenizer.encode_plus(
            text1,
            text2,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )
        '''
        encode_plus: Encodes the text into token IDs, 
        adds special tokens (e.g., [CLS], [SEP]), pads or truncates the sequence to max_len, 
        and returns the result as PyTorch tensors ('pt').
        
        Why: Converts text into a format that BERT can process, 
        ensuring all sequences have the same length and special tokens are added.
        '''

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
    '''
    Dictionary: Returns input IDs, attention masks, and labels.

    Why: Organizes the tokenized data and labels in a way that the DataLoader can easily use.
    '''
    


**BERT input components**
1. input_ids:
    - The token IDs representing the input text. These IDs correspond to words or sub-words in the text, which have been converted into numerical form using the BERT tokenizer.
    - Why it's needed: BERT processes text by converting it into token IDs. This numerical representation allows the model to perform computations on the text data.
        - Essential for feeding textual data into the BERT model, as the model operates on these numerical IDs.
2. attention_mask:
    - A binary mask that indicates which tokens should be attended to and which should be ignored (typically padding tokens). It is an array of 1s and 0s where 1 indicates a token to be attended to and 0 indicates a padding token.
    - Why it's needed: BERT uses the attention mechanism to focus on relevant tokens in the input sequence. The attention mask helps the model distinguish between actual data tokens and padding tokens, ensuring that the padding tokens do not influence the output.
        - Ensure that only meaningful tokens are attended to during the model's computations.

3. labels:
    - The actual class labels for the input text pairs (e.g., whether two sentences are paraphrases or not). In this context, it's a binary label (0 or 1).
    - Why it's needed: During training, the labels are used to compute the loss by comparing the model's predictions against the true labels. This loss is then used to update the model parameters to improve its predictions.
        - Crucial for supervised learning, as the loss calculated using these labels guides the optimization process, helping the model learn.

## BERT tokenizer and DataLoader

In [4]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


'''Tokenize the text data and create DataLoader objects for batching and shuffling.'''


# Dataset Creation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ParaphraseDataset(train_df, tokenizer, max_len=128)
val_dataset = ParaphraseDataset(val_df, tokenizer, max_len=128)
'''
Ensures that data is tokenized and prepared correctly for both training and validation

When you see ParaphraseDataset(train_df, tokenizer, max_len=128), 
you are creating an instance of ParaphraseDataset with these specific parameters. 
- The class Dataset that ParaphraseDataset inherits from is not a parameter but a base 
    class providing some functionality.
    
'''


# Create DataLoader for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
'''
DataLoader is a PyTorch class that helps to load data in batches. 
- Batching: Grouping the data into smaller chunks (batches) that are fed to the model sequentially.
- Shuffling: Randomizing the order of the data to prevent the model from learning the order.
- Parallel Loading: Loading data using multiple CPU cores to speed up the process.
'''


'\nDataLoader is a PyTorch class that helps to load data in batches. \n- Batching: Grouping the data into smaller chunks (batches) that are fed to the model sequentially.\n- Shuffling: Randomizing the order of the data to prevent the model from learning the order.\n- Parallel Loading: Loading data using multiple CPU cores to speed up the process.\n'

## Paraphrase Model

In [5]:
# Define the BERT model
class ParaphraseModel(nn.Module):
    '''
    - Defines a new class ParaphraseModel that inherits from nn.Module.
    - nn.Module is a base class for all neural network modules in PyTorch. 
    It provides a way to define, manage, and operate on neural network layers and parameters.
    '''
    def __init__(self):
        super(ParaphraseModel, self).__init__()
        '''
        Calls the constructor of the parent class (nn.Module), 
        which is necessary to properly initialize the module.
        '''
        
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
        '''
        BertForSequenceClassification: A class from the Hugging Face Transformers library 
        specifically designed for sequence classification tasks.
        - num_labels=2: Sets the number of output labels to 2, which is suitable for binary classification (paraphrase or not).
        
        When you use the BertForSequenceClassification class with the from_pretrained method, 
        you are essentially taking the pre-trained BERT model (bert-base-uncased) 
        and adding a classification head to it.
        - The pre-trained BERT model (bert-base-uncased) 
            is highly effective at understanding language 
            but does not directly provide outputs suitable for specific tasks like classification.
        '''
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits
        '''
           input_ids: Token IDs of the input text.
            attention_mask: is a tensor that indicates which tokens in the input should 
            be attended to and which should be ignored (typically padding tokens). 
            It is an integral part of the input when working with models like BERT that
            use attention mechanisms to process sequences of text.

            - Padded tokens are special tokens added to text sequences to ensure that 
                all sequences in a batch have the same length. 
            - This is necessary because many machine learning models, including BERT, 
                require inputs of uniform size for efficient processing, 
                especially when using mini-batch training.
                - The attention mask has the same length as the sequence and 
                    indicates which tokens are padding (typically with a value of 0) 
                    and which are actual data tokens (typically with a value of 1).
            labels: True labels for the data.
            Outputs: When you pass these to the BERT model, it returns:
            loss: The classification loss if labels are provided.
            logits: The raw, unnormalized scores for each class.        
        '''
        

# Initialize model, loss function, and optimizer
model = ParaphraseModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
'''
Criterion:
- criterion = nn.CrossEntropyLoss(): Defines the loss function. 
- Cross-entropy loss is used for classification tasks. 
- It measures the difference between the predicted probabilities and the true labels.

Optimizer:
- optimizer = optim.AdamW(model.parameters(), lr=2e-5): Defines the optimizer. 
- AdamW is a variant of the Adam optimizer that includes weight decay to reduce overfitting. 
- It updates the model's parameters to minimize the loss.
'''



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"\nCriterion:\n- criterion = nn.CrossEntropyLoss(): Defines the loss function. \n- Cross-entropy loss is used for classification tasks. \n- It measures the difference between the predicted probabilities and the true labels.\n\nOptimizer:\n- optimizer = optim.AdamW(model.parameters(), lr=2e-5): Defines the optimizer. \n- AdamW is a variant of the Adam optimizer that includes weight decay to reduce overfitting. \n- It updates the model's parameters to minimize the loss.\n"

## Training

In [6]:

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    losses = []
    correct_predictions = 0

    for data in tqdm(data_loader):
        '''
        for data in tqdm(data_loader): Iterates over batches of data provided by data_loader. 
        tqdm is a library that shows a progress bar during the loop.
        '''
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)
        # Moves these to the device used to run the model

        
        # 1. Zeroing Gradients
        optimizer.zero_grad()
        '''
        - Gradients indicate how much a small change in each parameter will affect the loss.
        - During backpropagation, gradients are computed for each parameter to update 
        them in the direction that minimizes the loss.

        - By default, PyTorch accumulates gradients. 
        This means that when you call the backward() method to compute gradients, 
        the gradients for each parameter are added to the existing gradients 
        from previous iterations.
            - Why Accumulation?: Accumulating gradients can be useful in some cases, 
                such as when implementing certain types of optimization algorithms 
                or performing gradient accumulation over multiple mini-batches 
                to simulate larger batch sizes.

        - In most standard training loops, you do not want gradients to accumulate across 
        iterations. 
            - Instead, you want the gradients to reflect only the current mini-batch of data.
            - If you do not zero the gradients, they will accumulate, 
                leading to incorrect gradient values and potentially 
                causing the model to converge poorly or diverge.
                
        - This allows the optimizer to update the parameters correctly based on
        the current batch's contribution to the loss.
        '''
        
        # 2. Forward Pass
        loss, logits = model(input_ids, attention_mask, labels)
        '''
        - Logits: The model produces raw, unnormalized scores for each class, known as logits.
        - Loss: If labels are provided, the model also calculates the loss, 
        which is a measure of how well the model's predictions match the true labels. 
            - This loss will later be used for backpropagation.
        '''
                
        
        # Predictions
        _, preds = torch.max(logits, dim=1)
        '''
        This step converts the logits into actual class predictions.
        
        The torch.max function returns two tensors:
        1. The first tensor (which is assigned to _) contains the maximum scores 
        for each sample. This is often not needed when you only care about the 
        indices of the maximum values, hence it's assigned to _ (a common Python 
        convention for ignoring values).
        2. The second tensor (assigned to preds) contains the indices of the
        maximum scores, which correspond to the predicted classes.
        '''

        # 3. Backward Pass
        loss.backward()
        '''
        Computes the gradients of the loss with respect to each model parameter 
        (weight and bias) using backpropagation.
        '''
        
        # 4. Optimizer Step
        optimizer.step()
        '''
        Updates the model parameters using the computed gradients.
        
        The optimizer adjusts the model parameters (weights and biases) 
        based on the gradients computed during the backward pass. 
            - This adjustment is typically done using optimization algorithms 
            like Stochastic Gradient Descent (SGD) or Adam.
        '''
        
        
        

        losses.append(loss.item())
        '''
        Stores the loss for the current batch.
        The item() method extracts the scalar value from this tensor. 
            - In PyTorch, item() is used to get a standard Python number 
            from a tensor with a single value. 
            - This is useful for logging or aggregating purposes.
        '''
        
        
        
        correct_predictions += torch.sum(preds == labels)
        '''
        Where do labels come from? 
            - In each iteration of the training loop, 
            the DataLoader provides a batch of data, which includes both inputs 
            (such as input_ids and attention_mask) and labels.
        
        Counts correct predictions for accuracy calculation.
        '''

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

**Order of Pytorch training**

1. Gradient Accumulation: Gradients accumulate by default in PyTorch, so optimizer.zero_grad() is called before the forward pass to ensure gradients are zeroed for the current batch.
2. Forward Pass First: The forward pass must occur before backpropagation because gradients are calculated based on the loss computed in the forward pass.
    - Propagates input data through the network to produce an output. It is used to compute the predictions and the loss.
3. Backward Pass After Forward: Backpropagation computes gradients after the forward pass, based on the loss
    - Computes gradients of the loss with respect to the model parameters using backpropagation. These gradients are then used by the optimizer to update the parameters.
4. Optimizer Step Last: The optimizer updates the model parameters after gradients are computed and before moving on to the next batch.

In [7]:
def eval_model(model, data_loader, criterion, device):
    model.eval
    # Puts the model in evaluation mode. 
    
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        '''
        This context manager disables gradient computation. During evaluation, 
        we don't need to compute gradients, which saves memory and computational resources.
        '''
        for data in data_loader:
            '''
            Loops over each batch of data provided by the DataLoader. 
            The DataLoader handles batching, shuffling, and parallel data loading.
            '''
            
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            
            loss, logits = model(input_ids, attention_mask, labels)
            _, preds = torch.max(logits, dim=1)

            losses.append(loss.item())
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

### Training vs Evaluation
Training Phase:
- Objective: Adjust the model parameters to minimize the loss and improve performance.
- Operations:
- Forward Pass: Compute the output (predictions) and loss.
- Backward Pass: Calculate gradients of the loss with respect to the model parameters using backpropagation.
- Optimization: Update the model parameters to reduce the loss using an optimizer (e.g., Adam, SGD).

Evaluation Phase:
- Objective: Assess the model's performance on unseen data without updating the model parameters.
Operations:
- Forward Pass: Compute the output (predictions) and loss.
- No Backward Pass: No gradients are computed.
- No Optimization: Model parameters are not updated.

In [8]:
# !pip install --upgrade torch torchvision torchaudio

# Train the model
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    # Use train_loader to train
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')

    # Use val_loader to test
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)
    print(f'Validation loss: {val_loss}, Validation accuracy: {val_acc}')

Epoch 1/3


100%|██████████| 204/204 [01:54<00:00,  1.79it/s]


Train loss: 0.5725470986582485, Train accuracy: 0.6981595092024541
Validation loss: 0.4764804974490521, Validation accuracy: 0.7683823529411764
Epoch 2/3


100%|██████████| 204/204 [01:53<00:00,  1.80it/s]


Train loss: 0.4103083444722727, Train accuracy: 0.8119631901840492
Validation loss: 0.4473759853372387, Validation accuracy: 0.7769607843137255
Epoch 3/3


100%|██████████| 204/204 [01:53<00:00,  1.80it/s]


Train loss: 0.2333682452638944, Train accuracy: 0.9085889570552148
Validation loss: 0.5452328455798766, Validation accuracy: 0.7598039215686274


Loss is a metric that quantifies how well a model’s predictions match the actual labels. It measures the difference between the predicted outputs and the true outputs.

Lower loss values generally indicate better model performance as it means the predictions are closer to the actual labels.

# Code in one cell

In [2]:
import pandas as pd
import re
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
torch.cuda.empty_cache()
# Loading data
data = []
with open('msr_paraphrase_train.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
df = pd.DataFrame(data, columns=columns)
df['Quality'] = df['Quality'].astype(int)

data = []
with open('msr_paraphrase_test.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
df_test = pd.DataFrame(data, columns=columns)
df_test['Quality'] = df_test['Quality'].astype(int)

# Function to clean text data
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['#1 String Cleaned'] = df['#1 String'].apply(clean_text)
df['#2 String Cleaned'] = df['#2 String'].apply(clean_text)

# Applying stop words removal and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words])
    return text

df['#1 String Processed'] = df['#1 String Cleaned'].apply(preprocess_text)
df['#2 String Processed'] = df['#2 String Cleaned'].apply(preprocess_text)

# Define a custom dataset for loading the data
class ParaphraseDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text1 = row['#1 String Processed']
        text2 = row['#2 String Processed']
        label = int(row['Quality'])

        encoding = self.tokenizer.encode_plus(
            text1,
            text2,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create DataLoader for training and validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ParaphraseDataset(train_df, tokenizer, max_len=128)
val_dataset = ParaphraseDataset(val_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Define the BERT model
class ParaphraseModel(nn.Module):
    def __init__(self):
        super(ParaphraseModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits

# Initialize model, loss function, and optimizer
model = ParaphraseModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    losses = []
    correct_predictions = 0

    for data in tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()
        loss, logits = model(input_ids, attention_mask, labels)
        
        #predictions
        _, preds = torch.max(logits, dim=1)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            loss, logits = model(input_ids, attention_mask, labels)
            _, preds = torch.max(logits, dim=1)

            losses.append(loss.item())
            correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Train the model
epochs = 3

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')

    val_acc, val_loss = eval_model(model, val_loader, criterion, device)
    print(f'Validation loss: {val_loss}, Validation accuracy: {val_acc}')

torch.save(model.state_dict(), "best_model_BERT.pth")


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


100%|██████████| 204/204 [01:53<00:00,  1.79it/s]


Train loss: 0.5770857813311558, Train accuracy: 0.6806748466257669
Validation loss: 0.4743262202131982, Validation accuracy: 0.7708333333333334
Epoch 2/3


100%|██████████| 204/204 [01:53<00:00,  1.80it/s]


Train loss: 0.44783832059771406, Train accuracy: 0.7889570552147239
Validation loss: 0.4705883343430126, Validation accuracy: 0.7512254901960784
Epoch 3/3


100%|██████████| 204/204 [01:53<00:00,  1.80it/s]


Train loss: 0.2910658292947154, Train accuracy: 0.8812883435582822
Validation loss: 0.5576268259216758, Validation accuracy: 0.7745098039215687
