#### LOAD THE DATA

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be opened from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json

def load_jsonl(file_name):
    """Read a JSON Lines file into a DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Args:
        file_name: Path to the ``.jsonl`` file.

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. an extra empty line at the end of the file)
        # rather than letting json.loads fail on them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Load each dataset
# The Drive folder is kept in one constant so relocating the data means editing a single line.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks'

train_data = load_jsonl(f'{DATA_DIR}/snli_1.0_train.jsonl')
dev_data = load_jsonl(f'{DATA_DIR}/snli_1.0_dev.jsonl')
test_data = load_jsonl(f'{DATA_DIR}/snli_1.0_test.jsonl')


In [None]:
# Peek at the first two training rows to see which columns the JSONL records provide.
train_data.head(2)

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...


In [None]:
# Rows whose gold_label is '-' are removed from every split, since '-' has no entry in the label mapping.
# NOTE(review): presumably '-' marks pairs without annotator consensus in SNLI — confirm against the dataset README.
train_df = train_data.drop(train_data[train_data['gold_label']=='-'].index)
# Stratified 15% subsample: sampling inside each gold_label group keeps the class proportions,
# and the fixed random_state makes the subsample reproducible.
train_df = train_df.groupby('gold_label', group_keys=False).apply(lambda group: group.sample(frac=0.15, random_state=42))
test_df = test_data.drop(test_data[test_data['gold_label']=='-'].index)
eval_df = dev_data.drop(dev_data[dev_data['gold_label']=='-'].index)

# DataFrame.size is rows * columns; len() reports the number of examples, which is what is meant here.
print("Train Size: ", len(train_df))
print("Test Size: ", len(test_df))
print("Eval Size: ", len(eval_df))

Train Size:  824050
Test Size:  98240
Eval Size:  98420


#### DATA CLEANING

In [None]:
def clean_sentence_pairs(frame):
    """Return a cleaned copy of one SNLI split.

    Steps, in order: drop rows containing any null value, strip
    leading/trailing whitespace from both sentences, drop rows where either
    sentence is empty, collapse runs of whitespace to a single space, and
    lowercase the text.

    Args:
        frame: DataFrame with string columns 'sentence1' and 'sentence2'.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.
    """
    cleaned = frame.dropna().copy()
    for column in ('sentence1', 'sentence2'):
        cleaned[column] = cleaned[column].str.strip()

    non_empty = (cleaned['sentence1'] != '') & (cleaned['sentence2'] != '')
    # .copy() so the assignments below write to an independent frame rather
    # than to a slice (avoids SettingWithCopyWarning).
    cleaned = cleaned[non_empty].copy()

    for column in ('sentence1', 'sentence2'):
        cleaned[column] = (cleaned[column]
                           .str.replace(r'\s+', ' ', regex=True)
                           .str.lower())
    return cleaned


# Every split goes through the same cleaning: if only the training split were
# stripped and lowercased, the model would be evaluated on text normalised
# differently from what it was trained on.
train_df = clean_sentence_pairs(train_df)
eval_df = clean_sentence_pairs(eval_df)
test_df = clean_sentence_pairs(test_df)

In [None]:
# Display general information about the dataset:
# row count, column names, non-null counts and dtypes of the cleaned training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82405 entries, 331445 to 409353
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   annotator_labels        82405 non-null  object
 1   captionID               82405 non-null  object
 2   gold_label              82405 non-null  object
 3   pairID                  82405 non-null  object
 4   sentence1               82405 non-null  object
 5   sentence1_binary_parse  82405 non-null  object
 6   sentence1_parse         82405 non-null  object
 7   sentence2               82405 non-null  object
 8   sentence2_binary_parse  82405 non-null  object
 9   sentence2_parse         82405 non-null  object
dtypes: object(10)
memory usage: 6.9+ MB


#### PREPROCESS THE DATA

Tokenize the data using RoBERTa's tokenizer.

In [None]:
# Install Transformers only if it is not there:
# `pip show` exits with a non-zero status when the package is missing, so `||` runs the install only in that case.
# NOTE(review): the install is unpinned — consider pinning the version so re-runs are reproducible.
!pip show transformers || pip install transformers

Name: transformers
Version: 4.35.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [None]:
# Import the necessary libraries
from transformers import RobertaTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

Let us find a suitable max_length for the tokenized sequences. The code below checks the token-length distribution of the premise and hypothesis sentences.

In [None]:
from transformers import RobertaTokenizer
import pandas as pd

tokenizer13 = RobertaTokenizer.from_pretrained('roberta-base')

# Work on a copy: the length columns are exploratory and should not be
# silently added to train_df, which later cells reuse for training.
df = train_df.copy()


def length_of_tokenized(sentence):
    """Return the number of RoBERTa token ids for `sentence`, special tokens included."""
    return len(tokenizer13.encode(sentence, add_special_tokens=True))


# Token counts of each sentence encoded on its own (not as a premise/hypothesis pair).
df['premise_length'] = df['sentence1'].apply(length_of_tokenized)
df['hypothesis_length'] = df['sentence2'].apply(length_of_tokenized)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the token lengths
premise_stats, hypothesis_stats = (
    df[length_column].describe()
    for length_column in ('premise_length', 'hypothesis_length')
)

# Show both summaries, premise first
print("Premise Length Statistics:\n", premise_stats)
print("\nHypothesis Length Statistics:\n", hypothesis_stats)


Premise Length Statistics:
 count    82405.000000
mean        16.745489
std          6.302005
min          4.000000
25%         12.000000
50%         15.000000
75%         20.000000
max         89.000000
Name: premise_length, dtype: float64

Hypothesis Length Statistics:
 count    82405.000000
mean        10.587901
std          3.398658
min          3.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         55.000000
Name: hypothesis_length, dtype: float64


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 200


def tokenize_roberta(dataframe):
    """Encode every (sentence1, sentence2) row of `dataframe` as one sentence pair.

    Each pair is truncated or padded to the module-level ``max_length`` and
    the encodings are returned as PyTorch tensors.
    """
    premises = dataframe['sentence1'].tolist()
    hypotheses = dataframe['sentence2'].tolist()
    return tokenizer(
        premises,
        hypotheses,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Encode the three splits in the order train, eval, test.
train_encodings, eval_encodings, test_encodings = (
    tokenize_roberta(split) for split in (train_df, eval_df, test_df)
)

#### PREPARE DATASET AND DATALOADERS

In PyTorch, a Dataset is an abstract class representing a dataset. We should thus create a custom dataset which inherits Dataset and override the following methods:
 - __init__ to initialize the dataset object,
 - __len__ to return the size of the dataset, and
 - __getitem__ to support the indexing such that dataset[i] can be used to get the ith sample.

This also makes the dataset compatible with PyTorch's DataLoader class, which can load the data in batches, shuffle it, and manage parallel data loading.

In [None]:
class SNLIDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with integer labels.

    `encodings` is a mapping from field name to an indexable holding one entry
    per example; `labels` is an indexable of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick row `idx` out of every encoding field, then attach the label as a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Integer id assigned to each gold label
label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}


def _label_ids(frame):
    """Translate the gold_label column of `frame` into a list of ids via label_dict."""
    return frame['gold_label'].map(label_dict).tolist()


# One dataset per split: encodings paired with their numeric labels
train_dataset = SNLIDataset(train_encodings, _label_ids(train_df))
dev_dataset = SNLIDataset(eval_encodings, _label_ids(eval_df))
test_dataset = SNLIDataset(test_encodings, _label_ids(test_df))

# Batches of 16; only the training data is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


#### RoBERTa MODEL

Next we initialize the RoBERTa model for sequence classification. The model is configured to output three labels (entailment, contradiction, neutral).

In [None]:
from transformers import RobertaForSequenceClassification

# Load the pretrained 'roberta-base' checkpoint with a sequence-classification head sized for 3 labels.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### ABOUT ROBERTA

**Pre-trained Language Model**<br/>
RoBERTa, like BERT, is a pre-trained language model which means it has been trained on a large corpus of text in an unsupervised manner. This pre-training involves predicting masked words in a sentence, allowing the model to understand the context and relationships between words.

**Transformer Architecture**<br/>
RoBERTa utilizes the Transformer architecture, which is made up of layers of self-attention mechanisms. These mechanisms allow the model to weigh the importance of different words in a sentence relative to each other.

**Input Representation**<br/>
For input, RoBERTa expects tokenized text sequences. In the case of the SNLI dataset, these are the tokenized premise and hypothesis. The sequence is prepended with a special classification token (\[CLS\] in BERT's case, \<s\> in RoBERTa's), which marks the start of the sequence, and the two sentences are separated by a separator token (\[SEP\] in BERT, \</s\> in RoBERTa).

**Token Embeddings**<br/>
The tokenized input is converted into embeddings. These are high-dimensional vectors that represent each token in a continuous space where semantically similar tokens are closer together.

**Positional Encodings**<br/>
Since the Transformer architecture does not inherently process sequence data in order, it requires positional encodings to give the model information about the position of the tokens in the sequence.

**Self-Attention Layers**<br/>
Once the model has the token embeddings and positional encodings, it processes this information through multiple layers of self-attention and feed-forward neural networks. The self-attention mechanism allows the model to focus on different parts of the input sequence when predicting a particular word, effectively understanding the context and relationships between words.

**Fine-tuning for Specific Tasks**<br/>
The pre-trained RoBERTa model can then be fine-tuned for specific tasks such as sequence classification, which is our case with the SNLI dataset. Fine-tuning involves adding a classification layer on top of the pre-trained model and training it on a labeled dataset with a specific task in mind. For the SNLI dataset, the task is to predict the relationship between the premise and hypothesis (entailment, contradiction, or neutral).

**Training Process**<br/>
During training, the model takes the input sequences and outputs a vector of logits representing the likelihood of each class. A softmax layer then converts these logits into probabilities.

**Backpropagation and Optimization**<br/>
The model is trained using backpropagation and an optimization algorithm (usually Adam or a variant). The loss function, often cross-entropy for classification tasks, measures the difference between the predicted probabilities and the true labels. The optimizer minimizes this loss across the training examples.

**Model Adjustments**<br/>
During fine-tuning, only a small amount of weight adjustment is needed compared to the pre-training phase. The model leverages the knowledge it gained during pre-training to adapt to the specifics of the new task with relatively little additional training.

In summary, RoBERTa processes input text by converting it into token embeddings, using self-attention to model the relationships between tokens, and then applying a classification layer to predict the correct label for the task. Fine-tuning tailors the model to the specific nuances of the dataset and task at hand.




#### TRAINING

The next step is to define the training loop.

In [None]:
def evaluate(model, val_loader):
    """Compute the average loss and the accuracy of `model` over `val_loader`.

    The model is switched to evaluation mode for the duration of the call and
    its previous train/eval mode is restored afterwards, so calling this from
    inside a training loop does not leave dropout disabled for later epochs.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss`` and
            ``.logits``.
        val_loader: DataLoader yielding dict batches with the keys
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        Tuple ``(mean per-batch loss, fraction of correctly predicted examples)``.
    """
    # Use the device the model already lives on instead of depending on a
    # global `device` variable that is only defined in a later cell.
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    correct_predictions = 0

    try:
        with torch.no_grad():  # No need to track gradients for validation
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                predictions = outputs.logits.argmax(dim=-1)
                correct_predictions += (predictions == labels).sum().item()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    # Loss is averaged over batches, accuracy over individual examples.
    return val_loss / len(val_loader), correct_predictions / len(val_loader.dataset)

In [None]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup

num_epochs = 5
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_loader) * num_epochs

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# NOTE: loss_fn is not used by the loop below — the loss is read from
# outputs.loss, which the model returns when `labels` are passed to it.
# The name is kept only so that any other cell referring to it keeps working.
loss_fn = CrossEntropyLoss()

# Move the model to the GPU if available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Create the learning rate scheduler (linear decay over all steps, no warmup).
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

for epoch in range(num_epochs):  # num_epochs is the total number of epochs
    # Re-enter training mode at the start of EVERY epoch: the validation pass
    # at the end of an epoch puts the model into eval mode, and without this
    # call all epochs after the first would train with dropout disabled.
    model.train()

    # Initialize variables to track progress
    train_loss = 0
    train_correct_predictions = 0

    for batch in train_loader:  # train_loader is your DataLoader
        # Step 1: Move the batch data to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Step 2: Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Step 3: Compute loss
        loss = outputs.loss
        train_loss += loss.item()

        # Step 4: Backward pass
        loss.backward()

        # Step 5: Update parameters
        optimizer.step()

        # Step 6: Step the scheduler (once per batch, matching total_steps)
        scheduler.step()

        # Calculate the accuracy
        predictions = outputs.logits.argmax(dim=-1)
        train_correct_predictions += (predictions == labels).sum().item()

    # Calculate the average loss and accuracy over the training data
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = train_correct_predictions / len(train_loader.dataset)

    # Evaluate on the validation set after every epoch
    avg_val_loss, val_accuracy = evaluate(model, dev_loader)

    # Print training/validation statistics
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Training Loss: {avg_train_loss:.3f}, Training Accuracy: {train_accuracy:.3f}')
    print(f'Validation Loss: {avg_val_loss:.3f}, Validation Accuracy: {val_accuracy:.3f}')

Epoch 1/5
Training Loss: 0.552, Training Accuracy: 0.781
Validation Loss: 0.367, Validation Accuracy: 0.863
Epoch 2/5
Training Loss: 0.347, Training Accuracy: 0.876
Validation Loss: 0.355, Validation Accuracy: 0.862
Epoch 3/5
Training Loss: 0.204, Training Accuracy: 0.933
Validation Loss: 0.360, Validation Accuracy: 0.879
Epoch 4/5
Training Loss: 0.099, Training Accuracy: 0.972
Validation Loss: 0.448, Validation Accuracy: 0.882


#### TESTING THE MODEL

Let's hope for the best :)
Changing in the hopes of it not prompting me about code run

In [None]:

# Put the model in evaluation mode (dropout off) for the test pass and anything that follows it.
model.eval()

# Reuse evaluate() instead of a hand-copied loop, so validation and test
# metrics are guaranteed to be computed the same way: average per-batch loss
# and accuracy over all examples, with gradients disabled.
avg_test_loss, test_accuracy = evaluate(model, test_loader)

# Print test statistics
print(f'Test Loss: {avg_test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}')


NameError: ignored